Bayesian Hackers chapter 1 using pyro


#1

I am currently trying to write “Bayesian Hackers” using pyro.

import torch
from torch.autograd import Variable
import pyro
import pyro.distributions as dist
from pyro.optim import Adam
from pyro.infer import SVI
%matplotlib inline
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt

pyro.clear_param_store()

figsize(12.5, 3.5)
count_data = np.loadtxt("data/txtdata.csv")
n_count_data = len(count_data)
plt.bar(np.arange(n_count_data), count_data, color="#348ABD")
plt.xlabel("Time (days)")
plt.ylabel("count of text-msgs received")
plt.title("Did the user's texting habits change over time?")
plt.xlim(0, n_count_data);

alpha_f = 1.0/count_data.mean()

def lambda_(tau, lambda_1, lambda_2):
    out = Variable(torch.zeros(n_count_data))
    tau = int(tau.data.numpy())
    for i in range(0, len(out[:tau])):
        out[i] = lambda_1
    for j in range(len(out[:tau]),n_count_data):
        out[j] = lambda_2
    return out

def model(count_data):
    alpha = Variable(torch.Tensor([alpha_f]))
    lambda_1 = pyro.sample("lambda_1", dist.exponential, alpha)
    lambda_2 = pyro.sample("lambda_2" ,dist.exponential, alpha)
    tau = pyro.sample("tau", dist.uniform, Variable(torch.IntTensor([0])), Variable(torch.IntTensor([n_count_data])))
    f = pyro.sample("latent", dist.poisson, lambda_(tau, lambda_1, lambda_2))
    for i in range(n_count_data):
        pyro.observe("obs_{}".format(i), dist.poisson,
                    count_data[i], f)

def guide(count_data):
    log_lambda1_q_0 = Variable(torch.Tensor([np.log(15.0)]),requires_grad=True)
    log_lambda2_q_0 = Variable(torch.Tensor([np.log(15.0)]),requires_grad=True)
    log_tau_q_0 = Variable(torch.IntTensor([np.log(36.0)]), requires_grad=True)
    
    log_lambda1_q = pyro.param("log_lambda1_q", log_lambda1_q_0)
    log_lambda2_q = pyro.param("log_lambda2_q", log_lambda2_q_0)
    log_tau_q = pyro.param("log_tau_q", log_tau_q_0)
    
    lambda1_q = torch.exp(log_lambda1_q)
    lambda2_q = torch.exp(log_lambda2_q)
    tau_q = torch.exp(log_tau_q)
    
    pyro.sample("latent", dist.poisson, lambda_(tau_q, lambda1_q, lambda2_q))
    
adam_params = {"lr":0.0005, "betas":(0.90, 0.999)}
optimizer = Adam(adam_params)

n_steps=4000
svi = SVI(model, guide, optimizer, loss="ELBO")

for step in range(n_steps):
    svi.step(count_data)
    if step % 100 == 0:
        print('.', end='')

But this got errors.

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-102-c0cac78536c6> in <module>()
     65 
     66 for step in range(n_steps):
---> 67     svi.step(count_data)
     68     if step % 100 == 0:
     69         print('.', end='')

~/.pyenv/versions/3.5.2/lib/python3.5/site-packages/pyro/infer/svi.py in step(self, *args, **kwargs)
     96         """
     97         # get loss and compute gradients
---> 98         loss = self.loss_and_grads(self.model, self.guide, *args, **kwargs)
     99 
    100         # get active params

~/.pyenv/versions/3.5.2/lib/python3.5/site-packages/pyro/infer/elbo.py in loss_and_grads(self, model, guide, *args, **kwargs)
     63         :rtype: float
     64         """
---> 65         return self.which_elbo.loss_and_grads(model, guide, *args, **kwargs)

~/.pyenv/versions/3.5.2/lib/python3.5/site-packages/pyro/infer/trace_elbo.py in loss_and_grads(self, model, guide, *args, **kwargs)
    133         trainable_params = set()
    134         # grab a trace from the generator
--> 135         for weight, model_trace, guide_trace, log_r in self._get_traces(model, guide, *args, **kwargs):
    136             elbo_particle = weight * 0
    137             surrogate_elbo_particle = weight * 0

~/.pyenv/versions/3.5.2/lib/python3.5/site-packages/pyro/infer/trace_elbo.py in _get_traces(self, model, guide, *args, **kwargs)
     78                 continue
     79 
---> 80             guide_trace = poutine.trace(guide).get_trace(*args, **kwargs)
     81             model_trace = poutine.trace(poutine.replay(model, guide_trace)).get_trace(*args, **kwargs)
     82 

~/.pyenv/versions/3.5.2/lib/python3.5/site-packages/pyro/poutine/trace_poutine.py in get_trace(self, *args, **kwargs)
    161         Calls this poutine and returns its trace instead of the function's return value.
    162         """
--> 163         self(*args, **kwargs)
    164         return self.trace.copy()
    165 

~/.pyenv/versions/3.5.2/lib/python3.5/site-packages/pyro/poutine/trace_poutine.py in __call__(self, *args, **kwargs)
    149                             name="_INPUT", type="args",
    150                             args=args, kwargs=kwargs)
--> 151         ret = super(TracePoutine, self).__call__(*args, **kwargs)
    152         self.trace.add_node("_RETURN", name="_RETURN", type="return", value=ret)
    153         return ret

~/.pyenv/versions/3.5.2/lib/python3.5/site-packages/pyro/poutine/poutine.py in __call__(self, *args, **kwargs)
     40         """
     41         with self:
---> 42             return self.fn(*args, **kwargs)
     43 
     44     def __enter__(self):

<ipython-input-102-c0cac78536c6> in guide(count_data)
     46     log_lambda1_q_0 = Variable(torch.Tensor([np.log(15.0)]),requires_grad=True)
     47     log_lambda2_q_0 = Variable(torch.Tensor([np.log(15.0)]),requires_grad=True)
---> 48     log_tau_q_0 = Variable(torch.IntTensor([np.log(36.0)]), requires_grad=True)
     49 
     50     log_lambda1_q = pyro.param("log_lambda1_q", log_lambda1_q_0)

RuntimeError: tried to construct a tensor from a int sequence, but found an item of type numpy.float64 at index (0)

I don’t know what to do, so I’m happy if anyone will fix this program.
And also, is this definition of a deterministic function correct?


#2

Do you have a reason for using IntTensor rather than Tensor? It looks like you could simply

- log_tau_q_0 = Variable(torch.IntTensor([np.log(36.0)]), requires_grad=True)
+ log_tau_q_0 = Variable(torch.Tensor([np.log(36.0)]), requires_grad=True)

#3

If you ever get this to work I’d love to read it!


#4

Here is some modified version that compiles and runs. But I feel that svi doesn’t give a right result. Is there anyone who can fix this version and make svi give a right answer?



import torch
from torch.autograd import Variable

import pyro
import pyro.distributions as dist
from pyro.optim import Adam
from pyro.infer import SVI

import matplotlib
from matplotlib import pyplot as plt
import numpy as np

pyro.clear_param_store()
count_data = np.loadtxt("data/txtdata.csv")

alpha_f = 1.0/count_data.mean()

def model(count_data):
    n_count_data = len(count_data)
    alpha = Variable(torch.Tensor([alpha_f]))
    lambda1 = pyro.sample("lambda1", dist.exponential, alpha)
    lambda2 = pyro.sample("lambda2", dist.exponential, alpha)
    tau = pyro.sample("tau", dist.categorical, ps=Variable(torch.ones(n_count_data+1)), one_hot=False)
    for i in range(n_count_data):
        if (i < tau.data.numpy()[0]):
            pyro.observe("obs_{}".format(i), 
                         dist.poisson, 
                         Variable(torch.Tensor([count_data[i]])),
                         lambda1)
        else:
            pyro.observe("obs_{}".format(i),
                        dist.poisson,
                        Variable(torch.Tensor([count_data[i]])),
                        lambda2)

def guide(count_data):
    n_count_data = len(count_data)
    log_lambda1_q0 = Variable(torch.Tensor([np.log(alpha_f)]), requires_grad=True)
    log_lambda2_q0 = Variable(torch.Tensor([np.log(alpha_f)]), requires_grad=True)
    log_lambda1_q = pyro.param("log_lambda1_q", log_lambda1_q0)
    log_lambda2_q = pyro.param("log_lambda2_q", log_lambda2_q0)
    lambda1_q = pyro.sample("lambda1", dist.exponential, torch.exp(log_lambda1_q))
    lambda2_q = pyro.sample("lambda2", dist.exponential, torch.exp(log_lambda2_q))
  
    log_ps0 = Variable(torch.zeros(n_count_data+1), requires_grad=True)
    log_ps = pyro.param("log_tau_ps", log_ps0)
    ps = torch.exp(log_ps)
    normalised_ps = ps / torch.sum(ps)
    tau_q = pyro.sample("tau", dist.categorical, ps=normalised_ps, one_hot=False) 
    
adam_params = {"lr": 0.0005}
optimizer = Adam(adam_params)

n_steps = 200000
svi = SVI(model, guide, optimizer, loss="ELBO")

for step in range(n_steps):
    svi.step(count_data)
    if step % 100 == 0:
        param_store = pyro.get_param_store()
        log_tau_ps = param_store.get_param("log_tau_ps")
        i = np.argmax(log_tau_ps.data.numpy())
        print("[", step, ":", svi.evaluate_loss(count_data), ":", i, "]")
print("done")

param_store = pyro.get_param_store()
print(1/torch.exp(param_store.get_param("log_lambda1_q")))
print(1/torch.exp(param_store.get_param("log_lambda2_q")))
tau_ps = torch.exp(param_store.get_param("log_tau_ps"))
print(tau_ps / torch.sum(tau_ps))

#5

Did you get this to work?


#6

No, unfortunately. It would be nice if there is some tool that can help me to see what goes on in pyro’s inference algorithm for each example.


#7

i haven’t looked at your code in precise detail, but one problem you’re likely running into is that SVI just isn’t going to be great at doing inference in this kind of model. this is because you have a discrete random variable (in particular one that takes on 70 values). the gradient estimator for this kind of model/guide pair will have very high variance, which will make learning hard.

your best bet would probably be to change the model a little bit and instead do a continuous relaxation of the problem. in particular imagine modeling the change point smoothly instead of with a discrete jump. if you set things up right, you should be able to have a model/guide pair where all the latent random variables are reparameterizable. then you’ll get a much lower variance gradient estimator and learning should proceed much more smoothly. the final answer won’t be identical to the one obtained in the other model, but it should be pretty close.


#8

Hi Martin, can you elaborate on this? Why will this type of model cause high variance in the model/guide? Does this indicate that SVI isn’t the appropriate way to approach this problem, would MCMC be a better choice? Happy to read any resources you might be able to point me to that would help :slight_smile: cc: @tiger


#9

well this is something thats generally true when the so-called score function gradient estimator is used to differentiate expectations w.r.t. discrete random variables (in this case differentiating the elbo).

there is some discussion of this here:
http://pyro.ai/examples/svi_part_iii.html

here’s a recent paper that tries to address this problem:


reading the background there might also be helpful (although it’s probably dense)


#10

How do you go about setting up your model guide so that the random variables are reparameterizable? Does that mean simply swapping out discrete distributions, for one of the reparameterizable distributions mentioned in this paper? What are the appropriate debugging steps to take when the model isn’t training? cc @tiger


#11

yes basically. use any pytorch distribution that has a rsample method.

you could then define an intensity function lambda that depends on 1 or 2 random variables (for example a beta variate whose domain is scaled to the total time range and whose value determines the changepoint).

you could also do a version in which the intensity function depends on 1 or 2 parameters and no random variables (so you’re doing MLE).


#12

So @jvans and I have been reading up on and attempting to implement your recommendations but so far nothing has stuck. We’re hoping some concrete code might help at this point. We have gotten our model and guide to look like:

alpha_f = 1.0 / count_data.mean()
alpha = torch.tensor(alpha_f)

def model(count_data):
    mu1 = pyro.sample("mu1", dist.Exponential(alpha))
    sigma1 = pyro.sample("sigma1", dist.Exponential(torch.tensor(0.1)))
    
    mu2 = pyro.sample("mu2", dist.Exponential(alpha))
    sigma2 = pyro.sample("sigma2", dist.Exponential(torch.tensor(0.1)))
    
    switch_point = pyro.sample("switch_point", dist.Beta(torch.tensor(10.), torch.tensor(10.))) * torch.tensor(float(len(count_data)))
    for i in range(len(count_data)):
        if switch_point > i:
            pyro.sample("obs_{}".format(i), dist.Normal(mu1, sigma1), obs=torch.tensor(count_data[i]))
        else:
            pyro.sample("obs_{}".format(i), dist.Normal(mu2, sigma2), obs=torch.tensor(count_data[i]))

def guide(count_data):
    mu1_param = pyro.param("mu1_param", alpha, constraint=constraints.positive)
    mu1 = pyro.sample("mu1", dist.Exponential(mu1_param))
    
    sigma1_param = pyro.param("sigma1_param", torch.tensor(0.1), constraint=constraints.positive)
    sigma1 = pyro.sample("sigma1", dist.Exponential(sigma1_param))
    
    mu2_param = pyro.param("mu2_param", alpha, constraint=constraints.positive)
    mu2 = pyro.sample("mu2", dist.Exponential(mu2_param))
    
    sigma2_param = pyro.param("sigma2_param", torch.tensor(0.1), constraint=constraints.positive)
    sigma2 = pyro.sample("sigma2", dist.Exponential(sigma2_param))
    
    alpha_param = pyro.param("alpha", torch.tensor(10.))
    beta_param = pyro.param("beta", torch.tensor(10.))
    
    pyro.sample("switch_point", dist.Beta(alpha_param, beta_param))

Unfortunately, we’re getting parameters that can’t make their mind up :confused:.

[ 0 loss: 727.5302200317383 switch point: tensor(27.7731) ]
[ 500 loss: 1935.1901988983154 switch point: tensor(22.6702) ]
[ 1000 loss: 385.3749694824219 switch point: tensor(17.7749) ]
[ 1500 loss: 359.69604778289795 switch point: tensor(11.2152) ]
[ 2000 loss: 327.10622215270996 switch point: tensor(22.0613) ]

As far as I can tell we are using all reparameterizable distributions per Martin’s suggestion. We are vaguely aware of ELBO gradient estimators having high variance, but are unsure exactly how to debug/fix this. Besides that, the model isn’t even moving in the right general direction (we expect it to converge to a switch point ~ 38).

One thing I’d like to explore but am unsure exactly how to is the “continuous relaxation” of the switch point. The pseudocode for this goes something like:

for i in range(len(count_data)):
    distance_from_switch_point = (switch_point - i) / len(count_data)
    mu = mu1 * distance_from_switch_point + mu2 * (1 - distance_from_switch_point)
    sigma = sigma1 * distance_from_switch_point + sigma2 * (1 - distance_from_switch_point)
      
    pyro.sample("obs_{}".format(i), dist.Normal(mu, sigma), obs=count_data[i])

Hoping someone can spot whatever’s glaringly wrong with our current implementation :grimacing:thanks for bearing with us.


#13

One trick that might help reduce gradient variance is to enumerate over switch_point. To do this you can modify the guide to use a full categorical distribution

def guide(count_data):
    ...
    switch_probs = pyro.param("switch_probs", torch.ones(1 + len(count_data)) / (1 + len(count_data)),
                              constraint=constraints.simplex)
    pyro.sample("switch_point", dist.Categorical(switch_probs),
                infer={'enumerate': 'parallel'})

and then use TraceEnum_ELBO to enable parallel enumeration

svi = SVI(model, guide, Adam({}), TracEnum_ELBO(max_iarange_nesting=0))

If you really want the Beta distribution parameters, you can define switch_probs using dist.Beta(...).cdf(my_uniform_grid), but I’d try using Categorical directly.


#14

Hmm interesting @fritzo we’ve tried modeling it with Categorical before but without some of the nuances in your code. Will report back with results.


#15

@tiger for a relaxed version i meant something like this (there are many ways to do this but here is one simple possibility):

def intensity_function(i, switch_point, mu1, mu2, sigma1, sigma2):
    sigmoid = torch.sigmoid(float(i) - switch_point)
    mu = (1.0 - sigmoid) * mu1 + mu2 * sigmoid                
    sigma = (1.0 - sigmoid) * sigma1 + sigma2 * sigmoid
    return mu, sigma

def model(count_data):
    mu1 = pyro.sample("mu1", dist.Exponential(alpha))
    sigma1 = pyro.sample("sigma1", dist.Exponential(torch.tensor(0.1)))
    
    mu2 = pyro.sample("mu2", dist.Exponential(alpha))
    sigma2 = pyro.sample("sigma2", dist.Exponential(torch.tensor(0.1)))
    
    switch_point = pyro.sample("switch_point", dist.Beta(torch.tensor(10.), torch.tensor(10.))) * torch.tensor(float(len(count_data)))
    for i in range(len(count_data)):
        mu_i, sigma_i = intensity_function(i, switch_point, mu1, mu2, sigma1, sigma2)
        pyro.sample("obs_{}".format(i), dist.Normal(mu_i, sigma_i), obs=torch.tensor(count_data[i])) 

you have to be careful if you do things like if switch_point > i: because that takes you off the autodiff graph (hard switches aren’t differentiable)