Bayesian Hackers chapter 1 using pyro

wataruhashimoto52 · November 20, 2017, 12:25am

I am currently trying to write “Bayesian Hackers” using pyro.

import torch
from torch.autograd import Variable
import pyro
import pyro.distributions as dist
from pyro.optim import Adam
from pyro.infer import SVI
%matplotlib inline
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt

pyro.clear_param_store()

figsize(12.5, 3.5)
count_data = np.loadtxt("data/txtdata.csv")
n_count_data = len(count_data)
plt.bar(np.arange(n_count_data), count_data, color="#348ABD")
plt.xlabel("Time (days)")
plt.ylabel("count of text-msgs received")
plt.title("Did the user's texting habits change over time?")
plt.xlim(0, n_count_data);

alpha_f = 1.0/count_data.mean()

def lambda_(tau, lambda_1, lambda_2):
    out = Variable(torch.zeros(n_count_data))
    tau = int(tau.data.numpy())
    for i in range(0, len(out[:tau])):
        out[i] = lambda_1
    for j in range(len(out[:tau]),n_count_data):
        out[j] = lambda_2
    return out

def model(count_data):
    alpha = Variable(torch.Tensor([alpha_f]))
    lambda_1 = pyro.sample("lambda_1", dist.exponential, alpha)
    lambda_2 = pyro.sample("lambda_2" ,dist.exponential, alpha)
    tau = pyro.sample("tau", dist.uniform, Variable(torch.IntTensor([0])), Variable(torch.IntTensor([n_count_data])))
    f = pyro.sample("latent", dist.poisson, lambda_(tau, lambda_1, lambda_2))
    for i in range(n_count_data):
        pyro.observe("obs_{}".format(i), dist.poisson,
                    count_data[i], f)

def guide(count_data):
    log_lambda1_q_0 = Variable(torch.Tensor([np.log(15.0)]),requires_grad=True)
    log_lambda2_q_0 = Variable(torch.Tensor([np.log(15.0)]),requires_grad=True)
    log_tau_q_0 = Variable(torch.IntTensor([np.log(36.0)]), requires_grad=True)
    
    log_lambda1_q = pyro.param("log_lambda1_q", log_lambda1_q_0)
    log_lambda2_q = pyro.param("log_lambda2_q", log_lambda2_q_0)
    log_tau_q = pyro.param("log_tau_q", log_tau_q_0)
    
    lambda1_q = torch.exp(log_lambda1_q)
    lambda2_q = torch.exp(log_lambda2_q)
    tau_q = torch.exp(log_tau_q)
    
    pyro.sample("latent", dist.poisson, lambda_(tau_q, lambda1_q, lambda2_q))
    
adam_params = {"lr":0.0005, "betas":(0.90, 0.999)}
optimizer = Adam(adam_params)

n_steps=4000
svi = SVI(model, guide, optimizer, loss="ELBO")

for step in range(n_steps):
    svi.step(count_data)
    if step % 100 == 0:
        print('.', end='')

But this got errors.

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-102-c0cac78536c6> in <module>()
     65 
     66 for step in range(n_steps):
---> 67     svi.step(count_data)
     68     if step % 100 == 0:
     69         print('.', end='')

~/.pyenv/versions/3.5.2/lib/python3.5/site-packages/pyro/infer/svi.py in step(self, *args, **kwargs)
     96         """
     97         # get loss and compute gradients
---> 98         loss = self.loss_and_grads(self.model, self.guide, *args, **kwargs)
     99 
    100         # get active params

~/.pyenv/versions/3.5.2/lib/python3.5/site-packages/pyro/infer/elbo.py in loss_and_grads(self, model, guide, *args, **kwargs)
     63         :rtype: float
     64         """
---> 65         return self.which_elbo.loss_and_grads(model, guide, *args, **kwargs)

~/.pyenv/versions/3.5.2/lib/python3.5/site-packages/pyro/infer/trace_elbo.py in loss_and_grads(self, model, guide, *args, **kwargs)
    133         trainable_params = set()
    134         # grab a trace from the generator
--> 135         for weight, model_trace, guide_trace, log_r in self._get_traces(model, guide, *args, **kwargs):
    136             elbo_particle = weight * 0
    137             surrogate_elbo_particle = weight * 0

~/.pyenv/versions/3.5.2/lib/python3.5/site-packages/pyro/infer/trace_elbo.py in _get_traces(self, model, guide, *args, **kwargs)
     78                 continue
     79 
---> 80             guide_trace = poutine.trace(guide).get_trace(*args, **kwargs)
     81             model_trace = poutine.trace(poutine.replay(model, guide_trace)).get_trace(*args, **kwargs)
     82 

~/.pyenv/versions/3.5.2/lib/python3.5/site-packages/pyro/poutine/trace_poutine.py in get_trace(self, *args, **kwargs)
    161         Calls this poutine and returns its trace instead of the function's return value.
    162         """
--> 163         self(*args, **kwargs)
    164         return self.trace.copy()
    165 

~/.pyenv/versions/3.5.2/lib/python3.5/site-packages/pyro/poutine/trace_poutine.py in __call__(self, *args, **kwargs)
    149                             name="_INPUT", type="args",
    150                             args=args, kwargs=kwargs)
--> 151         ret = super(TracePoutine, self).__call__(*args, **kwargs)
    152         self.trace.add_node("_RETURN", name="_RETURN", type="return", value=ret)
    153         return ret

~/.pyenv/versions/3.5.2/lib/python3.5/site-packages/pyro/poutine/poutine.py in __call__(self, *args, **kwargs)
     40         """
     41         with self:
---> 42             return self.fn(*args, **kwargs)
     43 
     44     def __enter__(self):

<ipython-input-102-c0cac78536c6> in guide(count_data)
     46     log_lambda1_q_0 = Variable(torch.Tensor([np.log(15.0)]),requires_grad=True)
     47     log_lambda2_q_0 = Variable(torch.Tensor([np.log(15.0)]),requires_grad=True)
---> 48     log_tau_q_0 = Variable(torch.IntTensor([np.log(36.0)]), requires_grad=True)
     49 
     50     log_lambda1_q = pyro.param("log_lambda1_q", log_lambda1_q_0)

RuntimeError: tried to construct a tensor from a int sequence, but found an item of type numpy.float64 at index (0)

I don’t know what to do, so I’m happy if anyone will fix this program.
And also, is this definition of a deterministic function correct?

fritzo · November 20, 2017, 12:38am

Do you have a reason for using IntTensor rather than Tensor? It looks like you could simply

- log_tau_q_0 = Variable(torch.IntTensor([np.log(36.0)]), requires_grad=True)
+ log_tau_q_0 = Variable(torch.Tensor([np.log(36.0)]), requires_grad=True)

taitlarson · December 14, 2017, 8:23pm

If you ever get this to work I’d love to read it!

hongseok · January 12, 2018, 2:51pm

Here is some modified version that compiles and runs. But I feel that svi doesn’t give a right result. Is there anyone who can fix this version and make svi give a right answer?


import torch
from torch.autograd import Variable

import pyro
import pyro.distributions as dist
from pyro.optim import Adam
from pyro.infer import SVI

import matplotlib
from matplotlib import pyplot as plt
import numpy as np

pyro.clear_param_store()
count_data = np.loadtxt("data/txtdata.csv")

alpha_f = 1.0/count_data.mean()

def model(count_data):
    n_count_data = len(count_data)
    alpha = Variable(torch.Tensor([alpha_f]))
    lambda1 = pyro.sample("lambda1", dist.exponential, alpha)
    lambda2 = pyro.sample("lambda2", dist.exponential, alpha)
    tau = pyro.sample("tau", dist.categorical, ps=Variable(torch.ones(n_count_data+1)), one_hot=False)
    for i in range(n_count_data):
        if (i < tau.data.numpy()[0]):
            pyro.observe("obs_{}".format(i), 
                         dist.poisson, 
                         Variable(torch.Tensor([count_data[i]])),
                         lambda1)
        else:
            pyro.observe("obs_{}".format(i),
                        dist.poisson,
                        Variable(torch.Tensor([count_data[i]])),
                        lambda2)

def guide(count_data):
    n_count_data = len(count_data)
    log_lambda1_q0 = Variable(torch.Tensor([np.log(alpha_f)]), requires_grad=True)
    log_lambda2_q0 = Variable(torch.Tensor([np.log(alpha_f)]), requires_grad=True)
    log_lambda1_q = pyro.param("log_lambda1_q", log_lambda1_q0)
    log_lambda2_q = pyro.param("log_lambda2_q", log_lambda2_q0)
    lambda1_q = pyro.sample("lambda1", dist.exponential, torch.exp(log_lambda1_q))
    lambda2_q = pyro.sample("lambda2", dist.exponential, torch.exp(log_lambda2_q))
  
    log_ps0 = Variable(torch.zeros(n_count_data+1), requires_grad=True)
    log_ps = pyro.param("log_tau_ps", log_ps0)
    ps = torch.exp(log_ps)
    normalised_ps = ps / torch.sum(ps)
    tau_q = pyro.sample("tau", dist.categorical, ps=normalised_ps, one_hot=False) 
    
adam_params = {"lr": 0.0005}
optimizer = Adam(adam_params)

n_steps = 200000
svi = SVI(model, guide, optimizer, loss="ELBO")

for step in range(n_steps):
    svi.step(count_data)
    if step % 100 == 0:
        param_store = pyro.get_param_store()
        log_tau_ps = param_store.get_param("log_tau_ps")
        i = np.argmax(log_tau_ps.data.numpy())
        print("[", step, ":", svi.evaluate_loss(count_data), ":", i, "]")
print("done")

param_store = pyro.get_param_store()
print(1/torch.exp(param_store.get_param("log_lambda1_q")))
print(1/torch.exp(param_store.get_param("log_lambda2_q")))
tau_ps = torch.exp(param_store.get_param("log_tau_ps"))
print(tau_ps / torch.sum(tau_ps))

vincent · January 31, 2018, 3:31pm

Did you get this to work?

hongseok · January 31, 2018, 5:19pm

No, unfortunately. It would be nice if there is some tool that can help me to see what goes on in pyro’s inference algorithm for each example.

martinjankowiak · January 31, 2018, 11:01pm

i haven’t looked at your code in precise detail, but one problem you’re likely running into is that SVI just isn’t going to be great at doing inference in this kind of model. this is because you have a discrete random variable (in particular one that takes on 70 values). the gradient estimator for this kind of model/guide pair will have very high variance, which will make learning hard.

your best bet would probably be to change the model a little bit and instead do a continuous relaxation of the problem. in particular imagine modeling the change point smoothly instead of with a discrete jump. if you set things up right, you should be able to have a model/guide pair where all the latent random variables are reparameterizable. then you’ll get a much lower variance gradient estimator and learning should proceed much more smoothly. the final answer won’t be identical to the one obtained in the other model, but it should be pretty close.

jvans · May 10, 2018, 2:54pm

Hi Martin, can you elaborate on this? Why will this type of model cause high variance in the model/guide? Does this indicate that SVI isn’t the appropriate way to approach this problem, would MCMC be a better choice? Happy to read any resources you might be able to point me to that would help cc: @tiger

martinjankowiak · May 10, 2018, 4:18pm

well this is something thats generally true when the so-called score function gradient estimator is used to differentiate expectations w.r.t. discrete random variables (in this case differentiating the elbo).

there is some discussion of this here:
http://pyro.ai/examples/svi_part_iii.html

here’s a recent paper that tries to address this problem:

reading the background there might also be helpful (although it’s probably dense)

jvans · June 2, 2018, 8:45pm

How do you go about setting up your model guide so that the random variables are reparameterizable? Does that mean simply swapping out discrete distributions, for one of the reparameterizable distributions mentioned in this paper? What are the appropriate debugging steps to take when the model isn’t training? cc @tiger

martinjankowiak · June 2, 2018, 10:30pm

yes basically. use any pytorch distribution that has a rsample method.

you could then define an intensity function lambda that depends on 1 or 2 random variables (for example a beta variate whose domain is scaled to the total time range and whose value determines the changepoint).

you could also do a version in which the intensity function depends on 1 or 2 parameters and no random variables (so you’re doing MLE).

tiger · June 10, 2018, 8:30pm

So @jvans and I have been reading up on and attempting to implement your recommendations but so far nothing has stuck. We’re hoping some concrete code might help at this point. We have gotten our model and guide to look like:

alpha_f = 1.0 / count_data.mean()
alpha = torch.tensor(alpha_f)

def model(count_data):
    mu1 = pyro.sample("mu1", dist.Exponential(alpha))
    sigma1 = pyro.sample("sigma1", dist.Exponential(torch.tensor(0.1)))
    
    mu2 = pyro.sample("mu2", dist.Exponential(alpha))
    sigma2 = pyro.sample("sigma2", dist.Exponential(torch.tensor(0.1)))
    
    switch_point = pyro.sample("switch_point", dist.Beta(torch.tensor(10.), torch.tensor(10.))) * torch.tensor(float(len(count_data)))
    for i in range(len(count_data)):
        if switch_point > i:
            pyro.sample("obs_{}".format(i), dist.Normal(mu1, sigma1), obs=torch.tensor(count_data[i]))
        else:
            pyro.sample("obs_{}".format(i), dist.Normal(mu2, sigma2), obs=torch.tensor(count_data[i]))

def guide(count_data):
    mu1_param = pyro.param("mu1_param", alpha, constraint=constraints.positive)
    mu1 = pyro.sample("mu1", dist.Exponential(mu1_param))
    
    sigma1_param = pyro.param("sigma1_param", torch.tensor(0.1), constraint=constraints.positive)
    sigma1 = pyro.sample("sigma1", dist.Exponential(sigma1_param))
    
    mu2_param = pyro.param("mu2_param", alpha, constraint=constraints.positive)
    mu2 = pyro.sample("mu2", dist.Exponential(mu2_param))
    
    sigma2_param = pyro.param("sigma2_param", torch.tensor(0.1), constraint=constraints.positive)
    sigma2 = pyro.sample("sigma2", dist.Exponential(sigma2_param))
    
    alpha_param = pyro.param("alpha", torch.tensor(10.))
    beta_param = pyro.param("beta", torch.tensor(10.))
    
    pyro.sample("switch_point", dist.Beta(alpha_param, beta_param))

Unfortunately, we’re getting parameters that can’t make their mind up .

[ 0 loss: 727.5302200317383 switch point: tensor(27.7731) ]
[ 500 loss: 1935.1901988983154 switch point: tensor(22.6702) ]
[ 1000 loss: 385.3749694824219 switch point: tensor(17.7749) ]
[ 1500 loss: 359.69604778289795 switch point: tensor(11.2152) ]
[ 2000 loss: 327.10622215270996 switch point: tensor(22.0613) ]

As far as I can tell we are using all reparameterizable distributions per Martin’s suggestion. We are vaguely aware of ELBO gradient estimators having high variance, but are unsure exactly how to debug/fix this. Besides that, the model isn’t even moving in the right general direction (we expect it to converge to a switch point ~ 38).

One thing I’d like to explore but am unsure exactly how to is the “continuous relaxation” of the switch point. The pseudocode for this goes something like:

for i in range(len(count_data)):
    distance_from_switch_point = (switch_point - i) / len(count_data)
    mu = mu1 * distance_from_switch_point + mu2 * (1 - distance_from_switch_point)
    sigma = sigma1 * distance_from_switch_point + sigma2 * (1 - distance_from_switch_point)
      
    pyro.sample("obs_{}".format(i), dist.Normal(mu, sigma), obs=count_data[i])

Hoping someone can spot whatever’s glaringly wrong with our current implementation thanks for bearing with us.

fritzo · June 11, 2018, 5:19pm

One trick that might help reduce gradient variance is to enumerate over switch_point. To do this you can modify the guide to use a full categorical distribution

def guide(count_data):
    ...
    switch_probs = pyro.param("switch_probs", torch.ones(1 + len(count_data)) / (1 + len(count_data)),
                              constraint=constraints.simplex)
    pyro.sample("switch_point", dist.Categorical(switch_probs),
                infer={'enumerate': 'parallel'})

and then use TraceEnum_ELBO to enable parallel enumeration

svi = SVI(model, guide, Adam({}), TracEnum_ELBO(max_iarange_nesting=0))

If you really want the Beta distribution parameters, you can define switch_probs using dist.Beta(...).cdf(my_uniform_grid), but I’d try using Categorical directly.

tiger · June 11, 2018, 6:37pm

Hmm interesting @fritzo we’ve tried modeling it with Categorical before but without some of the nuances in your code. Will report back with results.

martinjankowiak · June 14, 2018, 6:04pm

@tiger for a relaxed version i meant something like this (there are many ways to do this but here is one simple possibility):

def intensity_function(i, switch_point, mu1, mu2, sigma1, sigma2):
    sigmoid = torch.sigmoid(float(i) - switch_point)
    mu = (1.0 - sigmoid) * mu1 + mu2 * sigmoid                
    sigma = (1.0 - sigmoid) * sigma1 + sigma2 * sigmoid
    return mu, sigma

def model(count_data):
    mu1 = pyro.sample("mu1", dist.Exponential(alpha))
    sigma1 = pyro.sample("sigma1", dist.Exponential(torch.tensor(0.1)))
    
    mu2 = pyro.sample("mu2", dist.Exponential(alpha))
    sigma2 = pyro.sample("sigma2", dist.Exponential(torch.tensor(0.1)))
    
    switch_point = pyro.sample("switch_point", dist.Beta(torch.tensor(10.), torch.tensor(10.))) * torch.tensor(float(len(count_data)))
    for i in range(len(count_data)):
        mu_i, sigma_i = intensity_function(i, switch_point, mu1, mu2, sigma1, sigma2)
        pyro.sample("obs_{}".format(i), dist.Normal(mu_i, sigma_i), obs=torch.tensor(count_data[i]))

you have to be careful if you do things like if switch_point > i: because that takes you off the autodiff graph (hard switches aren’t differentiable)

nelsongriffiths · August 23, 2019, 2:17pm

@tiger did you ever find a full working solution to this problem? I am trying to go through the Bayesian Hackers book in pyro instead of pymc3 but I am having no luck. It would be cool to learn from your working solution if you found one.

odats · October 20, 2020, 2:30pm

Can you please provide code for the model? I have tried a different implementation but I can not make it work with: {'enumerate': 'parallel'}