How does pyro.random_module match priors with RegressionModel parameters?

medcode · November 29, 2018, 10:40pm

Hi, I’m following the Bayesian regression tutorial.
I thought that the names of the priors (here: priors = {'linear.weight': w_prior, 'linear.bias': b_prior}) have to correspond to the parameter names in RegressionModel so that they can be matched, but it appears that using other names works as well (as long as they are consistent between model and guide). So how does Pyro match priors with model parameters?
Thanks.

jpchen · November 30, 2018, 2:22am

No, the names need to match… Can you give an example in which misnaming the parameters works?

medcode · December 1, 2018, 12:04am

Hi, thanks for considering my question. I looked into it some more and can see that names need to match but I’m still a bit confused about what is happening when names don’t match. The original output of the Bayesian regression tutorial is:

[iteration 0001] loss: 451.4380
[iteration 0101] loss: 9.5868
[iteration 0201] loss: 1.7982
[iteration 0301] loss: -0.7607
[iteration 0401] loss: -1.2017
[iteration 0501] loss: -1.2463
[iteration 0601] loss: -1.2553
[iteration 0701] loss: -1.2556
[iteration 0801] loss: -1.2098
[iteration 0901] loss: -1.1862
[guide_mean_weight]: 2.995
[guide_log_scale_weight]: -3.917
[guide_mean_bias]: 0.994
[guide_log_scale_bias]: -4.138

If I rename priors/dists to ‘foo’ and ‘bar’ I get:

[iteration 0001] loss: 417.5909
[iteration 0101] loss: 1.0944
[iteration 0201] loss: -0.7607
[iteration 0301] loss: -1.2853
[iteration 0401] loss: -1.3700
[iteration 0501] loss: -1.3785
[iteration 0601] loss: -1.3790
[iteration 0701] loss: -1.3790
[iteration 0801] loss: -1.3790
[iteration 0901] loss: -1.3790
[guide_mean_weight]: 0.728
[guide_log_scale_weight]: -2.993
[guide_mean_bias]: 0.814
[guide_log_scale_bias]: -3.105
[module$$$linear.weight]: 2.991
[module$$$linear.bias]: 1.005

Since I’m calling SVI with loss=Trace_ELBO() how is the loss being minimized? Am I just getting a maximum likelihood estimate for module$$$linear.weight and module$$$linear.bias?
Full code below.

import os
import numpy as np
import torch
import torch.nn as nn

import pyro
from pyro.distributions import Normal
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam

if __name__ == "__main__":
    # True when the 'CI' environment variable is set; used further down to
    # shorten the training run.
    smoke_test = "CI" in os.environ
    pyro.enable_validation(True)

    # number of rows in the toy dataset
    N = 100

    def build_linear_dataset(N, p=1, noise_std=0.01):
        """Build a noisy linear toy dataset and return it as one tensor.

        Features are drawn with ``np.random.rand``; each target is
        ``3 * sum(features) + 1`` plus Gaussian noise with standard
        deviation ``noise_std``.

        Returns a tensor of shape ``(N, p + 1)``: the ``p`` feature columns
        followed by the target column.
        """
        features = np.random.rand(N, p)
        # true weights: w = 3 for every feature
        true_w = 3 * np.ones(p)
        # true bias: b = 1, added to every row before the noise
        targets = np.matmul(features, true_w) + np.repeat(1, N)
        targets = targets + np.random.normal(0, noise_std, size=N)
        targets = targets.reshape(N, 1)
        # convert both arrays to the default torch tensor type
        features = torch.tensor(features).type(torch.Tensor)
        targets = torch.tensor(targets).type(torch.Tensor)
        data = torch.cat((features, targets), 1)
        assert data.shape == (N, p + 1)
        return data
        
    class RegressionModel(nn.Module):
        """A single linear layer mapping ``p`` input features to one output."""

        def __init__(self, p):
            super().__init__()
            # p = number of features
            self.linear = nn.Linear(p, 1)

        def forward(self, x):
            """Apply the linear layer to ``x`` and return the result."""
            prediction = self.linear(x)
            return prediction

    # the module instance shared by the model and the guide
    regression_model = RegressionModel(1)

    def model(data):
        """Generative model: linear regression with Normal priors.

        ``data`` holds the features in every column but the last and the
        observed target in the last column.
        """
        # Normal(0, 10) priors over the weight and the bias
        weight_prior = Normal(torch.zeros(1, 1), 10 * torch.ones(1, 1)).independent(1)
        bias_prior = Normal(torch.zeros(1), 10 * torch.ones(1)).independent(1)
        # These keys are deliberately different from the module's parameter
        # names; the matching version would be
        # {'linear.weight': weight_prior, 'linear.bias': bias_prior}
        priors = {'foo': weight_prior, 'bar': bias_prior}
        # lift module parameters to random variables sampled from the priors
        lifted_module = pyro.random_module("module", regression_model, priors)
        # sample a regressor from the lifted module
        sampled_regressor = lifted_module()
        features, targets = data[:, :-1], data[:, -1]
        with pyro.iarange("map", N):
            # run the regressor forward on the features
            prediction_mean = sampled_regressor(features).squeeze(-1)
            # condition on the observed targets (fixed noise scale of 0.1)
            obs_scale = 0.1 * torch.ones(data.size(0))
            pyro.sample("obs", Normal(prediction_mean, obs_scale), obs=targets)
                        
    # maps the unconstrained "log scale" parameters to positive scales
    softplus = torch.nn.Softplus()

    def guide(data):
        """Variational guide: independent Normal distributions over w and b.

        Registers four learnable parameters in the param store (mean and
        unconstrained scale for the weight and for the bias) and returns a
        regressor sampled from the lifted module. ``data`` is unused; it is
        accepted so that the signature matches ``model``.
        """
        # initial values for the variational parameters
        w_loc = torch.randn(1, 1)
        # Scales start out narrow: softplus(-3) is roughly 0.05. The
        # arithmetic already produces a fresh tensor, so it is not wrapped in
        # torch.tensor() again (copy-constructing from a tensor is redundant
        # and triggers a UserWarning on current PyTorch).
        w_log_sig = -3.0 * torch.ones(1, 1) + 0.05 * torch.randn(1, 1)
        b_loc = torch.randn(1)
        b_log_sig = -3.0 * torch.ones(1) + 0.05 * torch.randn(1)
        # register learnable params in the param store
        mw_param = pyro.param("guide_mean_weight", w_loc)
        sw_param = softplus(pyro.param("guide_log_scale_weight", w_log_sig))
        mb_param = pyro.param("guide_mean_bias", b_loc)
        sb_param = softplus(pyro.param("guide_log_scale_bias", b_log_sig))
        # guide distributions for w and b
        w_dist = Normal(mw_param, sw_param).independent(1)
        b_dist = Normal(mb_param, sb_param).independent(1)
        # The keys mirror the ones used in model(); the version matching the
        # module's parameter names would be
        # {'linear.weight': w_dist, 'linear.bias': b_dist}
        dists = {'foo': w_dist, 'bar': b_dist}
        # overload the parameters in the module with random samples
        # from the guide distributions
        lifted_module = pyro.random_module("module", regression_model, dists)
        # sample a regressor (this call is what runs the sample statements)
        return lifted_module()
    
    optim = Adam({"lr": 0.05})
    svi = SVI(model, guide, optim, loss=Trace_ELBO())
    # a smoke test only needs to show that a couple of steps run
    num_iterations = 1000 if not smoke_test else 2

    # start from an empty param store before training
    pyro.clear_param_store()
    data = build_linear_dataset(N)
    for j in range(num_iterations):
        # calculate the loss and take a gradient step
        loss = svi.step(data)
        if j % 100 == 0:
            # report the loss per data point
            print("[iteration %04d] loss: %.4f" % (j + 1, loss / float(N)))

    # Every parameter in this example holds exactly one element, so .item()
    # yields the Python float that '%.3f' expects. Formatting the ndarray
    # from .numpy() relied on implicit array-to-scalar conversion.
    for name in pyro.get_param_store().get_all_param_names():
        print("[%s]: %.3f" % (name, pyro.param(name).item()))

jpchen · December 1, 2018, 7:20am

Then random_module is basically doing nothing: none of your module parameters are being lifted to random variables, so it is just running a normal nn (i.e. there is no connection between your priors and the data). This is evident from your output: the param names module$$$[param] show that the params haven't been lifted, and they match the MLE estimates of the Bayesian version above. In Pyro 0.3, you will get a warning that your names don't match.

medcode · December 5, 2018, 11:43am

Terrific, thanks a lot for your help. One more dumb question: how would I change the code if I wanted to keep some of the parameters (say linear.bias in the regression model) constant? To be clear: the final output for module$$$linear.bias should be equal to the initialization of linear.bias and not equal to 1. Can I still use pyro.random_module for this?

p.s. I guess I could use a Delta distribution but this seems a bit ugly…

FunnyBear · May 15, 2019, 7:10am

When we name the priors, can we add the instance name as a prefix, like the following? That is, make sure the prior names are identical between model and guide, and the dimensions are consistent with the variables/tensors that they represent.

def model(data):
    ...
    # prior keys carry the module instance name as a prefix
    priors = {'regression_model.linear.weight': w_dist, 'regression_model.linear.bias': b_dist}
    ...
    lifted_module = pyro.random_module("module", regression_model, priors)
    ...

def guide(data):
    ...
    # same instance-name-prefixed keys as in the model snippet
    # NOTE(review): whether these prefixed keys match any module parameter
    # depends on nn_module.named_parameters() -- confirm before relying on it
    priors = {'regression_model.linear.weight': w_dist, 'regression_model.linear.bias': b_dist}
    ...
    lifted_module = pyro.random_module("module", regression_model, priors)
    ...

I tried this on a convnet, with the convnet instance name as the prefix, and the prediction accuracy is much higher than what I get without the prefix, something like 97% vs 90%. I hope it is OK to deviate the name of the priors from the names in the definition of the convnet

What is the exact process of connecting the priors with the data? I searched the Pyro documentation and the website, but did not find the details. Thanks.

jpchen · May 15, 2019, 11:49pm

What is the exact process of connecting the priors with the data?

it uses the name you would get if you printed the names in nn_module.named_parameters():

# print each parameter name exactly as the module reports it
for param_name, _param in nn_module.named_parameters():
    print(param_name)

nse · August 21, 2019, 12:04am

Hi, I think I’m experiencing a related issue. I’m following along with the bayesian_regression.py example on github, but I’ve adapted it a bit for my particular application. I’m getting the following warning: “UserWarning: Found vars in model but not guide: {‘module$$$linear.weight’, ‘module$$$linear.bias’}” even though I’m using the same parameter naming convention in the model and guide - it seems like something’s off, is this typical?

jpchen · August 21, 2019, 1:39am

please post your code if you require debugging. that error means that you do not have a corresponding random_module with the same names in your guide.

nse · August 21, 2019, 1:11pm

Thanks for your reply. Here’s the code (sorry, don’t know how to post formatted code); as you can see, the “dists” passed to the random_module in guide_custom have the same names as the “priors” passed to the random_module in the model:

import os
import torch
import torch.nn as nn
import pyro
from pyro.distributions import Normal, Uniform, Delta
from pyro.optim import Adam
from pyro.infer import EmpiricalMarginal, SVI, Trace_ELBO, TracePredictive
from chemio_2 import process_data

# True when the 'CI' environment variable is set
smoke_test = 'CI' in os.environ
# the script is pinned to the Pyro 0.4.1 release line
assert pyro.__version__.startswith('0.4.1')
pyro.enable_validation(True)
# fix the random seed so runs are repeatable
pyro.set_rng_seed(1)


class RegressionModel(nn.Module):
    """Linear layer with ``n_subs * fp_len`` (5 * 512) inputs and one output."""

    def __init__(self):
        super().__init__()
        self.n_subs = 5
        self.fp_len = 512

        n_inputs = int(self.fp_len * self.n_subs)
        self.linear = nn.Linear(n_inputs, 1)

    def forward(self, x):
        """Return the linear layer's output for ``x``."""
        return self.linear(x)


# the module instance that model() and guide_custom() both lift
regression_model = RegressionModel()
# NOTE(review): loss_fn is not used anywhere in the code shown
loss_fn = torch.nn.MSELoss()
# NOTE(review): this torch optimizer is rebound further down by
# `optim = Adam({"lr": 0.005})` before it is ever used
optim = torch.optim.Adam(regression_model.parameters(), lr=0.005)
num_iterations = 1000
# process_data comes from the project-local chemio_2 module
X, Y = process_data('file.csv')
# number of data points; used as the plate size in model()
N = len(Y)


def model(x_data, y_data):
    """Bayesian linear regression with Normal(0, 1) priors on weight and bias.

    Samples one regressor from the lifted module, conditions on ``y_data``
    and returns the predicted mean for ``x_data``.
    """
    dtype_kwargs = dict(dtype=x_data.dtype)
    weight = regression_model.linear.weight
    bias = regression_model.linear.bias

    # weight and bias priors, shaped like the parameters they stand in for
    w_prior = Normal(torch.zeros_like(weight, **dtype_kwargs),
                     torch.ones_like(weight, **dtype_kwargs)).to_event(1)
    b_prior = Normal(torch.zeros_like(bias, **dtype_kwargs),
                     torch.ones_like(bias, **dtype_kwargs)).to_event(1)

    # keys are the parameter names of regression_model
    priors = {'linear.weight': w_prior, 'linear.bias': b_prior}

    # wrap the module, then call the wrapper to sample a concrete regressor
    sampled_regressor = pyro.random_module("module", regression_model, priors)()

    with pyro.plate("map", N):
        prediction_mean = sampled_regressor(x_data).squeeze(-1)
        pyro.sample("obs", Normal(prediction_mean, 1), obs=y_data)
        return prediction_mean


def guide_custom(x_data, y_data):
    """Variational guide with Normal distributions over the weight and bias.

    Registers four learnable parameters in the param store (mean and
    unconstrained scale for the weight and for the bias) and returns a
    regressor sampled from the lifted module. ``y_data`` is unused; it is
    accepted so that the signature matches ``model``.
    """
    # initial values; scales start narrow because softplus(-3) is about 0.05
    w_loc = torch.randn(1, (5 * 512), dtype=x_data.dtype)
    w_sc = -3 + 0.05 * torch.randn(1, (5 * 512), dtype=x_data.dtype)
    b_loc = torch.randn(1, dtype=x_data.dtype)
    b_sc = -3 + 0.05 * torch.randn(1, dtype=x_data.dtype)

    # register learnable params; softplus keeps the scales positive
    mw_param = pyro.param("guide_mean_weight", w_loc)
    sw_param = softplus(pyro.param("guide_scale_weight", w_sc))
    mb_param = pyro.param("guide_mean_bias", b_loc)
    sb_param = softplus(pyro.param("guide_scale_bias", b_sc))
    w_dist = Normal(mw_param, sw_param).to_event(1)
    b_dist = Normal(mb_param, sb_param).to_event(1)

    # same keys as the priors in model()
    dists = {'linear.weight': w_dist, 'linear.bias': b_dist}

    lifted_module = pyro.random_module("module", regression_model, dists)

    # random_module only wraps the module; the sample statements run when the
    # wrapper is called. Returning it uncalled leaves the guide without the
    # module$$$linear.* sites ("Found vars in model but not guide").
    return lifted_module()


# used by guide_custom() to turn unconstrained parameters into positive scales
softplus = nn.Softplus()
# Pyro's Adam wrapper; rebinds the name `optim` assigned earlier in the script
optim = Adam({"lr": 0.005})
# stochastic variational inference over model / guide_custom with the ELBO loss
svi = SVI(model, guide_custom, optim, loss=Trace_ELBO(), num_samples=1000)


def train():
    """Run ``num_iterations`` SVI steps, printing the loss every 10th step.

    The param store is cleared first; the printed loss is divided by the
    number of data points.
    """
    pyro.clear_param_store()
    for step in range(num_iterations):
        # one gradient step; svi.step returns the loss for this step
        loss = svi.step(X, Y)
        if step % 10 != 0:
            continue
        print("[iteration %04d] loss: %.4f" % (step, loss / len(Y)))


train()

jpchen · August 21, 2019, 7:02pm

that looks right to me… what do you get if you print the param store?

nse · August 21, 2019, 7:16pm

It correctly prints the names of the params defined in guide_custom, e.g.:

('guide_mean_weight', array([[-0.6825683 ,  0.425686  ,  0.5112802 , ...,  2.9112306 ,
         1.023813  ,  0.49693003]], dtype=float32))
('guide_scale_weight', array([[-3.0119145, -2.9838517, -2.995715 , ..., -3.007945 , -2.9901423,
        -3.0141273]], dtype=float32))
('guide_mean_bias', array([1.4889317], dtype=float32))
('guide_scale_bias', array([-2.9881005], dtype=float32))

It wouldn’t be related to the pyro version, would it? I’m using 0.4.1.

jpchen · August 21, 2019, 9:45pm

ah i see the problem now.

lifted_module = pyro.random_module("module", regression_model, dists)

only wraps the module and overloads params with sample statements. you need to actually call the module to perform a sample step. if you run lifted_module() in your guide, that should take care of your problem.

nse · August 21, 2019, 9:57pm

Yes, that worked - the example on github is correct here, just a typo on my end. Thanks for your help!