Bayesian model accuracy not improving

I’m trying to follow the tutorial found at: Making Your Neural Network Say “I Don’t Know” — Bayesian NNs using Pyro and PyTorch | by Paras Chopra | Towards Data Science

But for some reason my regression model doesn’t seem to improve in accuracy: it trains, but it stalls around the first accuracy score.

I’ve put the example code below; in this case it’s a simple linear regression, and it should work out of the box:

import torch
import numpy as np
from torchvision import transforms
import random
import pyro
from pyro.distributions import Normal, Uniform
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam


class NN(torch.nn.Module):

    def __init__(self, input_size, hidden_size, output_size):
        super(NN, self).__init__()
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.out = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        output = self.fc1(x)
        return output

net = NN(1, 1, 1)

x = np.random.uniform(0,1,500)
random.shuffle(x)
m = 3
c = 4
y = (m*x) + c 

y = y/y.max()

transform = transforms.Compose([transforms.ToTensor()])

x = torch.tensor(x,dtype=torch.float).unsqueeze(-1)
y = torch.tensor(y,dtype=torch.float)


def model(x_data, y_data):

    fc1w_prior = Normal(loc=torch.zeros_like(net.fc1.weight), scale=torch.ones_like(net.fc1.weight))
    fc1b_prior = Normal(loc=torch.zeros_like(net.fc1.bias), scale=torch.ones_like(net.fc1.bias))

    outw_prior = Normal(loc=torch.zeros_like(net.out.weight), scale=torch.ones_like(net.out.weight))
    outb_prior = Normal(loc=torch.zeros_like(net.out.bias), scale=torch.ones_like(net.out.bias))

    priors = {'fc1.weight': fc1w_prior, 'fc1.bias': fc1b_prior, 'out.weight': outw_prior, 'out.bias': outb_prior}

    # lift module parameters to random variables sampled from the priors
    lifted_module = pyro.random_module("module", net, priors)
    # sample a regressor (which also samples w and b)
    lifted_reg_model = lifted_module()
scale = pyro.sample("sigma", Uniform(0., 10.))
lhat = torch.nn.LogSoftmax(lifted_reg_model(x_data))

pyro.sample("obs", pyro.distributions.Normal(lhat, scale), obs=y_data)



softplus = torch.nn.Softplus()

def guide(x_data, y_data):

    # First layer weight distribution priors
    fc1w_mu = torch.randn_like(net.fc1.weight)
    fc1w_sigma = torch.randn_like(net.fc1.weight)
    fc1w_mu_param = pyro.param("fc1w_mu", fc1w_mu)
    fc1w_sigma_param = softplus(pyro.param("fc1w_sigma", fc1w_sigma))
    fc1w_prior = Normal(loc=fc1w_mu_param, scale=fc1w_sigma_param)
    # First layer bias distribution priors
    fc1b_mu = torch.randn_like(net.fc1.bias)
    fc1b_sigma = torch.randn_like(net.fc1.bias)
    fc1b_mu_param = pyro.param("fc1b_mu", fc1b_mu)
    fc1b_sigma_param = softplus(pyro.param("fc1b_sigma", fc1b_sigma))
    fc1b_prior = Normal(loc=fc1b_mu_param, scale=fc1b_sigma_param)
    # Output layer weight distribution priors
    outw_mu = torch.randn_like(net.out.weight)
    outw_sigma = torch.randn_like(net.out.weight)
    outw_mu_param = pyro.param("outw_mu", outw_mu)
    outw_sigma_param = softplus(pyro.param("outw_sigma", outw_sigma))
    outw_prior = Normal(loc=outw_mu_param, scale=outw_sigma_param).independent(1)
    # Output layer bias distribution priors
    outb_mu = torch.randn_like(net.out.bias)
    outb_sigma = torch.randn_like(net.out.bias)
    outb_mu_param = pyro.param("outb_mu", outb_mu)
    outb_sigma_param = softplus(pyro.param("outb_sigma", outb_sigma))
    outb_prior = Normal(loc=outb_mu_param, scale=outb_sigma_param)
    priors = {'fc1.weight': fc1w_prior, 'fc1.bias': fc1b_prior, 'out.weight': outw_prior, 'out.bias': outb_prior}

    lifted_module = pyro.random_module("module", net, priors)

    return lifted_module()

optim = Adam({"lr": 0.01})
svi = SVI(model, guide, optim, loss=Trace_ELBO())

num_iterations = 50
loss = 0

for j in range(num_iterations):
    loss = 0
    for idx, data in enumerate(x):
        # calculate the loss and take a gradient step
        loss += svi.step(data, y[idx])
    normalizer_train = len(x)
    total_epoch_loss_train = loss / normalizer_train

    print("Epoch ", j, " Loss ", total_epoch_loss_train)

###############################################################################

num_samples = 10
def predict(x):
    sampled_models = [guide(None, None) for _ in range(num_samples)]
    yhats = [model(x).data for model in sampled_models]
    mean = torch.mean(torch.stack(yhats), 0)
    return mean.numpy()

print('Prediction when network is forced to predict')
correct = 0
total = 0
predictions = []
for idx, x_test in enumerate(x):
    predicted = predict(x_test)
    predictions.append(predicted)
    total += 1
    correct += np.abs((predicted - y[idx].numpy()))
print("average dist: %.2f " % (correct / total))
predictions = np.hstack(predictions)
print(np.max(predictions), np.min(predictions))

Any help on this matter would be greatly appreciated, thanks in advance!

to be blunt: the person who wrote that tutorial doesn’t understand variational inference and didn’t use pyro correctly. consequently i wouldn’t recommend trying to reproduce what was done there. broadly, bayesian neural networks are an active area of research and it can be challenging to get them to work, especially if one is trying to be bayesian about a large number of neural network parameters.

Haha! Okay, I understand. With that in mind, would a Bayesian NN going from 10 inputs to 5 hidden nodes to 1 output node be something Pyro is unsuited for?

that is a very tiny neural network so even the most naive approaches should work. pyro can certainly handle a problem of this sort. still, one needs to be careful to e.g.

  • handle mini-batching correctly
  • initialize the guide reasonably
  • use an appropriate optimizer, learning rate, etc

i suspect you’ll have an easier time getting things to work if you do gradient steps on mini-batches of data instead of single datapoints; see the sketch below.
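
For concreteness, here is a minimal sketch of that kind of mini-batched SVI on the same toy linear data (y = 3x + 4, rescaled to [0, 1]). It assumes a recent Pyro 1.x release; the AutoDiagonalNormal guide, the init_scale value, the LogNormal noise prior, the batch size of 64, and the learning rate are illustrative choices rather than anything prescribed in this thread:

import torch
import pyro
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO
from pyro.infer.autoguide import AutoDiagonalNormal
from pyro.optim import Adam

# toy data: y = 3x + 4, rescaled to [0, 1] as in the original post
x = torch.rand(500)
y = 3 * x + 4
y = y / y.max()

def model(x, y):
    # priors on slope, intercept and observation noise
    w = pyro.sample("w", dist.Normal(0., 1.))
    b = pyro.sample("b", dist.Normal(0., 1.))
    sigma = pyro.sample("sigma", dist.LogNormal(0., 1.))
    # pyro.plate draws a random mini-batch of indices and rescales
    # the likelihood by size / subsample_size
    with pyro.plate("data", size=len(x), subsample_size=64) as idx:
        pyro.sample("obs", dist.Normal(w * x[idx] + b, sigma), obs=y[idx])

# automatically constructed mean-field guide, initialised with a small scale
guide = AutoDiagonalNormal(model, init_scale=0.05)

svi = SVI(model, guide, Adam({"lr": 0.01}), loss=Trace_ELBO())

for step in range(2000):
    loss = svi.step(x, y)
    if step % 200 == 0:
        print("step", step, "elbo loss", loss)

# approximate posterior medians of w, b and sigma
print(guide.median())

With only 500 points you could also just pass the whole dataset at every step, but pyro.plate with subsample_size is the pattern that handles the mini-batch rescaling of the likelihood correctly, and the autoguide takes care of reasonable initialisation of the variational parameters.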