I am trying to extend the Bayesian Linear Regression tutorial to accommodate more features (and eventually more layers).
In the tutorial, predictions are made by sampling the model several times via guide(None) and averaging the results. However, doing this in my implementation always returns the same prediction for the same input points. Since we are sampling from the (approximate) posterior, shouldn't there at least be some random jitter in the predictions? Otherwise, what is the point of sampling?
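To make my expectation concrete, here is the kind of sanity check I have in mind (a rough sketch using the guide and X_test defined in the code below), where I would expect two independent draws from the guide to give at least slightly different outputs:

# two independent draws from the guide should correspond to two different
# weight/bias samples, so their predictions on the same inputs should differ
reg_a = guide(None, None)
reg_b = guide(None, None)
print(torch.max(torch.abs(reg_a(X_test) - reg_b(X_test))).item())  # I expect this to be > 0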
Here is my implementation:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

import pyro
from pyro.distributions import Normal
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam

# build_linear_dataset is the helper from the Bayesian regression tutorial

def model(X, y):
    # create a wide normal prior over the bias
    bias_loc = torch.zeros(1)
    bias_scale = 100 * torch.ones(1)
    b_prior = Normal(bias_loc, bias_scale).independent(1)
    # create wide normal priors over the weights
    loc = torch.zeros(1, X.shape[1])
    scale = 100 * torch.ones(1, X.shape[1])
    w_prior = Normal(loc, scale).independent(2)
    # place the priors into a dictionary keyed by parameter name
    priors = {'linear.weight': w_prior, 'linear.bias': b_prior}
    # lift module parameters to random variables sampled from the priors
    lifted_module = pyro.random_module("module", regression_model, priors)
    # sample a regressor (which also samples w and b)
    lifted_reg_model = lifted_module()
    with pyro.iarange("map", X.shape[0]):
        # run the regressor forward conditioned on data
        prediction_mean = lifted_reg_model(X).squeeze(-1)
        # condition on the observed data with unit observation noise
        pyro.sample("obs",
                    Normal(prediction_mean, 1.0),
                    obs=y.squeeze())
def guide(X, y):
    # get the shape of the data (N = samples, p = features)
    if X is None:
        p = P
    else:
        N, p = X.size()
    softplus = torch.nn.Softplus()
    # define our variational parameters
    w_loc = torch.randn(1, p, requires_grad=True)
    b_loc = torch.randn(1, requires_grad=True)
    # note that we initialize our scales to be pretty narrow
    w_log_sig = torch.tensor(-3.0 + 0.05 * torch.randn(1, p), requires_grad=True)
    b_log_sig = torch.tensor(-3.0 + 0.05 * torch.randn(1), requires_grad=True)
    # register learnable params in the param store
    mw_param = pyro.param("guide_mean_weight", w_loc)
    sw_param = softplus(pyro.param("guide_log_scale_weight", w_log_sig))
    mb_param = pyro.param("guide_mean_bias", b_loc)
    sb_param = softplus(pyro.param("guide_log_scale_bias", b_log_sig))
    # guide distributions for w and b
    w_dist = Normal(mw_param, sw_param).independent(2)
    b_dist = Normal(mb_param, sb_param).independent(1)
    dists = {'linear.weight': w_dist, 'linear.bias': b_dist}
    # overload the parameters in the module with random samples
    # from the guide distributions
    lifted_module = pyro.random_module("module", regression_model, dists)
    # sample a regressor (which also samples w and b)
    return lifted_module()
P = 5
LR = 0.005
H = 1
N = 10000
num_iterations = 10000
pyro.clear_param_store()
optim = Adam({"lr": LR})
svi = SVI(model, guide, optim, loss=Trace_ELBO())
regression_model = torch.nn.Sequential(torch.nn.Linear(P, 1))
def main():
    losses = []
    pyro.clear_param_store()
    X, y = build_linear_dataset(N, P)
    for j in range(num_iterations):
        # calculate the loss and take a gradient step
        loss = svi.step(X, y)
        losses.append(loss / float(N))
        if (j + 1) % 100 == 0:
            print("[iteration %04d] loss: %.4f" % (j + 1, loss / float(N)))
    # plot the losses
    plt.plot(losses)
    plt.title("ELBO")
    plt.xlabel("step")
    plt.ylabel("loss")

if __name__ == '__main__':
    main()
N = 200
P = 5
X_test, y_test = build_linear_dataset(N, P)
loss = nn.MSELoss()
y_preds = torch.zeros(N, 1)
for i in range(10):
    # the guide does not require the data
    sampled_reg_model = guide(None, None)
    y_preds = y_preds + sampled_reg_model(X_test)
# take the average of the predictions
y_preds = y_preds / 10
print("Validation Loss: ", loss(y_preds, y_test).item())