Thank you again for your reply. I am trying to read the documentation and understand the code as much as I can, and I can understand all the math parts of the model descriptions. But I am struggling with how to implement my model in a Python code, because I just started learning Python couple months ago, and I haven’t done much Bayesian computing before. This task is (probably too much of) a big jump ahead for me, but I am in a need to finish this project. I thank you for your patience.

I tried to modify my code as the below. If you don’t mind, could you please take a look at my code and suggest any fix if needed?

In particular, I am wondering whether I have done the enumeration right for my discrete latent variable `y`

(every parameter for my model is continuous except `y`

, which has `Multinomial`

distribution);

Thank you very much once again for your help.

```
model = RobertaForMultipleChoice.from_pretrained('roberta-large')
module.to_pyro_module_(model)
model.roberta._dummy_param =
nn.Parameter(torch.tensor(0.).to(dtype=model.dtype, device=model.device))
# Now we can attempt to be fully Bayesian:
for m in model.modules():
for name, value in list(m.named_parameters(recurse=False)):
if name != "_dummy_param":
setattr(m, name, module.PyroSample(prior=dist.Normal(0, 1)
.expand(value.shape)
.to_event(value.dim())))
# add likelihood function to the exisiting frequentist Transformer model.
class MyModel(PyroModule):
def __init__(self, model, name=""):
self._pyro_name = name
self._pyro_context = pyro.nn.module._Context()
self._pyro_params = model.parameters()
self._modules = model.modules()
super(MyModel, self).__init__()
def forward(self, model, input_ids, attention_mask, mc_labels = None):
# retrieve prediction_scores (y)
if mc_labels != None:
prediction_scores = model(input_ids=input_ids,
attention_mask=attention_mask,
mc_labels=mc_labels)[2]
softmax_tensor = nn.Softmax(dim=-1)(prediction_scores)
if mc_labels == torch.tensor([0]):
mc_label_tensor = torch.tensor([[1.,0.,0.,0.]])
elif mc_labels == torch.tensor([1]):
mc_label_tensor = torch.tensor([[0.,1.,0.,0.]])
elif mc_labels == torch.tensor([2]):
mc_label_tensor = torch.tensor([[0.,0.,1.,0.]])
elif mc_labels == torch.tensor([3]):
mc_label_tensor = torch.tensor([[0.,0.,0.,1.]])
else:
prediction_scores = model(input_ids=input_ids,
attention_mask=attention_mask)[1]
softmax_tensor = nn.Softmax(dim=-1)(prediction_scores)
# for each data, y (prediction scores) has 4 classes.
# Hence the multinomial distribution with total_size =1 and
# prob= nn.softmax(prediction_scores)
pyro.sample('y',
dist.Multinomial(1, probs = softmax_tensor),
obs = mc_label_tensor)
return prediction_scores
### ERROR OCCURS HERE
my_model = MyModel(model)
# define guide
guide = guides.AutoDiagonalNormal(poutine.block(my_model, hide = ['y']))
# parameters for training
optimizer = Adam({"lr": 0.000005200})
scheduler = pyro.optim.StepLR({'optimizer': optimizer_3,
'optim_args': {'lr': 0.000005200}})
svi = SVI(my_model, guide, optimizer, loss=TraceEnum_ELBO(max_plate_nesting=0))
# initialize the best_guide
best_guide = None
best_svi_loss = float("inf")
# turn on a training mode
my_model.train()
for i in range(epoch):
for i in range(num_iter):
# initialize total_loss to 0
total_svi_loss = 0
# calculate the loss and take a gradient step
svi_loss = svi.step(model, input_ids = input,
attention_mask = attention_mask,
labels = labels)
# update the with the calculated loss
total_svi_loss = total_svi_loss + svi_loss
if m % log_interval == 0 and m > 0:
cur_svi_loss = total_svi_loss / log_interval
print('| epoch {:3d} | loss {:5.4f} |'.format(
epoch,cur_svi_loss ))
total_svi_loss = 0
if cur_svi_loss < best_svi_loss:
best_val_loss = cur_svi_loss
best_guide = guide
### MAKING PREDICTIONS
# Turn on the evaluation mode
my_model.eval()
# calculate prediction scores
pred_obj = Predictive(my_model, guide=best_guide, num_samples = 100)
prediction_scores = pred_obj.call(model, input_ids=test_input,
attention_mask = attention_mask_test).detach()
```