I implemented a simple HMM: the emission distribution maps a discrete hidden label to a word, and the transition distribution maps one hidden label to the next. But I am getting negative losses during training, and I am not sure whether this is normal or correct.
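To make the structure explicit, the generative process I have in mind is the following (x_t is the hidden label at position t, w_t is the observed word, and probs_x / probs_w are the transition and emission tables drawn from the Dirichlet priors in the code):

    x_t ~ Categorical(probs_x[x_{t-1}])    # transition between hidden labels
    w_t ~ Categorical(probs_w[x_t])        # emission from hidden label to word

Please find the code below: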
import torch
import pyro
import pyro.distributions as dist
from pyro import poutine


def model(tokens, sequences, lengths, args, batch_size=None, include_prior=True):
    num_sequences, max_length, nannotators = map(int, sequences.shape)
    vocabulary_size = tokens.max().item()
    # Dirichlet priors over the transition table (probs_x) and the emission table (probs_w);
    # args.hidden_dim is the number of hidden labels.
    with poutine.mask(mask=include_prior):
        probs_x = pyro.sample("probs_x",
                              dist.Dirichlet(0.9 * torch.eye(args.hidden_dim) + 0.1).to_event(1))
        probs_w = pyro.sample("probs_w",
                              dist.Dirichlet(torch.ones(args.hidden_dim, vocabulary_size + 1)).to_event(1))
    with pyro.plate("sequences", num_sequences, batch_size) as batch:
        lengths = lengths[batch]
        x = 0
        for t in pyro.markov(range(lengths.max())):
            with poutine.mask(mask=(t < lengths)):
                # Transition: next hidden label given the previous one (enumerated in parallel).
                x = pyro.sample("x_{}".format(t),
                                dist.Categorical(probs_x[x]),
                                infer={"enumerate": "parallel"})
                # Map the -1 padding values to a valid index so Categorical.log_prob does not fail;
                # those positions fall outside (t < lengths) and are masked out.
                valid_data = tokens[batch, t].clone()
                valid_data[valid_data == -1] = 0
                # Emission: observed word given the current hidden label.
                w = pyro.sample("w_{}".format(t),
                                dist.Categorical(probs_w[x]),
                                obs=valid_data)
The loss at each iteration:
| step | loss |
|---|---|
| 0 | -947.244384765625 |
| 1 | -952.4456787109375 |
| 2 | -957.525146484375 |
| 3 | -962.5042114257812 |
| 4 | -967.3865966796875 |
| 5 | -972.170166015625 |
The guide and training setup:
from pyro.infer import SVI, TraceEnum_ELBO
from pyro.infer.autoguide import AutoDelta
from pyro.optim import Adam
# AutoDelta gives MAP point estimates for probs_x / probs_w; the discrete x_t are enumerated out.
guide = AutoDelta(poutine.block(model, expose_fn=lambda msg: msg["name"].startswith("probs_")))
elbo = TraceEnum_ELBO()
optim = Adam({'lr': args.learning_rate})
svi = SVI(model, guide, optim, elbo)
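The loss values above come from a standard SVI loop along these lines (a minimal sketch; num_steps and the batch_size argument are placeholders rather than my exact settings):

    pyro.clear_param_store()
    num_steps = 100  # placeholder number of SVI steps
    for step in range(num_steps):
        # svi.step forwards its arguments to model() and guide() and returns the loss,
        # which for the Trace*_ELBO objectives is an estimate of -ELBO.
        loss = svi.step(tokens, sequences, lengths, args, batch_size=None)
        print("| {} | {} |".format(step, loss))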