I implemented a simple HMM in Pyro: the emission distribution maps a discrete hidden label to a word, and the transition distribution maps one hidden label to the next. However, I am getting negative losses during training, and I am not sure whether this is normal or correct. Please find the code below:
import torch
import pyro
import pyro.distributions as dist
from pyro import poutine


def model(tokens, sequences, lengths, args, batch_size=None, include_prior=True):
    num_sequences, max_length, nannotators = map(int, sequences.shape)
    # tokens are 0-indexed, so the vocabulary spans indices 0..tokens.max()
    vocabulary_size = tokens.max().item()
    with poutine.mask(mask=include_prior):
        # probs_x: transition matrix between args.hidden_dim hidden labels
        probs_x = pyro.sample("probs_x",
                              dist.Dirichlet(0.9 * torch.eye(args.hidden_dim) + 0.1).to_event(1))
        # probs_w: emission matrix from each hidden label to a word
        probs_w = pyro.sample("probs_w",
                              dist.Dirichlet(torch.ones(args.hidden_dim, vocabulary_size + 1)).to_event(1))
        with pyro.plate("sequences", num_sequences, batch_size) as batch:
            lengths = lengths[batch]
            x = 0
            for t in pyro.markov(range(lengths.max())):
                # mask out time steps beyond each sequence's true length
                with poutine.mask(mask=(t < lengths)):
                    x = pyro.sample("x_{}".format(t),
                                    dist.Categorical(probs_x[x]),
                                    infer={"enumerate": "parallel"})
                    # replace -1 padding with a valid index; these positions
                    # are already masked out above
                    valid_data = tokens[batch, t].clone()
                    valid_data[valid_data == -1] = 0
                    w = pyro.sample("w_{}".format(t),
                                    dist.Categorical(probs_w[x]),
                                    obs=valid_data)
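For context, the inputs look roughly like this; the toy sizes and the `args` namespace below are placeholders, made up only to show the shapes the model expects:

import argparse
import torch

# Hypothetical toy sizes, only to illustrate the expected shapes.
num_sequences, max_length, nannotators = 10, 20, 3
vocab_size = 50

args = argparse.Namespace(hidden_dim=4, learning_rate=0.05)
sequences = torch.zeros(num_sequences, max_length, nannotators)     # (num_sequences, max_length, nannotators)
lengths = torch.randint(5, max_length + 1, (num_sequences,))        # true length of each sequence
tokens = torch.randint(0, vocab_size, (num_sequences, max_length))  # word indices per sequence position
for i, l in enumerate(lengths):
    tokens[i, l:] = -1                                              # -1 marks padding, as the model assumes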
The loss at each iteration:
| iteration | loss               |
|-----------|--------------------|
| 0         | -947.244384765625  |
| 1         | -952.4456787109375 |
| 2         | -957.525146484375  |
| 3         | -962.5042114257812 |
| 4         | -967.3865966796875 |
| 5         | -972.170166015625  |
The guide and SVI setup used for training:
from pyro.infer import SVI, TraceEnum_ELBO
from pyro.infer.autoguide import AutoDelta
from pyro.optim import Adam

guide = AutoDelta(poutine.block(model, expose_fn=lambda msg: msg["name"].startswith("probs_")))
elbo = TraceEnum_ELBO()
optim = Adam({'lr': args.learning_rate})
svi = SVI(model, guide, optim, elbo)
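The per-iteration numbers above come from a standard SVI loop along these lines (the step count and print format here are illustrative, not the exact script):

pyro.clear_param_store()

for step in range(6):
    # svi.step returns the loss, i.e. an estimate of the negative ELBO
    loss = svi.step(tokens, sequences, lengths, args)
    print("|{:5d}|{}|".format(step, loss))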