I am trying to fit a simple 2-part mixture model in which one component is a normal distribution and the other is a point mass at 0:

Y \sim p * N(\mu, 1) + (1-p)*\delta_0

Where \delta_0 is the point mass at 0 and \mu will be non-zero. I had read that this can be achieved through the MaskedMixture distribution:

```
import torch
import pyro
import numpy as np
import pyro.distributions as dist
from functools import partial, reduce
from torch.distributions import constraints
from pyro.infer import SVI, Trace_ELBO, Predictive
num_iter = 5000
lr0 = 1
gamma = 0.1
lrd = gamma ** (1/num_iter)
optim = pyro.optim.ClippedAdam({'lr': lr0, 'lrd': lrd, 'betas': (0.95, 0.999)})
np.random.seed(37)
dat = torch.tensor(np.array(list(np.zeros(300)) + list(np.random.normal(loc=2., size=700))))
def model(dat=None):
if dat is None:
N = 1000
else:
N = dat.shape[0]
p = pyro.sample('p', dist.Beta(2., 2.))
mu = pyro.sample('mu', dist.Normal(0., 1.))
delta = dist.Delta(torch.tensor(0.))
normal = dist.Normal(mu, 1.)
with pyro.plate('dat', N):
z = pyro.sample('z', dist.RelaxedBernoulliStraightThrough(temperature=torch.tensor(1.), probs=p)).bool()
x = pyro.sample('obs', dist.MaskedMixture(z, normal, delta), obs=dat)
return x
def guide(dat):
a = pyro.param('a', torch.tensor(2.), constraint=constraints.positive)
b = pyro.param('b', torch.tensor(2.), constraint=constraints.positive)
p = pyro.sample('p', dist.Beta(a, b))
mu_loc = pyro.param('mu_loc', torch.tensor(0.))
mu_scale = pyro.param('mu_scale', torch.tensor(0.1), constraint=constraints.positive)
mu = pyro.sample('mu', dist.Normal(mu_loc, mu_scale))
with pyro.plate('dat', dat.shape[0]):
z = pyro.sample('z', dist.RelaxedBernoulliStraightThrough(temperature=torch.tensor(1.), probs=p))
```

I have verified that I can sample from the prior and get an equal mixture of exactly 0 and non-zero samples. I am using SVI for inference:

```
losses = []
pyro.clear_param_store()
svi = SVI(model, guide, optim, loss=Trace_ELBO())
for j in range(num_iter):
loss = svi.step(dat)
losses.append(loss)
```

However, 2 things are weird and I don’t understand them. The first is that all of the losses are `inf`

. The other is that while the mass of the posterior of the Normal component does seem to shift far from 0 (it is 2 above), the posterior for the mixing coefficient (which is 0.3 above) does not seem to be updating and looks similar to the prior:

```
pred = Predictive(model, guide=guide, num_samples=1000,
return_sites=['p', 'mu'])
samples = pred(dat)
[samples['p'].mean(), samples['p'].std(), samples['mu'].mean(), samples['mu'].std()]
## [tensor(0.5083), tensor(0.2275), tensor(1.3934), tensor(0.0420)]
```

That is, the prior Beta(2, 2) has mean 0.5 and standard deviation ~0.23, which are the first 2 values.