More adventures with poutine.nan_mask

Hi, back again with a nan_mask question. This time I’m seeing unexpected SVI behavior for instances whose descendant observations are all nan’d out.

Toy model is below. Everything is drawn from a Beta distribution. C tracks the mean of A and B; D tracks C.

The simplest case showing the issue is num_BC = 2 with all D_1j observed as nan, e.g.
D = [[0.9, 0.9, 0.9, ..., 0.9], [nan, nan, nan, ..., nan]]. In that case A, B_0, and C_0 have descendant observations at 0.9, but B_1 and C_1 have only nan descendant observations. C_1 still has an indirect connection to the non-nan observations via C_1 <- A -> C_0 -> D_0j.

B_1, however, is d-separated due to the v-structure, so I would expect SVI to just draw it from the prior. I’ve confirmed this is the case when A is removed from the model (see the sketch after the MWE below). But when A is present, the prediction for B_1 clearly deviates from the prior mean of ≈0.77 (a Beta(α, β) has mean α / (α + β), so Beta(1, 0.3) gives 1 / 1.3 ≈ 0.77), as below. Note that this happens in the opposite direction if the prior is flipped to Beta(0.3, 1.0): then B_1 deviates below the prior mean of ≈0.23. And if the prior is uniform at Beta(1.0, 1.0), then B_1 stays around the expected 0.5…

A:  [0.8]
B:  [0.89, 0.89]
C:  [0.86, 0.84]

Any idea why? The real model is much larger and more complex, and it would be much better not to have to filter out instances that may be missing all of their descendant observations, and instead just be able to assume any such predictions are drawn from the prior.

MWE

Note: nan observations are handled as fixed in my previous post: mask out the nan entries with poutine.mask and replace the nans with a dummy value so the loss doesn’t go nan.

import pyro
import torch
import pyro.distributions as dist
import torch.distributions.constraints as constraints
from typing import List

num_BC = 2
num_D = 100


def model(obs: torch.Tensor, nan_mask: torch.Tensor):

    A = pyro.sample('A', dist.Beta(1, .3))

    with pyro.plate('BC_plate', size=num_BC, dim=-1):
        B = pyro.sample('B', dist.Beta(1, .3))

        parent_values = torch.stack((B, A.expand(B.shape)))
        combined_parent_values = torch.mean(parent_values, dim=0)
        C = pyro.sample('C', dist.Beta(10 * combined_parent_values, 10 * (1 - combined_parent_values)))

        with pyro.plate('D_plate', size=num_D, dim=-2):
            C_expanded = C.expand(num_D, num_BC)
            with pyro.poutine.mask(mask=nan_mask):
                D = pyro.sample('D', dist.Beta(10 * C_expanded, 10 * (1 - C_expanded)), obs=obs)


def guide(obs: torch.Tensor, nan_mask: torch.Tensor):

    A_concentrations = get_concentration_params('A', 1)
    pyro.sample('A', dist.Beta(*A_concentrations))

    with pyro.plate('BC_plate', size=num_BC, dim=-1):
        B_concentrations = get_concentration_params('B', num_BC)
        pyro.sample('B', dist.Beta(*B_concentrations))

        C_concentrations = get_concentration_params('C', num_BC)
        pyro.sample('C', dist.Beta(*C_concentrations))


def get_concentration_params(site_name: str, size: int) -> List[torch.Tensor]:
    # Beta distribution concentration1 and concentration0 parameters
    return [
            pyro.param(f'{site_name}_concentration{i}', torch.ones(size), constraint=constraints.greater_than(0.1))
            for i in reversed(range(2))
        ]


def main():

    obs = torch.full((num_D, num_BC), 0.9)
    obs[:, -1] = float('nan')

    nan_mask = ~torch.isnan(obs)
    obs[torch.isnan(obs)] = 0.4242  # to avoid nan loss

    svi = pyro.infer.SVI(
        model,
        guide,
        pyro.optim.Adam({"lr": 0.1, "betas": (0.95, 0.999)}),
        loss=pyro.infer.Trace_ELBO(),
    )

    for _ in range(10000):
        svi.step(obs, nan_mask)

    param_store = pyro.get_param_store()

    for site_id in ['A', 'B', 'C']:
        concentration1 = param_store[f'{site_id}_concentration1']
        concentration0 = param_store[f'{site_id}_concentration0']
        beta_mean = concentration1 / (concentration1 + concentration0)
        print(f'{site_id}: ', [round(float(v), 2) for v in beta_mean])


if __name__ == '__main__':
    main()
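
For reference, the A-removed variant I used to confirm B_1 stays at the prior looks something like this (a sketch; the guide is the same minus the 'A' site):

def model_no_A(obs: torch.Tensor, nan_mask: torch.Tensor):
    # Same as above, but without A: C depends on B alone, so once the D_1j
    # are masked out, B_1 has no path at all to the non-nan observations.
    with pyro.plate('BC_plate', size=num_BC, dim=-1):
        B = pyro.sample('B', dist.Beta(1, .3))
        C = pyro.sample('C', dist.Beta(10 * B, 10 * (1 - B)))

        with pyro.plate('D_plate', size=num_D, dim=-2):
            C_expanded = C.expand(num_D, num_BC)
            with pyro.poutine.mask(mask=nan_mask):
                pyro.sample('D', dist.Beta(10 * C_expanded, 10 * (1 - C_expanded)), obs=obs)

With this variant the learned mean for B_1 sits at the prior, as expected.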

I thought maybe Pyro was doing something weird with the descendant of the v-structure being partially observed as a single sample site, but I just confirmed the same thing happens with for loops and separate sample sites instead of plates. So it still seems to be something with the nan mask.

MWE with no plates

import pyro
import torch
import pyro.distributions as dist
import torch.distributions.constraints as constraints
from typing import List

num_BC = 2
num_D = 100


def model(obs: torch.Tensor, nan_mask: torch.Tensor):

    A = pyro.sample('A', dist.Beta(1, .3))

    for i in range(num_BC):
        B = pyro.sample(f'B_{i}', dist.Beta(1, .3))

        combined_parent_values = (A + B) / 2
        C = pyro.sample(f'C_{i}', dist.Beta(10 * combined_parent_values, 10 * (1 - combined_parent_values)))

        for j in range(num_D):
            with pyro.poutine.mask(mask=nan_mask[j, i]):
                D = pyro.sample(f'D_{i}_{j}', dist.Beta(10 * C, 10 * (1 - C)), obs=obs[j, i])


def guide(obs: torch.Tensor, nan_mask: torch.Tensor):

    A_concentrations = get_concentration_params('A_0', 1)
    pyro.sample('A', dist.Beta(*A_concentrations))

    for i in range(num_BC):
        B_concentrations = get_concentration_params(f'B_{i}', 1)
        pyro.sample(f'B_{i}', dist.Beta(*B_concentrations))

        C_concentrations = get_concentration_params(f'C_{i}', 1)
        pyro.sample(f'C_{i}', dist.Beta(*C_concentrations))


def get_concentration_params(site_name: str, size: int) -> List[torch.Tensor]:
    # Beta distribution concentration1 and concentration0 parameters
    return [
            pyro.param(f'{site_name}_concentration{i}', torch.ones(size), constraint=constraints.greater_than(0.1))
            for i in reversed(range(2))
        ]


def main():

    obs = torch.full((num_D, num_BC), 0.9)
    obs[:, -1] = float('nan')

    nan_mask = ~torch.isnan(obs)
    obs[torch.isnan(obs)] = 0.4242  # to avoid nan loss

    svi = pyro.infer.SVI(
        model,
        guide,
        pyro.optim.Adam({"lr": 0.1, "betas": (0.95, 0.999)}),
        loss=pyro.infer.Trace_ELBO(),
    )

    import tqdm
    for _ in tqdm.tqdm(range(10000)):
        svi.step(obs, nan_mask)

    param_store = pyro.get_param_store()

    for site_id, num in {'A': 1, 'B': num_BC, 'C': num_BC}.items():
        beta_means = []
        for i in range(num):
            concentration1 = param_store[f'{site_id}_{i}_concentration1']
            concentration0 = param_store[f'{site_id}_{i}_concentration0']
            beta_means.append(concentration1 / (concentration1 + concentration0))
        print(f'{site_id}: ', [round(float(v), 2) for v in beta_means])


if __name__ == '__main__':
    main()

Have you tried initializing everything at the prior? Are you sure this isn’t an optimization problem (as opposed to an incorrect ELBO / ELBO gradient)?

Aha, good suggestion. Interestingly, it turned out that setting the init to the prior didn’t matter; the issue was the init magnitude being set too small.

E.g. in the MWE above the init is Beta(1.0, 1.0), but if I set it to Beta(10.0, 10.0) then B_1 is drawn from the prior regardless of the prior setting.
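
Concretely, the only change is the init value in get_concentration_params, something like this (a sketch; initializing at the prior’s own concentrations, e.g. (1.0, 0.3), did not help, so it really is the magnitude that matters):

def get_concentration_params(site_name: str, size: int) -> List[torch.Tensor]:
    # Beta concentration1 and concentration0 parameters, initialized at 10.0
    # instead of 1.0: Beta(10, 10) has the same mean (0.5) as Beta(1, 1) but
    # is far more concentrated around it.
    return [
            pyro.param(f'{site_name}_concentration{i}', torch.full((size,), 10.0),
                       constraint=constraints.greater_than(0.1))
            for i in reversed(range(2))
        ]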

Not sure why the init magnitude affects the optimization, but so be it. Thanks!