Multiple coins on coin fairness example

Hi there, I’m trying to generalize the coin fairness example from the tutorial to the case of many coins from a single mint, each coin being flipped 100 times. However, I’m unable to recover the parameters (alpha, beta) of the mint’s Beta distribution (the distribution that generates each coin’s fairness).

It seems that the alpha/beta ratio is correct, but alpha and beta themselves seem to grow without bound.
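
To illustrate the symptom: two Beta distributions with the same alpha/beta ratio share the same mean fairness but differ in concentration, so matching the mean alone does not pin alpha and beta down individually. A quick check with scipy:

    from scipy import stats

    # same alpha/beta ratio (4:1) => same mean fairness, but very different spread
    for a, b in [(20, 5), (200, 50), (2000, 500)]:
        print(a, b, "mean:", stats.beta.mean(a, b), "std:", stats.beta.std(a, b))
    # the mean stays at 0.8 while the std shrinks as alpha + beta grows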

Three extra questions: is my way of handling subsampling OK? What about the two iaranges inside the model function? And is latent_fairness necessary inside the guide, or do I even need a local parameter for each coin in the guide?

    import math
    import os
    import torch
    import torch.distributions.constraints as constraints
    import pyro
    from pyro.optim import Adam, Adamax
    from pyro.infer import SVI, Trace_ELBO
    import pyro.distributions as dist
    from scipy import stats
    import numpy as np
    from torch.utils import data

    # this is for running the notebook in our testing framework
    smoke_test = ('CI' in os.environ)
    n_steps = 2 if smoke_test else 2000

    # enable validation (e.g. validate parameters of distributions)
    pyro.enable_validation(True)

    # clear the param store in case we're in a REPL
    pyro.clear_param_store()

    # create synthetic data: 1,000,000 coins drawn from a Beta(20, 5) mint, each flipped 100 times
    np.random.seed(10)
    dataset = stats.beta.rvs(20, 5, size=1_000_000)
    dataset = [stats.bernoulli.rvs(dataset, size=1_000_000)
        for x in range(100)]
    dataset = np.array(dataset).T
    dataset = torch.FloatTensor(dataset)

    def model(data, total_data_len):
        loc_prior_1 = torch.tensor(12.0).cuda()
        loc_prior_2 = torch.tensor(7.0).cuda()
        scale_prior_1 = torch.tensor(5.0).cuda()
        scale_prior_2 = torch.tensor(5.0).cuda()

        alpha = torch.nn.functional.softplus(pyro.sample("alpha", dist.Normal(loc_prior_1, scale_prior_1)))
        beta = torch.nn.functional.softplus(pyro.sample("beta", dist.Normal(loc_prior_2, scale_prior_2)))

        # outer iarange: coins are conditionally independent given (alpha, beta),
        # with a mini-batch of len(data) coins subsampled out of total_data_len
        with pyro.iarange('observed_data_inter_coins', total_data_len, len(data)):
            f = pyro.sample("latent_fairness",
                dist.Beta(alpha, beta).expand_by([len(data)]))
            # inner iarange: the flips of each coin are conditionally independent given its fairness
            with pyro.iarange('observed_data_intra_coin'):
                pyro.sample('obs', dist.Bernoulli(f),
                    obs=data.transpose(0, 1))

    def guide(data, total_data_len):
        loc1 = pyro.param("loc1", torch.tensor(12.0).cuda())
        loc2 = pyro.param("loc2", torch.tensor(7.0).cuda())

        scale1 = pyro.param("scale1", torch.tensor(5.0).cuda(),
                            constraint=constraints.positive)
        scale2 = pyro.param("scale2", torch.tensor(5.0).cuda(),
                            constraint=constraints.positive)
        alpha = torch.nn.functional.softplus(pyro.sample("alpha", dist.Normal(loc1, scale1)))
        beta = torch.nn.functional.softplus(pyro.sample("beta", dist.Normal(loc2, scale2)))

        #with pyro.iarange('observed_data_inter_coins', total_data_len, len(data)):
        #    f = pyro.sample("latent_fairness",
        #        dist.Beta(alpha, beta).expand_by([len(data)]))

    # setup the optimizer
    adam_params = {"lr": 0.5, "betas": (0.90, 0.999)}
    optimizer = Adam(adam_params)

    # setup the inference algorithm
    svi = SVI(model, guide, optimizer, loss=Trace_ELBO())

    # do gradient steps

    for step in range(100):
        if step == 10:
            adam_params = {"lr": 0.05, "betas": (0.90, 0.999)}
            optimizer = Adam(adam_params)
        if step == 50:
            adam_params = {"lr": 0.005, "betas": (0.90, 0.999)}
            optimizer = Adam(adam_params)
        sampler = data.DataLoader(dataset, batch_size=100_000, shuffle=True,
            drop_last=True, pin_memory=True, num_workers=1)
        for databat in sampler:
            svi.step(databat.cuda(async=True), len(dataset))
        #print('.', end='', flush=True)
        print('Status of step', step)
        # grab the learned variational parameters
        print(pyro.param("loc1").item())
        print(pyro.param("loc2").item())
        print(">>", pyro.param("loc1").item() / pyro.param("loc2").item())
        print(pyro.param("scale1").item())
        print(pyro.param("scale2").item(), flush=True)

Can you try again using PyTorch 0.4.0? We’ve encountered many .expand() bugs in PyTorch 0.4.1.

Sorry for the delay. I’ve had some trouble with shared libraries when downgrading to PyTorch 0.4 using conda…

So, I tried PyTorch 0.4, but it seems that it did not fix the problem. Maybe this is a case where the variance of the estimator is huge?
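
For reference, Trace_ELBO accepts a num_particles argument that averages the ELBO (and its gradient) estimate over several samples per step, which can reduce estimator variance; a minimal sketch against the same model and guide:

    # average the ELBO estimate over 10 samples per step to reduce gradient variance
    svi = SVI(model, guide, optimizer, loss=Trace_ELBO(num_particles=10))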

One question: does Pyro store information across svi.step calls other than the parameters? E.g., does it store anything related to pyro.sample?
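
(For reference: apart from the per-parameter optimizer state kept inside pyro.optim, the persistent state I’m aware of is the global param store that pyro.param writes to; it can be inspected directly between steps, e.g.:)

    # list everything currently held in Pyro's global param store
    param_store = pyro.get_param_store()
    for name in param_store.get_all_param_names():
        print(name, pyro.param(name).shape)
    # pyro.clear_param_store() wipes it, as done at the top of the script

The updated version of the script (now with Gamma priors on alpha and beta) follows: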

    import math
    import os
    import torch
    import torch.distributions.constraints as constraints
    import pyro
    from pyro.optim import Adam, Adamax
    from pyro.infer import SVI, Trace_ELBO
    import pyro.distributions as dist
    from scipy import stats
    import numpy as np
    from torch.utils import data

    # this is for running the notebook in our testing framework
    smoke_test = ('CI' in os.environ)
    n_steps = 2 if smoke_test else 2000

    # enable validation (e.g. validate parameters of distributions)
    pyro.enable_validation(True)

    # clear the param store in case we're in a REPL
    pyro.clear_param_store()

    # create synthetic data: 100,000 coins drawn from a Beta(20, 5) mint, each flipped 100 times
    np.random.seed(10)
    dataset = stats.beta.rvs(20, 5, size=100_000)
    dataset = [stats.bernoulli.rvs(dataset, size=100_000)
        for x in range(100)]
    dataset = np.array(dataset).T
    dataset = torch.FloatTensor(dataset)
    dataset = torch.Tensor.contiguous(dataset)

    def model(data, len_dataset):
        shape_prior_1 = torch.tensor(1.0).cuda()
        shape_prior_2 = torch.tensor(1.0).cuda()
        rate_prior_1 = torch.tensor(0.5).cuda()
        rate_prior_2 = torch.tensor(1.0).cuda()

        # the global mint parameters now get Gamma priors (instead of softplus-transformed Normals)
        alpha = pyro.sample("alpha", dist.Gamma(shape_prior_1, rate_prior_1))
        beta = pyro.sample("beta", dist.Gamma(shape_prior_2, rate_prior_2))

        with pyro.iarange('observed_data_inter_coins', len_dataset, data.size(1), use_cuda=True):
            f = pyro.sample("latent_fairness",
                dist.Beta(alpha, beta).expand_by([data.size(1)]))
            with pyro.iarange('observed_data_intra_coin', use_cuda=True):
                pyro.sample('obs', dist.Bernoulli(f).expand([data.size(0), data.size(1)]), obs=data)

    # from pyro.infer.mcmc import MCMC, HMC, NUTS
    # from pyro.infer import EmpiricalMarginal
    # nuts_kernel = NUTS(model, adapt_step_size=True)
    # mcmc_run = MCMC(nuts_kernel, num_samples=500, warmup_steps=300).run(dataset.transpose(0,1).contiguous())
    # posterior = EmpiricalMarginal(mcmc_run, ['alpha', 'beta'])
    # posterior.mean

    def guide(data, len_dataset):
        shape1 = pyro.param("shape1", torch.tensor(1.0).cuda(),
                            constraint=constraints.positive)
        shape2 = pyro.param("shape2", torch.tensor(1.0).cuda(),
                            constraint=constraints.positive)

        rate1 = pyro.param("rate1", torch.tensor(0.5).cuda(),
                            constraint=constraints.positive)
        rate2 = pyro.param("rate2", torch.tensor(1.0).cuda(),
                            constraint=constraints.positive)

        alpha = pyro.sample("alpha", dist.Gamma(shape1, rate1))
        beta = pyro.sample("beta", dist.Gamma(shape2, rate2))

        with pyro.iarange('observed_data_inter_coins', len_dataset, data.size(1), use_cuda=True):
            f = pyro.sample("latent_fairness",
                dist.Beta(alpha, beta).expand_by([data.size(1)]))

    # setup the optimizer
    adam_params = {"lr": 0.5, "betas": (0.90, 0.999)}
    optimizer = Adam(adam_params)

    # setup the inference algorithm
    svi = SVI(model, guide, optimizer, loss=Trace_ELBO())

    # do gradient steps
    def collate_fn(x):
        # transpose each mini-batch so that rows index flips and columns index coins
        x = data.dataloader.default_collate(x)
        x = x.transpose(0, 1)
        x = torch.Tensor.contiguous(x)
        return x

    for step in range(200):
        if step == 10:
            adam_params = {"lr": 0.05, "betas": (0.90, 0.999)}
            optimizer = Adam(adam_params)

        if step == 50:
            adam_params = {"lr": 0.005, "betas": (0.90, 0.999)}
            optimizer = Adam(adam_params)

        if step == 100:
            adam_params = {"lr": 0.0005, "betas": (0.90, 0.999)}
            optimizer = Adam(adam_params)

        sampler = data.DataLoader(dataset, batch_size=10_000, shuffle=True,
            drop_last=True, pin_memory=True, num_workers=1,
            collate_fn=collate_fn)
        for databat in sampler:
            svi.step(databat.cuda(async=True), len(dataset))

        #svi.step(databat)
        #print('.', end='', flush=True)
        print('Status of step', step)
        # grab the learned variational parameters
        shape1 = pyro.param("shape1").item()
        shape2 = pyro.param("shape2").item()
        rate1 = pyro.param("rate1").item()
        rate2 = pyro.param("rate2").item()

        print("mean of alpha", shape1 / rate1)
        print("mean of beta", shape2 / rate2)
        print("quotient of means", shape1 * rate2 / (rate1 * shape2))
        print("var of alpha", shape1 / rate1 ** 2)
        print("var of beta", shape2 / rate2 ** 2)