How can you identify the mean and standard deviation parameters of the underlying distributions if you only know the sum of the drawn values?
Assume you have random numbers composed of several contributors:

x_i = y_i + a_0^{(i)} z_0^{(i)} + a_1^{(i)} z_1^{(i)}

with a_k^{(i)} \in \{0, 1\} indicating whether z_k^{(i)} contributed, where y_i \sim N(\mu, \sigma) and z_k^{(i)} \sim N(\mu_k, \sigma_k). For each value x_i, exactly one of z_0 and z_1 contributed.
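One step that helps when writing down the likelihood: conditional on which contributor k was active, x_i is the sum of two independent normals and is therefore itself normal,

x_i \mid k \sim N\left(\mu + \mu_k, \sqrt{\sigma^2 + \sigma_k^2}\right),

so each observation can be modelled by a single normal whose mean and scale depend on the selected contributor. (Note that \mu and the \mu_k, and likewise the variances, are only identified up to a constant shift, so informative priors or fixing one of the parameters may be needed.)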
And this is only the minimal example. The eventual goal is to have more than one level of contributors and more than two contributors per level.
I am trying to use the Bayesian Regression Tutorial as a baseline, but I do not know how to adapt it to this context; I am struggling even with the basics.
What does the model need to look like in order to fit \mu, \mu_0, \mu_1 and \sigma, \sigma_0, \sigma_1 for the simple example with just two contributors?
And how should it be defined to fit the more generic setting with more than one level and two or more contributors per level?
import numpy as np
import pandas as pd
import pyro
import pyro.distributions as dist
import pyro.optim as optim
import torch
import os
import matplotlib.pyplot as plt
import logging
%matplotlib inline
plt.style.use('default')
logging.basicConfig(format='%(message)s', level=logging.INFO)
smoke_test = ('CI' in os.environ)
def gen_data(gen_avg, gen_sig, n_samples, contributer_coefficients):
    """Generate random data.

    Inputs:
    * gen_avg: mean of the baseline distribution
    * gen_sig: sigma of the baseline distribution
    * n_samples: number of samples
    * contributer_coefficients: dictionary with (mean, sig) per contributor per level

    Outputs:
    * data: array with the summed values
    * contributers: matrix with the selected contributor per sample,
      first column is for the first level
    """
    data = np.random.normal(gen_avg, gen_sig, n_samples)
    contributers = np.zeros((n_samples, len(contributer_coefficients)))
    for lvl, cdict in contributer_coefficients.items():
        print(f"creating level {lvl}")
        lvl_influencers = len(cdict)  # number of contributors in this level
        lvldata = np.zeros((n_samples, lvl_influencers))
        # draw samples from every contributor of this level
        for i, (mu, sig) in cdict.items():
            lvldata[:, i] = np.random.normal(mu, sig, n_samples)
        # pick exactly one contributor per sample
        selection = np.random.randint(low=0,
                                      high=lvl_influencers,
                                      size=n_samples)
        contributers[:, lvl] = selection
        data += lvldata[np.arange(n_samples), selection]
    # cast to float32 so the tensors match Pyro's default parameter dtype
    return torch.from_numpy(data).float(), torch.from_numpy(contributers).float()
# parameters for the two contributors in level 0
cc_01 = {0: {0: (1, 1),
             1: (2, 1)}}
# generate the data
data, contributers = gen_data(gen_avg=0,
                              gen_sig=10,
                              n_samples=1000,
                              contributer_coefficients=cc_01)
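The generator already supports deeper configurations. A hypothetical two-level setup (values chosen arbitrarily) could look like this:

# hypothetical: two levels, with two and three contributors respectively
cc_02 = {0: {0: (1, 1),
             1: (2, 1)},
         1: {0: (-1, 0.5),
             1: (0, 0.5),
             2: (1, 0.5)}}
data_2, contributers_2 = gen_data(gen_avg=0,
                                  gen_sig=10,
                                  n_samples=1000,
                                  contributer_coefficients=cc_02)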
pyro.clear_param_store()
def model(contributers, data):
    a = pyro.sample("a", dist.Normal(0., 10.))
    b_0 = pyro.sample("b_0", dist.Normal(0., 1.))
    b_1 = pyro.sample("b_1", dist.Normal(0., 1.))
    sigma = pyro.sample("sigma", dist.Uniform(0., 10.))
    # use the level-0 column; keeping the full (n_samples, n_levels) matrix
    # would broadcast mean against data into an (n, n) matrix
    lvl0 = contributers[:, 0]
    mean = a + (lvl0 == 0) * b_0 + (lvl0 == 1) * b_1
    with pyro.plate("data", len(data)):
        pyro.sample("obs", dist.Normal(mean, sigma), obs=data)
from pyro.infer.autoguide import AutoMultivariateNormal, init_to_mean
from pyro.infer import SVI, Trace_ELBO
guide = AutoMultivariateNormal(model, init_loc_fn=init_to_mean)
svi = SVI(model,
          guide,
          optim.Adam({"lr": .01}),
          loss=Trace_ELBO())
num_iters = 1000 if not smoke_test else 2
for i in range(num_iters):
    elbo = svi.step(contributers, data)
    if i % 100 == 0:
        logging.info("Elbo loss: {}".format(elbo))