How can you identify the mean and standard deviation parameters of the underlying distributions if you only know the sum of the drawn values?
Assume you have random numbers composed of several contributors:

x_i = y_i + a_0^{(i)} z_0^{(i)} + a_1^{(i)} z_1^{(i)}

with a_k^{(i)} \in \{0, 1\} indicating whether z_k^{(i)} contributed, where y_i \sim N(\mu, \sigma) and z_k^{(i)} \sim N(\mu_k, \sigma_k). For each value x_i, exactly one of z_0 and z_1 contributed.
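One step that helps when writing down the likelihood: conditional on which contributor k was active, x_i is the sum of two independent normals and is therefore itself normal,

x_i \mid k \sim N\left(\mu + \mu_k, \sqrt{\sigma^2 + \sigma_k^2}\right),

so each observation can be modelled by a single normal whose mean and scale depend on the selected contributor. (Note that \mu and the \mu_k, and likewise the variances, are only identified up to a constant shift, so informative priors or fixing one of the parameters may be needed.)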
And this is only the minimal example. The eventual goal is to have more than one level of contributors and more than two contributors per level.
I am trying to use the Bayesian Regression Tutorial as a baseline, but I do not know how to adapt it to this context; I am struggling even with the basics.
What does the model need to look like in order to fit \mu, \mu_0, \mu_1 and \sigma, \sigma_0, \sigma_1 for the simple example with just two contributors?
And how should it be defined to fit the more generic setting with more than one level and two or more contributors per level?
import numpy as np
import pandas as pd
import pyro
import pyro.distributions as dist
import pyro.optim as optim
import torch
import os
import matplotlib.pyplot as plt
import logging
%matplotlib inline
plt.style.use('default')
logging.basicConfig(format='%(message)s', level=logging.INFO)
smoke_test = ('CI' in os.environ)
def gen_data(gen_avg, gen_sig, n_samples, contributer_coefficients):
    """Generate random data.

    Inputs:
    * gen_avg: mean of the baseline distribution
    * gen_sig: sigma of the baseline distribution
    * n_samples: number of samples
    * contributer_coefficients: dictionary with (mean, sig) per contributor per level

    Outputs:
    * data: array with the summed values
    * contributers: matrix with the selected contributor per sample,
      first column is for the first level
    """
    data = np.random.normal(gen_avg, gen_sig, n_samples)
    contributers = np.zeros((n_samples, len(contributer_coefficients)))
    for lvl, cdict in contributer_coefficients.items():
        print(f"creating level {lvl}")
        lvl_influencers = len(cdict)  # number of contributors in this level
        lvldata = np.zeros((n_samples, lvl_influencers))
        # draw samples from every contributor of this level
        for i, (mu, sig) in cdict.items():
            lvldata[:, i] = np.random.normal(mu, sig, n_samples)
        # pick exactly one contributor per sample
        selection = np.random.randint(low=0,
                                      high=lvl_influencers,
                                      size=n_samples)
        contributers[:, lvl] = selection
        data += lvldata[np.arange(n_samples), selection]
    # cast to float32 so the tensors match Pyro's default parameter dtype
    return torch.from_numpy(data).float(), torch.from_numpy(contributers).float()
# parameters for the two contributors in level 0
cc_01 = {0: {0: (1, 1),
             1: (2, 1)}}
# generate the data
data, contributers = gen_data(gen_avg=0,
                              gen_sig=10,
                              n_samples=1000,
                              contributer_coefficients=cc_01)
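The generator already supports deeper configurations. A hypothetical two-level setup (values chosen arbitrarily) could look like this:

# hypothetical: two levels, with two and three contributors respectively
cc_02 = {0: {0: (1, 1),
             1: (2, 1)},
         1: {0: (-1, 0.5),
             1: (0, 0.5),
             2: (1, 0.5)}}
data_2, contributers_2 = gen_data(gen_avg=0,
                                  gen_sig=10,
                                  n_samples=1000,
                                  contributer_coefficients=cc_02)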
pyro.clear_param_store()
def model(contributers, data):
    a = pyro.sample("a", dist.Normal(0., 10.))
    b_0 = pyro.sample("b_0", dist.Normal(0., 1.))
    b_1 = pyro.sample("b_1", dist.Normal(0., 1.))
    sigma = pyro.sample("sigma", dist.Uniform(0., 10.))
    # use the level-0 column; keeping the full (n_samples, n_levels) matrix
    # would broadcast mean against data into an (n, n) matrix
    lvl0 = contributers[:, 0]
    mean = a + (lvl0 == 0) * b_0 + (lvl0 == 1) * b_1
    with pyro.plate("data", len(data)):
        pyro.sample("obs", dist.Normal(mean, sigma), obs=data)
from pyro.infer.autoguide import AutoMultivariateNormal, init_to_mean
from pyro.infer import SVI, Trace_ELBO
guide = AutoMultivariateNormal(model, init_loc_fn=init_to_mean)
svi = SVI(model,
          guide,
          optim.Adam({"lr": .01}),
          loss=Trace_ELBO())
num_iters = 1000 if not smoke_test else 2
for i in range(num_iters):
    elbo = svi.step(contributers, data)
    if i % 100 == 0:
        logging.info("Elbo loss: {}".format(elbo))