Hello, I have been following the bayesian hierarchical tutorial here and this forum question here in trying to solve my problem but I cannot seem to figure out what I am doing wrong and I figure someone here would probably be able to figure it out.
For my example, let us assume that I am trying to model whether a student applies for a spot at a university. My data generating process looks like this.
import uuid
import numpy as np
import pandas as pd
from scipy.special import expit, logit
treatment_effect_magnitude = 0
n_students = 200
n_universities = 10
prob_treat = 0.5
intercept = -0.25
n_students = int(n_students)
n_universities = int(n_universities)
# create a list of students
student_ids = [str(uuid.uuid4()) for _ in range(n_students)]
# create a list of universities
uni_ids = [str(uuid.uuid4()) for _ in range(n_universities)]
# mu_i is the affinity of student i to apply for any uni
mu_i = np.random.normal(0, 0.2, n_students)
# zeta_j is the affinity of uni j to receive an apply from any student
zeta_j = np.random.normal(0, 0.1, n_universities)
# assign users to a flight
treatment = np.random.binomial(1, prob_treat, n_students)
data = []
for student_i, student_id in enumerate(student_ids):
for uni_j, uni_id in enumerate(uni_ids):
# generate linear predictor
eta = intercept + treatment_effect_magnitude * treatment[student_i] + mu_i[student_i] + zeta_j[uni_j]
# generate the probability that the student will apply for uni j
prob_apply = expit(eta)
# generate outcome
y = np.random.binomial(1, prob_apply, 1)[0]
# store row of data for later model fitting
data.append(dict(
y = y,
treatment = 'control' if treatment[student_i] == 0 else 'treatment',
student_id = student_id,
uni_id = uni_id,
))
data = pd.DataFrame(data)
I can then fit it using bambi
and it looks like the correct answer, like so.
import bambi
import arviz as az
formula = 'y ~ 0 + treatment + (1|uni_id) + (1|student_id)'
model = bambi.Model(formula, data, family = 'bernoulli')
# build graph
model.build()
model.graph()
fitted_model= model.fit(
method = 'mcmc',
draws = 1000,
tune = 1000,
cores = 4,
chains = 4,
include_mean = False,
omit_offsets = False
)
# cut down list of variables summary
az.summary(fitted_model, var_names = ['treatment','1|uni_id_sigma','1|student_id_sigma'])
# full list of variables
az.summary(fitted_model)
but when I try to fit with numpyro I am getting very different results and I think I am doing something wrong. I suspect I have messed something up with specifying the random component or I am encoding the variables incorrectly. If you could help it would be appreciated.My model is below.
import numpyro
from numpyro.infer import MCMC, NUTS, Predictive
import numpyro.distributions as dist
from jax import random
from sklearn.preprocessing import LabelEncoder
assert numpyro.__version__.startswith("0.9.0")
# define model
def numpyro_model(student_id, uni_id, treatment_id, y_obs):
sigma_student = numpyro.sample("sigma_student", dist.HalfNormal(5))
sigma_uni = numpyro.sample("sigma_uni", dist.HalfNormal(5))
unique_student_ids = np.unique(student_id)
n_students = len(unique_student_ids)
unique_uni_IDs = np.unique(uni_id)
n_unis = len(unique_uni_IDs)
unique_treatment_IDs = np.unique(treatment_id)
n_treatment = len(unique_treatment_IDs)
with numpyro.plate("plate_i", n_students):
offset_student = numpyro.sample("offset_student", dist.Normal(0.0, 1.0))
alpha_student_i = offset_student * sigma_student
with numpyro.plate("plate_j", n_unis):
offset_uni = numpyro.sample("offset_uni", dist.Normal(0.0, 1.0))
beta_uni_j = offset_uni * sigma_uni
with numpyro.plate("plate_f", n_treatment):
treatment_k = numpyro.sample("treatment_k", dist.Normal(0, 5))
eta = alpha_student_i[student_id] + beta_uni_j[uni_id] + treatment_k[treatment_id]
with numpyro.plate("data", len(y_obs)):
y_est = numpyro.sample("y_est", dist.Bernoulli(logits=eta), obs=y_obs)
# run model
le = LabelEncoder()
data["student_id_encoded"] = le.fit_transform(data["student_id"].values)
data["uni_id_encoded"] = le.fit_transform(data["uni_id"].values)
data["treatment_id_encoded"] = le.fit_transform(data["treatment"].values)
y_obs = data["y"].values
uni_id_encoded = data["uni_id_encoded"].values
student_id_encoded = data["student_id_encoded"].values
treatment_id_encoded = data["treatment_id_encoded"].values
nuts_kernel = NUTS(numpyro_model)
mcmc = MCMC(nuts_kernel, num_samples=3000, num_warmup=3000, num_chains = 4)
rng_key = random.PRNGKey(0)
mcmc.run(rng_key, student_id_encoded, uni_id_encoded, treatment_id_encoded,y_obs)
posterior_samples = mcmc.get_samples()
# summary of results.
import arviz as az
data_az = az.from_numpyro(mcmc)
az.summary(data_az)
so if you could help that would be appreciated. I am also trying to figure out how I can increase the speed of the fitting process which is why I am trying to get it to work on numpyro instead of bambi (pymc3). It is okay at the example data size but when I am working with my real data set it is extremely slow, so if you have any tips for speed increases that would also be appreciated.