Hello, I have been following the bayesian hierarchical tutorial here and this forum question here in trying to solve my problem but I cannot seem to figure out what I am doing wrong and I figure someone here would probably be able to figure it out.
For my example, let us assume that I am trying to model whether a student applies for a spot at a university. My data generating process looks like this.
import uuid import numpy as np import pandas as pd from scipy.special import expit, logit treatment_effect_magnitude = 0 n_students = 200 n_universities = 10 prob_treat = 0.5 intercept = -0.25 n_students = int(n_students) n_universities = int(n_universities) # create a list of students student_ids = [str(uuid.uuid4()) for _ in range(n_students)] # create a list of universities uni_ids = [str(uuid.uuid4()) for _ in range(n_universities)] # mu_i is the affinity of student i to apply for any uni mu_i = np.random.normal(0, 0.2, n_students) # zeta_j is the affinity of uni j to receive an apply from any student zeta_j = np.random.normal(0, 0.1, n_universities) # assign users to a flight treatment = np.random.binomial(1, prob_treat, n_students) data =  for student_i, student_id in enumerate(student_ids): for uni_j, uni_id in enumerate(uni_ids): # generate linear predictor eta = intercept + treatment_effect_magnitude * treatment[student_i] + mu_i[student_i] + zeta_j[uni_j] # generate the probability that the student will apply for uni j prob_apply = expit(eta) # generate outcome y = np.random.binomial(1, prob_apply, 1) # store row of data for later model fitting data.append(dict( y = y, treatment = 'control' if treatment[student_i] == 0 else 'treatment', student_id = student_id, uni_id = uni_id, )) data = pd.DataFrame(data)
I can then fit it using
bambi and it looks like the correct answer, like so.
import bambi import arviz as az formula = 'y ~ 0 + treatment + (1|uni_id) + (1|student_id)' model = bambi.Model(formula, data, family = 'bernoulli') # build graph model.build() model.graph() fitted_model= model.fit( method = 'mcmc', draws = 1000, tune = 1000, cores = 4, chains = 4, include_mean = False, omit_offsets = False ) # cut down list of variables summary az.summary(fitted_model, var_names = ['treatment','1|uni_id_sigma','1|student_id_sigma']) # full list of variables az.summary(fitted_model)
but when I try to fit with numpyro I am getting very different results and I think I am doing something wrong. I suspect I have messed something up with specifying the random component or I am encoding the variables incorrectly. If you could help it would be appreciated.My model is below.
import numpyro from numpyro.infer import MCMC, NUTS, Predictive import numpyro.distributions as dist from jax import random from sklearn.preprocessing import LabelEncoder assert numpyro.__version__.startswith("0.9.0") # define model def numpyro_model(student_id, uni_id, treatment_id, y_obs): sigma_student = numpyro.sample("sigma_student", dist.HalfNormal(5)) sigma_uni = numpyro.sample("sigma_uni", dist.HalfNormal(5)) unique_student_ids = np.unique(student_id) n_students = len(unique_student_ids) unique_uni_IDs = np.unique(uni_id) n_unis = len(unique_uni_IDs) unique_treatment_IDs = np.unique(treatment_id) n_treatment = len(unique_treatment_IDs) with numpyro.plate("plate_i", n_students): offset_student = numpyro.sample("offset_student", dist.Normal(0.0, 1.0)) alpha_student_i = offset_student * sigma_student with numpyro.plate("plate_j", n_unis): offset_uni = numpyro.sample("offset_uni", dist.Normal(0.0, 1.0)) beta_uni_j = offset_uni * sigma_uni with numpyro.plate("plate_f", n_treatment): treatment_k = numpyro.sample("treatment_k", dist.Normal(0, 5)) eta = alpha_student_i[student_id] + beta_uni_j[uni_id] + treatment_k[treatment_id] with numpyro.plate("data", len(y_obs)): y_est = numpyro.sample("y_est", dist.Bernoulli(logits=eta), obs=y_obs) # run model le = LabelEncoder() data["student_id_encoded"] = le.fit_transform(data["student_id"].values) data["uni_id_encoded"] = le.fit_transform(data["uni_id"].values) data["treatment_id_encoded"] = le.fit_transform(data["treatment"].values) y_obs = data["y"].values uni_id_encoded = data["uni_id_encoded"].values student_id_encoded = data["student_id_encoded"].values treatment_id_encoded = data["treatment_id_encoded"].values nuts_kernel = NUTS(numpyro_model) mcmc = MCMC(nuts_kernel, num_samples=3000, num_warmup=3000, num_chains = 4) rng_key = random.PRNGKey(0) mcmc.run(rng_key, student_id_encoded, uni_id_encoded, treatment_id_encoded,y_obs) posterior_samples = mcmc.get_samples() # summary of results. import arviz as az data_az = az.from_numpyro(mcmc) az.summary(data_az)
so if you could help that would be appreciated. I am also trying to figure out how I can increase the speed of the fitting process which is why I am trying to get it to work on numpyro instead of bambi (pymc3). It is okay at the example data size but when I am working with my real data set it is extremely slow, so if you have any tips for speed increases that would also be appreciated.