Configuring Pyro to use multiple GPUs

I searched around this forum and finally got my model running on a GPU, but I just realized it is only running on 1 of my 8 GPUs. Is there a guide on how to enable all of the GPUs? Will the program run faster with all 8 GPUs? Here is my code:

import logging
import os

import torch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from torch.distributions import constraints

import pyro
import pyro.distributions as dist
import pyro.optim as optim
from scipy import stats
# Seed Pyro's random number generation so inference runs are repeatable.
pyro.set_rng_seed(1)
# Seed NumPy's global RNG with a separate, fixed seed.
RANDOM_SEED = 8927
np.random.seed(RANDOM_SEED)

# Make newly created float tensors default to CUDA tensors. This targets a
# single GPU only (the first one, in the poster's case).
torch.set_default_tensor_type('torch.cuda.FloatTensor')

# https://docs.pymc.io/notebooks/GLM-negative-binomial-regression.html

# Mean Poisson values for each alcohol x antihistamine group
(
    theta_noalcohol_meds,    # no alcohol, took an antihist
    theta_alcohol_meds,      # alcohol, took an antihist
    theta_noalcohol_nomeds,  # no alcohol, no antihist
    theta_alcohol_nomeds,    # alcohol, no antihist
) = (1, 3, 6, 36)

# Gamma shape parameter
alpha = 10


def get_nb_vals(mu, alpha, size):
    """Draw negative-binomial counts as a gamma-Poisson mixture.

    Samples `size` rates from a gamma distribution with shape `alpha` and
    scale `mu / alpha` (so the rates have mean `mu`), then draws one
    Poisson count for each sampled rate and returns those counts.
    """
    gamma_scale = mu / alpha
    rates = stats.gamma.rvs(alpha, size=size, scale=gamma_scale)
    counts = stats.poisson.rvs(rates)
    return counts


# Create samples: n draws for each of the four alcohol x antihistamine groups
n = 1000000
group_means = (
    theta_noalcohol_meds,
    theta_alcohol_meds,
    theta_noalcohol_nomeds,
    theta_alcohol_nomeds,
)
df = pd.DataFrame(
    {
        # Counts are drawn group by group, in the order listed above.
        "nsneeze": np.concatenate(
            [get_nb_vals(group_mu, alpha, n) for group_mu in group_means]
        ),
        # Alcohol flag per group of n rows: False, True, False, True.
        "alcohol": np.tile(np.repeat([False, True], n), 2),
        # First 2n rows took an antihistamine, the last 2n rows did not.
        "nomeds": np.repeat([False, True], 2 * n),
    }
)


def model(nsneeze, alcohol, nomeds, al_no):
    """Gamma-Poisson (negative binomial) regression model for sneeze counts.

    Samples four coefficients ``b0``..``b3`` from Normal(0, 10) priors and a
    dispersion ``phi`` from a HalfCauchy(2.5) prior, forms a log-linear mean
    from ``alcohol``, ``nomeds`` and ``al_no``, and conditions a GammaPoisson
    likelihood on the observed ``nsneeze`` inside a ``"data"`` plate.

    Every prior parameter tensor is moved to the GPU with ``.cuda()``.
    """

    def normal_prior(site):
        # Normal(0, 10) prior shared by all regression coefficients.
        loc = torch.Tensor([0.]).cuda()
        scale = torch.Tensor([10.]).cuda()
        return pyro.sample(site, dist.Normal(loc, scale))

    b0 = normal_prior("b0")
    b1 = normal_prior("b1")
    b2 = normal_prior("b2")
    b3 = normal_prior("b3")

    phi = pyro.sample("phi", dist.HalfCauchy(torch.Tensor([2.5]).cuda()))

    # Linear predictor on the log scale; exponentiate to get the mean count.
    log_mu = b0 + b1 * alcohol + b2 * nomeds + b3 * al_no
    mu = torch.exp(log_mu)

    # GammaPoisson(concentration, rate) with rate = phi / mu has mean mu.
    concentration = phi
    rate = phi / mu

    with pyro.plate("data", len(nsneeze), use_cuda=True):
        pyro.sample("obs", dist.GammaPoisson(concentration, rate), obs=nsneeze)

# Build the training tensors on the GPU.
# The frame mixes integer and boolean columns, so cast to float32 explicitly
# instead of relying on the dtype of the raw `.values` array.
features = df[["nsneeze", "alcohol", "nomeds"]].astype("float32")
train = torch.tensor(features.values, dtype=torch.float).cuda()
nsneeze, alcohol, nomeds = train[:, 0], train[:, 1], train[:, 2]
# The frame has only three columns, so indexing a fourth column fails.
# NOTE(review): `al_no` is presumably the alcohol x no-meds interaction term
# of the referenced GLM notebook -- confirm against the intended model.
al_no = alcohol * nomeds

%%time
from pyro.infer.autoguide import AutoMultivariateNormal, init_to_mean
from pyro.infer import SVI, Trace_ELBO

# Fit the model with stochastic variational inference, using a multivariate
# normal autoguide initialised with init_to_mean.
num_iters = 10000
report_every = 500

guide = AutoMultivariateNormal(model, init_loc_fn=init_to_mean)
optimizer = optim.Adam({"lr": .001})
elbo_objective = Trace_ELBO()
svi = SVI(model, guide, optimizer, loss=elbo_objective)

# Clear the global parameter store before the first optimisation step.
pyro.clear_param_store()

batch = (nsneeze, alcohol, nomeds, al_no)
loss = []  # value returned by each svi.step call, in order
for i in range(num_iters):
    elbo = svi.step(*batch)
    loss.append(elbo)
    if i % report_every:
        continue
    print("Elbo loss: {}".format(elbo))

As you can see, only 1 GPU is active:

Generally speaking, it usually only makes sense to use multiple GPUs when you're training very large neural networks with lots of parameters. That is not the case here.

I think you can do this with horovod as in this example: https://pyro.ai/examples/svi_horovod.html

1 Like

I see, thanks. I think that even for a regression with a fair number of features, a single GPU may still be enough. I need to learn more about multi-GPU use cases.