I tried to play around with the tutorial example as follows, if I run it on CPU (`use_cuda = False`

), the elapsed time is roughly 53s, but it takes 153s running on GPU (`use_cuda = True`

), so what’s wrong with my implementation? Thanks.

pyro version 0.1.2

pytorch version 0.3

```
import torch
from torch.autograd import Variable
import pyro
import pyro.distributions as dist
import pyro.optim as optim
import pyro.infer as infer
import time
# Reset Pyro's global parameter store so repeated runs start from fresh params.
pyro.clear_param_store()

# Synthetic Bernoulli observations: 60 successes and 4 failures.
data = [1] * 60 + [0] * 4
use_cuda = False
# Shape the observations as a (64, 1) column vector; move to GPU if requested.
if use_cuda:
    data = Variable(torch.Tensor(data)).view(-1, 1).cuda()
else:
    data = Variable(torch.Tensor(data)).view(-1, 1)
def model(data):
    """Beta-Bernoulli model: θ ~ Beta(10, 10); each observation ~ Bernoulli(θ)."""
    # Prior hyperparameters, placed on the same device/dtype as the data.
    alpha0 = Variable(torch.Tensor([10.0])).type_as(data)
    beta0 = Variable(torch.Tensor([10.0])).type_as(data)
    # Latent success probability drawn from its Beta prior.
    theta = pyro.sample("θ", dist.beta, alpha0, beta0)
    # Condition the Bernoulli likelihood on the observed data.
    with pyro.iarange("observed data", use_cuda=use_cuda):
        pyro.sample("obs", dist.bernoulli, theta, obs=data)
def guide(data):
    """Variational family: θ ~ Beta(exp(log α), exp(log β)) with learnable log-params."""
    init_a = Variable(torch.Tensor([2.7]), requires_grad=True)
    init_b = Variable(torch.Tensor([2.7]), requires_grad=True)
    # pyro.param registers the value on first use; later calls return the stored param.
    log_a = pyro.param("log alpha", init_a).type_as(data)
    log_b = pyro.param("log beta", init_b).type_as(data)
    # Exponentiate so the Beta concentration parameters stay strictly positive.
    pyro.sample("θ", dist.beta, torch.exp(log_a), torch.exp(log_b))
# Stochastic variational inference: Adam optimizer, ELBO objective, 5 particles.
adam_params = {"lr": 0.001}
optimizer = optim.Adam(adam_params)
svi = infer.SVI(model, guide, optimizer, loss="ELBO", num_particles=5)

tic = time.time()
n_steps = 4000
for step in range(n_steps):
    loss = svi.step(data)
    if step % 100 == 0:
        print("loss = {}".format(loss))
print("time passed {}".format(time.time() - tic))

# Report the learned posterior Beta parameters and the posterior mean.
# Fix: .cpu() before .numpy() — CUDA tensors cannot be converted to numpy
# directly, so the original crashed here when use_cuda=True; .cpu() is a
# no-op for tensors already on the CPU.
α = torch.exp(pyro.param("log alpha")).data.cpu().numpy()[0]
β = torch.exp(pyro.param("log beta")).data.cpu().numpy()[0]
print(α, β)
print(α / (α + β))
```