CUDA error: device-side assert triggered


#1

I am getting the following error: `CUDA error: device-side assert triggered`. The images and labels have been moved to the GPU, but the number of parameters to be learnt in the Bayesian inference is about 1.7 million! Is the error caused by there not being enough memory available to store this many parameters?

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Processing…
Done!
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:70: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:71: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:72: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:73: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).

RuntimeError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/pyro/poutine/trace_messenger.py in __call__(self, *args, **kwargs)
146 try:
–> 147 ret = self.fn(*args, **kwargs)
148 except (ValueError, RuntimeError):

in guide(self, images, labels)
74 cn1_dist=dist.Categorical(pcn1_param)
—> 75 print(cn1_dist.sample())
76 cn2_dist=dist.Categorical(pcn2_param)

/usr/local/lib/python3.6/dist-packages/torch/tensor.py in repr(self)
65 if sys.version_info > (3,):
—> 66 return torch._tensor_str._str(self)
67 else:

/usr/local/lib/python3.6/dist-packages/torch/_tensor_str.py in _str(self)
276 suffixes.append(‘dtype=’ + str(self.dtype))
–> 277 tensor_str = _tensor_str(self, indent)
278

/usr/local/lib/python3.6/dist-packages/torch/_tensor_str.py in _tensor_str(self, indent)
194 self = self.float()
–> 195 formatter = _Formatter(get_summarized_data(self) if summarize else self)
196 return _tensor_str_with_formatter(self, indent, formatter, summarize)

/usr/local/lib/python3.6/dist-packages/torch/_tensor_str.py in init(self, tensor)
79 for value in tensor_view:
—> 80 value_str = ‘{}’.format(value)
81 self.max_width = max(self.max_width, len(value_str))

/usr/local/lib/python3.6/dist-packages/torch/tensor.py in __format__(self, format_spec)
377 if self.dim() == 0:
–> 378 return self.item().__format__(format_spec)
379 return object.__format__(self, format_spec)

RuntimeError: CUDA error: device-side assert triggered

During handling of the above exception, another exception occurred:

RuntimeError Traceback (most recent call last)
in ()
1 instance=Ternary_bayesian_neural_network(prior_cn1,prior_cn2,prior_fp1,prior_fp2,pcn1,pcn2,pfp1,pfp2)
----> 2 instance.do_inference()

in do_inference(self)
115 images=images.to(device)
116 labels=labels.to(device)
–> 117 elbo=svi.step(images,labels)
118 if i%50==0:
119 logging.info(“ELBO Loss{}”.format(elbo))

/usr/local/lib/python3.6/dist-packages/pyro/infer/svi.py in step(self, *args, **kwargs)
97 # get loss and compute gradients
98 with poutine.trace(param_only=True) as param_capture:
—> 99 loss = self.loss_and_grads(self.model, self.guide, *args, **kwargs)
100
101 params = set(site[“value”].unconstrained()

/usr/local/lib/python3.6/dist-packages/pyro/infer/trace_elbo.py in loss_and_grads(self, model, guide, *args, **kwargs)
123 loss = 0.0
124 # grab a trace from the generator
–> 125 for model_trace, guide_trace in self._get_traces(model, guide, *args, **kwargs):
126 loss_particle, surrogate_loss_particle = self._differentiable_loss_particle(model_trace, guide_trace)
127 loss += loss_particle / self.num_particles

/usr/local/lib/python3.6/dist-packages/pyro/infer/elbo.py in _get_traces(self, model, guide, *args, **kwargs)
162 else:
163 for i in range(self.num_particles):
–> 164 yield self._get_trace(model, guide, *args, **kwargs)

/usr/local/lib/python3.6/dist-packages/pyro/infer/trace_elbo.py in _get_trace(self, model, guide, *args, **kwargs)
50 “”"
51 model_trace, guide_trace = get_importance_trace(
—> 52 “flat”, self.max_plate_nesting, model, guide, *args, **kwargs)
53 if is_validation_enabled():
54 check_if_enumerated(guide_trace)

/usr/local/lib/python3.6/dist-packages/pyro/infer/enum.py in get_importance_trace(graph_type, max_plate_nesting, model, guide, *args, **kwargs)
40 against it.
41 “”"
—> 42 guide_trace = poutine.trace(guide, graph_type=graph_type).get_trace(*args, **kwargs)
43 model_trace = poutine.trace(poutine.replay(model, trace=guide_trace),
44 graph_type=graph_type).get_trace(*args, **kwargs)

/usr/local/lib/python3.6/dist-packages/pyro/poutine/trace_messenger.py in get_trace(self, *args, **kwargs)
167 Calls this poutine and returns its trace instead of the function’s return value.
168 “”"
–> 169 self(*args, **kwargs)
170 return self.msngr.get_trace()

/usr/local/lib/python3.6/dist-packages/pyro/poutine/trace_messenger.py in __call__(self, *args, **kwargs)
151 six.reraise(exc_type,
152 exc_type(u"{}\n{}".format(exc_value, shapes)),
–> 153 traceback)
154 self.msngr.trace.add_node("_RETURN", name="_RETURN", type=“return”, value=ret)
155 return ret

/usr/local/lib/python3.6/dist-packages/six.py in reraise(tp, value, tb)
690 value = tp()
691 if value.__traceback__ is not tb:
–> 692 raise value.with_traceback(tb)
693 raise value
694 finally:

/usr/local/lib/python3.6/dist-packages/pyro/poutine/trace_messenger.py in __call__(self, *args, **kwargs)
145 args=args, kwargs=kwargs)
146 try:
–> 147 ret = self.fn(*args, **kwargs)
148 except (ValueError, RuntimeError):
149 exc_type, exc_value, traceback = sys.exc_info()

in guide(self, images, labels)
73 pfp2_param=pyro.param(“fp2_param”,torch.tensor(self.pfp2),constraint=constraints.simplex)
74 cn1_dist=dist.Categorical(pcn1_param)
—> 75 print(cn1_dist.sample())
76 cn2_dist=dist.Categorical(pcn2_param)
77 fp1_dist=dist.Categorical(pfp1_param)

/usr/local/lib/python3.6/dist-packages/torch/tensor.py in repr(self)
64 # characters to replace unicode characters with.
65 if sys.version_info > (3,):
—> 66 return torch._tensor_str._str(self)
67 else:
68 if hasattr(sys.stdout, ‘encoding’):

/usr/local/lib/python3.6/dist-packages/torch/_tensor_str.py in _str(self)
275 if not has_default_dtype:
276 suffixes.append(‘dtype=’ + str(self.dtype))
–> 277 tensor_str = _tensor_str(self, indent)
278
279 if self.layout != torch.strided:

/usr/local/lib/python3.6/dist-packages/torch/_tensor_str.py in _tensor_str(self, indent)
193 if self.dtype is torch.float16:
194 self = self.float()
–> 195 formatter = _Formatter(get_summarized_data(self) if summarize else self)
196 return _tensor_str_with_formatter(self, indent, formatter, summarize)
197

/usr/local/lib/python3.6/dist-packages/torch/_tensor_str.py in init(self, tensor)
78 if not self.floating_dtype:
79 for value in tensor_view:
—> 80 value_str = ‘{}’.format(value)
81 self.max_width = max(self.max_width, len(value_str))
82

/usr/local/lib/python3.6/dist-packages/torch/tensor.py in __format__(self, format_spec)
376 def __format__(self, format_spec):
377 if self.dim() == 0:
–> 378 return self.item().__format__(format_spec)
379 return object.__format__(self, format_spec)
380

RuntimeError: CUDA error: device-side assert triggered
Trace Shapes:
Param Sites:
pcn1_param 800 3
pcn2_param 51200 3
pfp1_param 524288 3
fp2_param 5120 3
Sample Sites:

The numbers on the left (800, 51200, 524288, 5120) are the number of weights in each layer, and 3 is the number of parameters (of a multinomial distribution) per weight. Or is there some other mistake I might have made?


#2

It is hard to look at the stack trace and conclusively say what’s going wrong. One suggestion for debugging would be to reduce the size of your network and see if you continue to see this issue. If you need help debugging, I would suggest posting a minimal example that reproduces this issue. I also find it suspicious that the traceback shows an error while trying to serialize the tensor for printing, so you should consider removing that print statement to get a cleaner error trace.