torch.nn.Embedding causes "RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation"

Exception has occurred: RuntimeError
one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.LongTensor [35, 1]] is at version 2; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
loss = csis.step()

I keep getting this error. When I enabled anomaly detection with `torch.autograd.set_detect_anomaly(True)`, I got this trace:

char_class_samples, _ = sample_from_transformer(self.guide_format, X, "char_format", len(X))
all_prob = model.forward(input, trg)
output = self.transformer.forward(src.unsqueeze(1), trg.unsqueeze(1), src_mask, trg_mask)
embedded_src = self.enc_embed(src)
result = self.forward(*input, **kwargs)
self.norm_type, self.scale_grad_by_freq, self.sparse)
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)

Here’s my Transformer handler

import torch
from const import *
from model.Transformer import Transformer
class TransformerHandler:
    """Pair a ``Transformer`` model with the vocabulary indices it needs.

    The handler records the vocabulary sizes and the special-token indices
    (padding, end-of-sequence, start-of-sequence), builds the underlying
    ``Transformer``, and derives padding masks before each forward pass.
    """

    def __init__(self, encoder_vocab: list, decoder_vocab: list, decoder_sos_idx: int):
        """Build the underlying model from the two vocabularies.

        Args:
            encoder_vocab: Input-side vocabulary; must contain ``PAD``.
            decoder_vocab: Output-side vocabulary; must contain ``PAD`` and
                ``EOS``.
            decoder_sos_idx: Index of the start-of-sequence token in
                ``decoder_vocab``.

        Raises:
            ValueError: From ``list.index`` when ``PAD`` or ``EOS`` is
                missing from a vocabulary.
        """
        super().__init__()
        self.encoder_dim = len(encoder_vocab)
        self.decoder_dim = len(decoder_vocab)
        self.decoder_sos_idx = decoder_sos_idx
        self.decoder_pad_idx = decoder_vocab.index(PAD)
        self.encoder_pad_idx = encoder_vocab.index(PAD)
        self.decoder_eos_idx = decoder_vocab.index(EOS)
        self.transformer = Transformer(
            self.encoder_dim,
            self.decoder_dim,
            self.encoder_pad_idx,
            self.decoder_pad_idx,
        )

    def forward(self, src: torch.Tensor, trg: torch.Tensor):
        """Run the model on one source/target pair of index tensors.

        A new axis is inserted at position 1 of both ``src`` and ``trg``
        before they are handed to the model; the masks are computed from the
        tensors as given (before that axis is added).

        Args:
            src: Encoder token indices.
                Presumably 1-D (sequence length) -- TODO confirm with caller.
            trg: Decoder token indices, same layout assumption as ``src``.

        Returns:
            Whatever the wrapped ``Transformer`` returns for these inputs.
        """
        # NOTE(review): get_pad_mask marks NON-pad positions as True and the
        # masks are passed as the 3rd/4th positional arguments of the model.
        # Confirm this matches the mask convention the model expects.
        src_mask = self.get_pad_mask(src, self.encoder_pad_idx)
        trg_mask = self.get_pad_mask(trg, self.decoder_pad_idx)
        # Call the module itself rather than ``.forward`` so that any
        # registered forward hooks are executed.
        output = self.transformer(src.unsqueeze(1), trg.unsqueeze(1), src_mask, trg_mask)
        return output

    def get_pad_mask(self, seq, pad_idx):
        """Return an elementwise mask that is True where ``seq != pad_idx``."""
        return (seq != pad_idx)

And here’s my transformer

import math
import torch
from torch import nn
from const import *
class Transformer(nn.Module):
    """Token-level wrapper around ``nn.Transformer``.

    Embeds source and target index tensors, runs them through
    ``nn.Transformer``, projects the result to the output vocabulary with a
    linear layer and applies a softmax over dimension 2.
    """

    def __init__(self, input_dim: int, output_dim: int, input_pad_idx: int, output_pad_idx: int, d_model: int = 512, num_head: int = 8, num_e_layer: int = 6, num_d_layer: int = 6, ff_dim: int = 2048, drop_out: float = 0.1):
        """Create the embeddings, the transformer and the output projection.

        Args:
            input_dim: Size of the vocab of the input.
            output_dim: Size of the vocab for output.
            input_pad_idx: Passed as ``padding_idx`` to the encoder embedding.
            output_pad_idx: Passed as ``padding_idx`` to the decoder embedding.
            d_model: The dimension to embed input and output features into.
            num_head: Number of heads in multiheaded attention models.
            num_e_layer: Number of sub-encoder layers.
            num_d_layer: Number of sub-decoder layers.
            ff_dim: Dimension of feedforward network in multihead models.
            drop_out: The drop out percentage.
        """
        super().__init__()
        self.transformer = nn.Transformer(d_model, num_head, num_e_layer, num_d_layer, ff_dim, drop_out)
        self.dec_embed = nn.Embedding(output_dim, d_model, padding_idx=output_pad_idx)
        self.enc_embed = nn.Embedding(input_dim, d_model, padding_idx=input_pad_idx)
        self.fc1 = nn.Linear(d_model, output_dim)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, src: torch.Tensor, trg: torch.Tensor, src_mask: torch.Tensor = None,
                trg_mask: torch.Tensor = None):
        """Return softmax-normalised scores over the output vocabulary.

        Args:
            src: Integer token indices for the encoder side.
            trg: Integer token indices for the decoder side.
            src_mask: Optional mask forwarded to ``nn.Transformer`` as its
                third positional argument.
            trg_mask: Optional mask forwarded to ``nn.Transformer`` as its
                fourth positional argument.

        Returns:
            The output of ``fc1`` applied to the transformer output, passed
            through a softmax over dimension 2.
        """
        embedded_src = self.enc_embed(src)
        embedded_trg = self.dec_embed(trg)
        # NOTE(review): the third/fourth positional parameters of
        # nn.Transformer.forward are the attention masks (src_mask/tgt_mask),
        # not the key-padding masks. If callers pass padding masks here they
        # probably belong in src_key_padding_mask/tgt_key_padding_mask --
        # confirm against the callers before changing.
        # Call the module itself rather than ``.forward`` so that any
        # registered forward hooks are executed.
        output = self.transformer(embedded_src, embedded_trg, src_mask, trg_mask)
        return self.softmax(self.fc1(output))

When I remove the nn.Embedding layers, it works. Why is this a problem for Pyro?

Hmm, this doesn’t seem like a Pyro error from your traceback or partial code, but it’s hard to say for sure without a runnable snippet that reproduces the error. What happens if you remove all Pyro inference code and just call backward with respect to some dummy loss like torch.norm on your model output when sampling from the prior?