PyTorch assertion error on loss.backward()

This is not related to the course or the fastai library (sorry if this is out of scope). I am trying to train a PyTorch GRU model on my own dataset, but I am running into an assertion error at the point of loss.backward(). Here is my code, followed by the error. Any help is appreciated.

import torch
import torch.nn as nn
from torch.autograd import Variable


class AttendResistance(nn.Module):
    def __init__(self, nb_classes, nb_tokens, embedding_matrix,
                 embed_dropout_rate=0, final_dropout_rate=0, return_attention=False):
        super(AttendResistance, self).__init__()
        embedding_dim = 20
        hidden_size = 32

        self.embed_dropout_rate = embed_dropout_rate
        self.final_dropout_rate = final_dropout_rate
        self.return_attention = return_attention
        self.hidden_size = hidden_size
        self.nb_classes = nb_classes

        self.embed = nn.Embedding(nb_tokens, embedding_dim)
        self.embed.weight = nn.Parameter(embedding_matrix)
        self.embed_dropout = nn.Dropout2d(embed_dropout_rate)
        self.gru = nn.GRU(embedding_dim, hidden_size, num_layers=1, batch_first=True,
                          dropout=0.5, bidirectional=False)
        self.final_drop = nn.Dropout(final_dropout_rate)
        self.linear = nn.Linear(hidden_size, nb_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_seqs):
        print(input_seqs.size())
        x = self.embed(input_seqs)
        print(x.size())
        x = nn.Tanh()(x)
        print(x.size())
        x = self.embed_dropout(x)
        print(x.size())
        x, _ = self.gru(x)
        print(x.size())
        x = self.final_drop(x)
        print(x.size())
        x = self.linear(x[:, -1, :].float())
        print(x.size())
        outputs = self.softmax(x)
        print(outputs.size())

        if self.return_attention:
            return outputs, att_weights
        else:
            return outputs

attn_res = AttendResistance(268, 20, embedding_matrix, 0.5, 0.3, True)
attn_res = attn_res.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(attn_res.parameters())

num_epochs = 10
for epoch in range(num_epochs):
    for i, (prot_seqs, labels) in enumerate(train_loader):
        prot_seqs = Variable(prot_seqs.long()).cuda()
        labels = Variable(labels.long()).cuda()

        #print (prot_seqs)
        #print (labels)
        # Forward + Backward + Optimize
        optimizer.zero_grad()
        outputs, att_weights = attn_res(prot_seqs)
        print (outputs)
        loss = criterion(outputs, torch.max(labels, 1)[1])
        print (loss)
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' 
                   %(epoch+1, num_epochs, i+1, len(X_train)//batch_size, loss.data[0]))

And here is the error with the print output:

torch.Size([64, 1602])
torch.Size([64, 1602, 20])
torch.Size([64, 1602, 20])
torch.Size([64, 1602, 20])
torch.Size([64, 1602, 32])
torch.Size([64, 1602, 32])
torch.Size([64, 268])
torch.Size([64, 268])
Variable containing:
1.00000e-03 *
 3.5743  3.7436  4.2370  ...   3.9607  4.2058  4.2674
 3.5743  3.7436  4.2370  ...   3.9607  4.2058  4.2674
 3.5743  3.7436  4.2370  ...   3.9607  4.2058  4.2674
          ...             ⋱             ...          
 3.5743  3.7436  4.2370  ...   3.9607  4.2058  4.2674
 3.5743  3.7436  4.2370  ...   3.9607  4.2058  4.2674
 3.5743  3.7436  4.2370  ...   3.9607  4.2058  4.2674
[torch.cuda.FloatTensor of size 64x268 (GPU 0)]

Variable containing:
 5.5909
[torch.cuda.FloatTensor of size 1 (GPU 0)]


---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-89-a32cf2edb4cc> in <module>()
     17         print (torch.sum(att_weights))
     18         print (loss)
---> 19         loss.backward()
     20         optimizer.step()
     21 

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/autograd/variable.py in backward(self, gradient, retain_graph, create_graph, retain_variables)
    165                 Variable.
    166         """
--> 167         torch.autograd.backward(self, gradient, retain_graph, create_graph, retain_variables)
    168 
    169     def register_hook(self, hook):

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py in backward(variables, grad_variables, retain_graph, create_graph, retain_variables)
     97 
     98     Variable._execution_engine.run_backward(
---> 99         variables, grad_variables, retain_graph)
    100 
    101 

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/autograd/function.py in _do_backward(self, gradients, retain_variables)
    333     def _do_backward(self, gradients, retain_variables):
    334         self.retain_variables = retain_variables
--> 335         result = super(NestedIOFunction, self)._do_backward(gradients, retain_variables)
    336         if not retain_variables:
    337             del self._nested_output

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/autograd/function.py in backward(self, *gradients)
    341     def backward(self, *gradients):
    342         nested_gradients = _unflatten(gradients, self._nested_output)
--> 343         result = self.backward_extended(*nested_gradients)
    344         return tuple(_iter_None_tensors(result))
    345 

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/nn/_functions/rnn.py in backward_extended(self, grad_output, grad_hy)
    333                 output,
    334                 weight,
--> 335                 grad_weight)
    336         else:
    337             grad_weight = [(None,) * len(layer_weight) for layer_weight in weight]

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/backends/cudnn/rnn.py in backward_weight(fn, input, hx, output, weight, grad_weight)
    466 
    467         # copy the weights from the weight_buf into grad_weight
--> 468         grad_params = get_parameters(fn, handle, dw)
    469         _copyParams(grad_params, grad_weight)
    470         return grad_weight

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/backends/cudnn/rnn.py in get_parameters(fn, handle, weight_buf)
    169                     layer_params.append(param)
    170                 else:
--> 171                     assert cur_offset == offset
    172 
    173                 cur_offset = offset + filter_dim_a[0]

AssertionError: 

Because the error does not give any explicit message, I don't know what I am doing wrong here. I understand that an assertion is failing, but I don't know what the cur_offset and offset variables are. I am running on:

PyTorch 0.3.0.post4
Cuda compilation tools, release 8.0, V8.0.61

Have you tried the solution posted here?

I just found that if I added .float(), as in input_sentence_var = Variable(…float()), then it would work.

I had seen that post in my search, but the solution is not clear to me: I could not find where input_sentence_var is defined in that code, so I cannot work out which variable in my own code corresponds to it.
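
If that fix is about a dtype mismatch (weights ending up as DoubleTensors while cuDNN expects FloatTensors), then my guess is that the closest analogue in my code is the embedding_matrix I pass in as the embedding weight, since prot_seqs already has to stay a LongTensor for nn.Embedding. This is only a guess based on that quote, not a confirmed fix:

# Guess: cast the pretrained weights to single precision before they become the
# nn.Embedding weight, so the GRU input (and everything downstream) is a FloatTensor.
embedding_matrix = embedding_matrix.float()
attn_res = AttendResistance(268, 20, embedding_matrix, 0.5, 0.3, True).cuda()

# Quick check of what dtypes the model actually ends up with:
for name, p in attn_res.named_parameters():
    print(name, p.data.type())

Is that the right place to put the .float(), or does it belong somewhere else?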