Using a Self-Attention Layer Instead of a Pooling Layer Gives a Weird Error

Code for the attention layer:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

# LinearBlock and DEVICE come from the surrounding fastai code
class SelfAttention(nn.Module):

    def __init__(self, attention_size):
        super().__init__()
        # a single linear block that maps each timestep to one attention score
        self.layers = nn.ModuleList([
            LinearBlock(attention_size, 1, 0.)])
        self.softmax = nn.Softmax(dim=-1)

    @staticmethod
    def get_mask(attentions):
        """
        Construct mask for padded timesteps, based on lengths
        """
        mask = Variable(torch.ones(attentions.size())).detach()
        mask = mask.to(DEVICE)
        return mask

    def forward(self, input):
        raw_outputs, outputs = input
        # last layer of the encoder: (seq_len, bs, emb_sz) -> (bs, seq_len, emb_sz)
        output = outputs[-1].permute(1, 0, 2)
        bs, ml, _ = output.shape
        x = output.contiguous().view(bs * ml, -1)
        for l in self.layers:
            x = l(x)
            x = F.relu(x)  # nn.Tanh()(x)

        # one score per timestep, normalised over the sequence
        x = x.view(bs, ml)
        scores = self.softmax(x)
        mask = self.get_mask(scores)
        masked_scores = scores * mask
        _sums = masked_scores.sum(-1, keepdim=True)
        scores = masked_scores.div(_sums)
        # attention-weighted sum over timesteps -> one vector per sequence
        weighted = output * scores.unsqueeze(-1).expand_as(output)
        representations = weighted.sum(1).squeeze()
        return representations, raw_outputs, outputs

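To rule out the attention layer itself, here is a rough standalone check that fakes the encoder output: as far as I can tell, MultiBatchRNN returns a pair of lists with one tensor per layer, the last one shaped (seq_len, bs, emb_sz), and the head only uses that last one. This assumes LinearBlock and DEVICE are already in scope, and it reuses the fake outputs for raw_outputs since the head never touches them.

# fake encoder output: one tensor per layer; only the last (seq_len, bs, emb_sz) one is used
emb_sz, bs, sl = 400, 2, 7
outputs = [torch.randn(sl, bs, emb_sz).to(DEVICE) for _ in range(3)]

attn = SelfAttention(emb_sz).to(DEVICE)
reps, raw, outs = attn((outputs, outputs))   # forward expects (raw_outputs, outputs)
print(reps.shape)                            # should be torch.Size([2, 400])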

def get_rnn_classifier(bptt, max_seq, n_class, n_tok, emb_sz, n_hid, n_layers, pad_token,
                       layers, drops, bidir=False, dropouth=0.3, dropouti=0.5,
                       dropoute=0.1, wdrop=0.5, qrnn=False):
    rnn_enc = MultiBatchRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token,
                            bidir=bidir, dropouth=dropouth, dropouti=dropouti,
                            dropoute=dropoute, wdrop=wdrop, qrnn=qrnn)
    # the attention head replaces the usual pooling classifier; 400 matches emb_sz
    model = SequentialRNN(rnn_enc, SelfAttention(400))
    return model

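To check the wiring independent of the learner, a dummy forward pass through the assembled model can be run along these lines (every size here is made up, and n_class, layers and drops are unused by this version of get_rnn_classifier):

# build a small model and push a batch of fake token ids through it
m = get_rnn_classifier(bptt=70, max_seq=1400, n_class=2, n_tok=1000,
                       emb_sz=400, n_hid=1150, n_layers=3, pad_token=1,
                       layers=None, drops=None).to(DEVICE)
m.reset()                                        # build the encoder's hidden state
dummy = torch.zeros(140, 2).long().to(DEVICE)    # (seq_len, batch) of token ids
reps, raw_outputs, outputs = m(dummy)
print(reps.shape)                                # should be torch.Size([2, 400])
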
I have attached the error.

I figured out that self.fit_gen(self.model, self.data, layer_opt, 1, **kwargs) is what introduces these weights (see the debugging sketch after the lr_find code below). I am not sure how to fix it and I have tried everything. The same setup works perfectly with PoolingLayerClassifier; all I am doing is plugging in an attention layer instead. It's very frustrating. Can anyone help?

For reference, here is lr_find, which is where fit_gen gets called:

def lr_find(self, start_lr=1e-5, end_lr=10, wds=None, linear=False, **kwargs):
    self.save('tmp')
    layer_opt = self.get_layer_opt(start_lr, wds)
    self.sched = LR_Finder(layer_opt, len(self.data.trn_dl), end_lr, linear=linear)
    self.fit_gen(self.model, self.data, layer_opt, 1, **kwargs)
    self.load('tmp')
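
One way I am trying to narrow this down is to list exactly which parameters the new head contributes and to diff the model's parameter names before and after the lr_find call. This is plain PyTorch only; model and learn are placeholders for the SequentialRNN above and the learner wrapping it, and it assumes SequentialRNN indexes like nn.Sequential.

# parameters contributed by the SelfAttention head
for name, p in model[1].named_parameters():
    print(name, tuple(p.shape))

# snapshot parameter names, run lr_find, then diff to see whether
# fit_gen really introduces any new weights
before = {n for n, _ in model.named_parameters()}
learn.lr_find()
after = {n for n, _ in model.named_parameters()}
print('new parameters:', after - before)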

@mkardas @jeremy @sebastianruder
