From what I can tell, to do text classification you call get_rnn_classifier,
which creates the RNN part of the model plus the classifier. The classifier source code is as follows:
class LinearBlock(nn.Module):
    """One classifier layer: BatchNorm1d -> Dropout -> Linear.

    Args:
        ni: number of input features (also the BatchNorm size).
        nf: number of output features of the linear projection.
        drop: dropout probability applied after normalisation.
    """
    def __init__(self, ni, nf, drop):
        super().__init__()
        self.lin = nn.Linear(ni, nf)
        self.drop = nn.Dropout(drop)
        self.bn = nn.BatchNorm1d(ni)

    def forward(self, x):
        # Normalise the incoming features, regularise, then project.
        normed = self.bn(x)
        dropped = self.drop(normed)
        return self.lin(dropped)
class PoolingLinearClassifier(nn.Module):
    """Classifier head that applies "concat pooling" to RNN activations.

    The features fed to the linear layers are the concatenation of the last
    time step, the max-pool over time, and the average-pool over time of the
    top RNN layer's output.

    Args:
        layers: sizes of the successive linear layers; each adjacent pair
            (layers[i], layers[i + 1]) defines one LinearBlock.
        drops: dropout probability for each LinearBlock.
    """
    def __init__(self, layers, drops):
        super().__init__()
        blocks = [LinearBlock(layers[i], layers[i + 1], drops[i])
                  for i in range(len(layers) - 1)]
        self.layers = nn.ModuleList(blocks)

    def pool(self, x, bs, is_max):
        # Collapse the sequence dimension (length-1 adaptive pool), then
        # flatten to (bs, features). x arrives as (sl, bs, nh).
        pool_fn = F.adaptive_max_pool1d if is_max else F.adaptive_avg_pool1d
        return pool_fn(x.permute(1, 2, 0), (1,)).view(bs, -1)

    def forward(self, input):
        raw_outputs, outputs = input
        output = outputs[-1]  # activations of the top RNN layer
        sl, bs, _ = output.size()
        avgpool = self.pool(output, bs, False)
        mxpool = self.pool(output, bs, True)
        # Concat pooling: final time step + max pool + average pool.
        x = torch.cat([output[-1], mxpool, avgpool], 1)
        for block in self.layers:
            l_x = block(x)
            x = F.relu(l_x)
        # NOTE: the value returned is l_x, the *pre-relu* output of the last
        # block — relu is only applied to activations passed between blocks.
        return l_x, raw_outputs, outputs
To me this looks like dropout/BN/relu are being applied around the final output layer. Isn’t this unusual? Why would you apply these operations right before producing the log probabilities?
Apologies if this is answered somewhere in the video lectures, I just was poking through the source code trying to get an understanding of the model.