I have about 20k sentences that I'm training a simple text CNN on. I'm using spaCy to tokenize and pymagnitude to look up word vectors, but for some reason a single epoch takes forever (upwards of an hour) on Colab with the GPU accelerator. This is a pretty small model, so it should train quickly. I don't think the problem is in the model code; my hunch is that the Dataset/DataLoader combination is incredibly slow.
Is there anything I can do to speed it up?
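To check the hunch, I figure the first thing to measure is how long the data pipeline takes on its own, with the model completely out of the picture (this uses the train_loader built further down):

import time

# Rough diagnostic: iterate the DataLoader without touching the model.
start = time.time()
n_batches = 0
for xb, yb in train_loader:
    n_batches += 1
    if n_batches == 50:
        break
print(f'{(time.time() - start) / n_batches:.3f} s per batch, data loading only')

If that per-batch number is large, the GPU is just sitting idle waiting for data.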
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, SubsetRandomSampler
from pymagnitude import Magnitude
from sklearn.preprocessing import MultiLabelBinarizer

# nlp (a loaded spaCy model), dev, data_path, and the uppercase hyperparameters
# (PAD_LEN, EMB_DIM, NUM_KERNELS, FILTERS, DROPOUT, TOTAL_OUTPUT_CLASSES,
# BATCH_SIZE, EPOCHS) are defined elsewhere in the notebook.

class Lambda(nn.Module):
    '''Wrap an arbitrary function so it can be used as an nn.Module.'''
    def __init__(self, func):
        super().__init__()
        self.func = func

    def forward(self, x):
        return self.func(x)

def preprocess(x):
    '''Take a data sample and do some cleaning/preprocessing.'''
    # -1 for variable batch size, 1 input channel, PAD_LEN is the document length
    return x.view(-1, 1, PAD_LEN, EMB_DIM).to(dev)
class text_cnn(nn.Module):
    def __init__(self):
        super().__init__()
        self.preprocess = Lambda(preprocess)
        # 2D convolutions over the (PAD_LEN, EMB_DIM) plane: each kernel spans
        # the full embedding dimension, matching the 4D input and MaxPool2d below
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=NUM_KERNELS, kernel_size=(FILTERS[0], EMB_DIM))
        self.conv2 = nn.Conv2d(in_channels=1, out_channels=NUM_KERNELS, kernel_size=(FILTERS[1], EMB_DIM))
        self.conv3 = nn.Conv2d(in_channels=1, out_channels=NUM_KERNELS, kernel_size=(FILTERS[2], EMB_DIM))
        self.dropout = nn.Dropout(DROPOUT)
        # Max-over-time pooling: each pool collapses the n-gram dimension to 1
        self.pool1 = nn.MaxPool2d(kernel_size=(PAD_LEN - FILTERS[0] + 1, 1))
        self.pool2 = nn.MaxPool2d(kernel_size=(PAD_LEN - FILTERS[1] + 1, 1))
        self.pool3 = nn.MaxPool2d(kernel_size=(PAD_LEN - FILTERS[2] + 1, 1))
        self.output = nn.Linear(3 * NUM_KERNELS, TOTAL_OUTPUT_CLASSES)
    def forward(self, xb):
        xb = self.preprocess(xb)
        trigram = self.pool1(F.relu(self.conv1(xb)))    # (N, NUM_KERNELS, 1, 1)
        quadgram = self.pool2(F.relu(self.conv2(xb)))
        pentagram = self.pool3(F.relu(self.conv3(xb)))
        xb = torch.cat((trigram, quadgram, pentagram), 1)
        # flatten from dim 1 keeps the batch dimension intact even when a
        # batch contains a single sample, which a bare squeeze() would drop
        xb = self.dropout(torch.flatten(xb, 1))
        return self.output(xb)
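To rule out the model itself, a standalone timing of forward passes on random tensors (no data pipeline involved) is a quick check:

import time

# Rule out the model: time forward passes on random data.
model = text_cnn().to(dev)
dummy = torch.randn(BATCH_SIZE, PAD_LEN, EMB_DIM, device=dev)
start = time.time()
with torch.no_grad():
    for _ in range(100):
        model(dummy)
if torch.cuda.is_available():
    torch.cuda.synchronize()  # wait for queued GPU work before reading the clock
print(f'{(time.time() - start) / 100 * 1000:.2f} ms per forward pass')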
def collect_multi_labels(series):
    return set(series.tolist())

class TextDataset(Dataset):
    def __init__(self, csv_file):
        # Merge duplicate comments, collecting all of their labels into a set
        self.dataset = pd.read_csv(csv_file).groupby('comment').agg(collect_multi_labels).reset_index()
        # Tokenize every comment once, up front, with spaCy
        self.dataset['tokens'] = self.dataset.comment.map(nlp)
        self.vectors = Magnitude("./vectors.magnitude", pad_to_length=PAD_LEN)
        self.mlb = MultiLabelBinarizer()
        self.mlb.fit(self.dataset['mot'].tolist())

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        row = self.dataset.iloc[idx, :]
        comment, label = row.tokens, row.mot
        label = self.mlb.transform([label])[0]
        # Every access re-queries the Magnitude file for the word vectors
        return (self.vectors.query([token.text for token in comment if token.text]),
                torch.Tensor(label).float().to(dev))
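My main suspect is the Magnitude lookup in __getitem__: every sample triggers a fresh query against the .magnitude file, on every epoch. A sketch of a variant that pays that cost once in __init__, so __getitem__ becomes a cheap index (PrecomputedTextDataset is my name for it, not an existing class):

# Hypothetical variant: cache all word vectors and labels up front.
class PrecomputedTextDataset(TextDataset):
    def __init__(self, csv_file):
        super().__init__(csv_file)
        # One Magnitude query per comment, done once instead of once per epoch
        self.features = [
            torch.from_numpy(self.vectors.query([t.text for t in doc if t.text])).float()
            for doc in self.dataset['tokens']
        ]
        self.labels = torch.from_numpy(
            self.mlb.transform(self.dataset['mot'].tolist())).float()

    def __getitem__(self, idx):
        # CPU tensors only, so worker processes can build batches;
        # move the batch to dev inside the training loop instead
        return self.features[idx], self.labels[idx]

Since the labels no longer carry .to(dev), the training loop would need yb = yb.to(dev) (the model's preprocess already moves xb). The trade-off is RAM: roughly 20k × PAD_LEN × EMB_DIM floats, so worth checking it fits.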
dataset = TextDataset(data_path)

VALIDATION_SET = 0.3
SHUFFLE_DATASET = True
RANDOM_SEED = 42

dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(VALIDATION_SET * dataset_size))
np.random.seed(RANDOM_SEED)
np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

train_loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, sampler=valid_sampler)
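If __getitem__ returns CPU tensors (as in the cached sketch above), the loaders could also prepare batches in parallel. Something like this might help, though I'm not sure how a Magnitude object behaves across worker processes, which is another argument for caching the vectors first:

train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=BATCH_SIZE, sampler=train_sampler,
    num_workers=2,    # prepare batches in background processes
    pin_memory=True)  # faster CPU -> GPU copies when batches are moved to dev in the loop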
def get_model():
    model = text_cnn().to(dev)
    opt = optim.Adam(model.parameters(), lr=1e-4)
    return model, opt

def loss_batch(model, loss_func, xb, yb, opt=None):
    loss = loss_func(model(xb), yb.squeeze())
    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()
    return loss.item(), len(xb)
def fit(model, opt, train_dl, valid_dl):
    loss_func = nn.BCEWithLogitsLoss()
    for epoch in range(EPOCHS):
        model.train()
        count = 0
        for xb, yb in train_dl:
            loss_batch(model, loss_func, xb, yb, opt)
            count += 1
            print(count)  # progress: batches completed this epoch

        model.eval()
        with torch.no_grad():
            losses, nums = zip(*[loss_batch(model, loss_func, xb, yb) for xb, yb in valid_dl])
        val_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)
        print(f'epoch # {epoch} ', val_loss)
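For completeness, this is how I wire it all together (with opt passed into fit explicitly):

model, opt = get_model()
fit(model, opt, train_loader, validation_loader)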