Different results training with basic loop vs with Learner

Hello all. I am training a Recurrent Encoder-Decoder network to generate animation curves from speech.

To start off with, I created a basic PyTorch loop to overfit a subsample of my data, and this went well.

[screenshot of the results from the basic PyTorch loop: Annotation 2020-04-09 183206]

I tried to do the same with a fastai Learner, but the predicted curves come out very smooth and the results are poor.
[screenshot of the results from the fastai Learner: Annotation 2020-04-09 183523]

Can somebody help me understand what modifications I need to make to create a simple Learner that will train my model?
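For reference, by a "simple Learner" I mean something along these lines. This is just a minimal sketch assuming fastai v2, where train_dl/valid_dl are the plain PyTorch DataLoaders I use in the loop at the bottom of the post; wrapping them like this may well be where I'm going wrong:

from fastai.basics import Learner, DataLoaders
import torch.nn as nn

# Assumption: train_dl and valid_dl are the plain PyTorch DataLoaders from the loop below
dls = DataLoaders(train_dl, valid_dl)

model = EDRNN(input_size, output_size)
learn = Learner(dls, model, loss_func=nn.L1Loss())

learn.fit(1000, lr=1e-3)  # intended to roughly match the manual loop below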

Model:

from torch import nn
import torch.nn.functional as F
import torch

input_size = 40
hidden_size = 256
output_size = 1

CUDA = torch.cuda.is_available()

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        # bs (batch size) is defined earlier in my notebook; the hidden state is
        # created once here and reused for every batch
        self.hidden = torch.zeros(1, bs, hidden_size).to("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, input, hidden):
        output = input.transpose(2, 1)  # (bs, features, time) -> (bs, time, features)
        output, hidden = self.gru(output, hidden)
        return output, hidden

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        output = F.relu(input)
        output, hidden = self.gru(output, hidden)
        output = self.linear(output)
        return output, hidden

class EDRNN(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        # architecture: encoder GRU feeding a decoder GRU with a linear head
        self.enc = EncoderRNN(input_size, hidden_size)
        self.dec = DecoderRNN(hidden_size, output_size)
        self = self.cuda() if CUDA else self

    def forward(self, xb):
        enc_out, hidden = self.enc(xb, self.enc.hidden)
        dec_out, hidden = self.dec(enc_out, hidden)
        return dec_out.squeeze(dim=2)

    def decode(self, x):  # for inference
        pass
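One detail I am unsure about is that the encoder's hidden state is a single buffer created at construction time with a fixed batch size bs. Just to illustrate what I mean, a per-batch version of the encoder would look something like this sketch (not what I am currently running):

class EncoderRNNPerBatch(nn.Module):
    # Variant sketch: build a fresh zero hidden state for every batch
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)

    def forward(self, input):
        bs = input.size(0)  # take the batch size from the input itself
        hidden = torch.zeros(1, bs, self.gru.hidden_size, device=input.device)
        output, hidden = self.gru(input.transpose(2, 1), hidden)
        return output, hidden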

PyTorch loop:

model = EDRNN(input_size, output_size)
criterion = nn.L1Loss()

enc_optim = torch.optim.Adam(model.enc.parameters(), lr=0.001)
dec_optim = torch.optim.Adam(model.dec.parameters(), lr=0.001)

for epoch in range(0, 1000):
    for x, y in train_dl:
        model.zero_grad()
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        enc_optim.step()
        dec_optim.step()

print(loss)
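As a sanity check of my own, I was thinking of comparing the loss on one identical batch computed manually against the same batch pushed through the Learner (this assumes a learn object built as sketched near the top of the post):

# Compare the loss on one batch: manual path vs the Learner's model/loss_func
xb, yb = next(iter(train_dl))
device = next(learn.model.parameters()).device
with torch.no_grad():
    manual_loss = nn.L1Loss()(model(xb.to(device)), yb.to(device))
    learner_loss = learn.loss_func(learn.model(xb.to(device)), yb.to(device))
print(manual_loss.item(), learner_loss.item())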