Lesson 10. Weird behavior while training model with Adam optimizer

Hello everyone,

Could someone please help me understand the root cause of the strange training results I am getting with the model from lesson 10? The full source code is below.

from fastai.vision import *
from pathlib import Path
from IPython.core.debugger import set_trace
from fastai import datasets
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt
from torch import tensor
from torch import nn
from torch.nn import init
import torch.nn.functional as F
import numpy as np
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
import torch.optim as optim
from functools import partial

# MNIST_URL is not defined in this snippet; this is the value used in the course notebooks
MNIST_URL = 'http://deeplearning.net/data/mnist/mnist.pkl'

def get_data():
    path = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(path, 'rb') as f:
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
    return map(tensor, (x_train,y_train,x_valid,y_valid))

def normalize(x, m, s): return (x-m)/s

def normalize_to(train, valid):
    m,s = train.mean(),train.std()
    return normalize(train, m, s), normalize(valid, m, s)

nfs = [8,16,32,32]

class m_Dataset(Dataset):
    def __init__(self, x, y):
        self.x,self.y = x,y
        self.c = 10
    def __len__(self): return len(self.x)
    def __getitem__(self, i): return self.x[i],self.y[i]

def draw_get_data_bunch():
    x_train,y_train,x_valid,y_valid = get_data()
    x_train, x_valid = normalize_to(x_train, x_valid)
    x_train = mnist_resize(x_train)
    x_valid = mnist_resize(x_valid)
    train_ds,valid_ds = m_Dataset(x_train, y_train),m_Dataset(x_valid, y_valid)
    nh,bs = 50,512
    c = y_train.max().item()+1
    train_dl = DataLoader(train_ds, bs)
    valid_dl = DataLoader(valid_ds, bs)
    data = DataBunch(train_dl,valid_dl)

    return data

def mnist_resize(x):
    return x.view(-1,1,28,28)

def flatten(x): return x.view(x.shape[0], -1)

def alter_get_cnn_model():
    return nn.Sequential(
        nn.Conv2d( 1, 8, 5, padding=2,stride=2) ,nn.ReLU(), #14
        nn.Conv2d( 8,16, 3, padding=1,stride=2) ,nn.ReLU(), # 7
        nn.Conv2d(16,32, 3, padding=1,stride=2),nn.ReLU(), # 4
        nn.Conv2d(32,32, 3, padding=1,stride=2) ,nn.ReLU(), # 2
        nn.AdaptiveAvgPool2d(1),
        Lambda(flatten),
        nn.Linear(32,10)
    )

draw_data = draw_get_data_bunch()
draw_model = alter_get_cnn_model()

for l in draw_model:
    if isinstance(l, nn.Sequential):
        init.kaiming_normal_(l[0].weight)
        l[0].bias.data.zero_()
      

learn = Learner(draw_data, draw_model,loss_func = nn.CrossEntropyLoss(), opt_func=optim.SGD,metrics=[accuracy])
learn.fit(3, lr=0.6)

There are three training cases:

  1. Create the Learner with optim.SGD

learn = Learner(draw_data, draw_model,loss_func = nn.CrossEntropyLoss(), opt_func=optim.SGD,metrics=[accuracy])

The final accuracy is about 0.93, but the results are not reproducible: roughly every fifth run ends up stuck at an accuracy of about 0.1.
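
To check whether this instability is just initialization luck, I was thinking of pinning the RNG seeds before building the model, along these lines (my own sketch, not lesson code; the seed value is arbitrary and I am assuming that seeding random/numpy/torch is enough):

import random

# Sketch: fix the seeds so that weight init (and any shuffling) is repeatable
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed_everything()
draw_model = alter_get_cnn_model()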

  2. Create the Learner without passing opt_func:

learn = Learner(draw_data, draw_model,loss_func = nn.CrossEntropyLoss(),metrics=[accuracy])

By default this should use the Adam optimizer. However, the accuracy stays at about 0.1 and does not change from epoch to epoch; the model does not train at all.
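
As a quick sanity check (my own sketch, not lesson code), I guess something like the following would show whether the stuck model simply predicts the same class for every image, but I am not sure this is the right way to debug it:

# Sketch: look at the distribution of predicted classes on one validation batch
xb, yb = next(iter(draw_data.valid_dl))
with torch.no_grad():
    preds = draw_model(xb).argmax(dim=1)
print(preds.unique(return_counts=True))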

  3. After adding batch-norm layers to the model from case 2, the model trains and the final accuracy is about 0.96, with reproducible results:

def alter_get_cnn_model():
    return nn.Sequential(
        nn.Conv2d( 1, 8, 5, padding=2,stride=2) ,nn.BatchNorm2d(8),nn.ReLU(), #14
        nn.Conv2d( 8,16, 3, padding=1,stride=2),nn.BatchNorm2d(16),nn.ReLU(), # 7
        nn.Conv2d(16,32, 3, padding=1,stride=2),nn.BatchNorm2d(32),nn.ReLU(), # 4
        nn.Conv2d(32,32, 3, padding=1,stride=2),nn.BatchNorm2d(32),nn.ReLU(), # 2
        nn.AdaptiveAvgPool2d(1),
        Lambda(flatten),
        nn.Linear(32,10)
    )

I can’t understand why the model gets stuck in case 2, or how to find the root cause in similar situations. Which tools or visualizations can help me locate the problem? And how can I find the reason for the unstable results in case 1?
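
For example, would something like the following forward-hook telemetry (a rough sketch of my own, not the lesson code) be the right kind of tool for spotting where the activations die?

# Sketch: record the mean and std of each layer's output during training
act_means = [[] for _ in draw_model]
act_stds  = [[] for _ in draw_model]

def make_hook(i):
    def hook(module, inp, outp):
        act_means[i].append(outp.data.mean().item())
        act_stds[i].append(outp.data.std().item())
    return hook

handles = [layer.register_forward_hook(make_hook(i)) for i, layer in enumerate(draw_model)]
learn.fit(1, lr=0.6)
for h in handles: h.remove()

plt.plot(act_means[0]); plt.title('layer 0 activation mean per batch'); plt.show()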

Thanks,