Learn.lr_find() and learn.fit_one_cycle kill the kernel

MN.Pham · July 29, 2021, 6:34pm

Hello everyone,

I am trying to build myself a simple network that can count people trained with the mall dataset. The in put is a 640x480 photo and output is a same size density map.

My code is as following

def block(ni, nf, ks=3): return ConvLayer(ni, nf, ks, stride=1)
def get_model():
    return nn.Sequential(
    block(3, 32),
    nn.MaxPool2d(2),
    
    block(32, 64),
    nn.MaxPool2d(2),
        
    block(64, 128),
    nn.MaxPool2d(2),
        
    block(128, 512),
        
    nn.Upsample(scale_factor=2),
    block(512, 128),
        
    nn.Upsample(scale_factor=2),
    block(128, 64),
    
    nn.Upsample(scale_factor=2),
    block(64, 1),
    )

And this is my loss function:

def EuclidLoss(input, target):
    return (input-target).square().mean().item()

The summary shows all the information

but I could not train the model. I tried several times but the the error message always showed up.

The kernel appears to have died. It will restart automatically

The terminal windows showed nothing so I have no clue what happened.

I am running the notebook on my TUF laptop with an AMD chip and GTX 1650 GPU. Does some one know what is the reason and how to fix this? Many thank.

JackByte · July 29, 2021, 6:39pm

I googled it. Is it correct that you have a 4GB GPU? Then probably a batch size of 64 with the pretty big input size is making you run out of resources. Try to start with batch size 8 and see if you can increase it.

MN.Pham · July 29, 2021, 6:56pm

Hi Jack,

thank you for the reply. I changed the bs to 3, it did not fix the problem but I have more information now. The error was shown as follow:

AttributeError                            Traceback (most recent call last)
<ipython-input-28-d81c6bd29d71> in <module>
----> 1 learn.lr_find()

~/.local/lib/python3.8/site-packages/fastai/callback/schedule.py in lr_find(self, start_lr, end_lr, num_it, stop_div, show_plot, suggestions)
    220     n_epoch = num_it//len(self.dls.train) + 1
    221     cb=LRFinder(start_lr=start_lr, end_lr=end_lr, num_it=num_it, stop_div=stop_div)
--> 222     with self.no_logging(): self.fit(n_epoch, cbs=cb)
    223     if show_plot: self.recorder.plot_lr_find()
    224     if suggestions:

~/.local/lib/python3.8/site-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
    209             self.opt.set_hypers(lr=self.lr if lr is None else lr)
    210             self.n_epoch = n_epoch
--> 211             self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
    212 
    213     def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    158 
    159     def _with_events(self, f, event_type, ex, final=noop):
--> 160         try: self(f'before_{event_type}');  f()
    161         except ex: self(f'after_cancel_{event_type}')
    162         self(f'after_{event_type}');  final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_fit(self)
    200         for epoch in range(self.n_epoch):
    201             self.epoch=epoch
--> 202             self._with_events(self._do_epoch, 'epoch', CancelEpochException)
    203 
    204     def fit(self, n_epoch, lr=None, wd=None, cbs=None, reset_opt=False):

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    158 
    159     def _with_events(self, f, event_type, ex, final=noop):
--> 160         try: self(f'before_{event_type}');  f()
    161         except ex: self(f'after_cancel_{event_type}')
    162         self(f'after_{event_type}');  final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_epoch(self)
    194 
    195     def _do_epoch(self):
--> 196         self._do_epoch_train()
    197         self._do_epoch_validate()
    198 

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_epoch_train(self)
    186     def _do_epoch_train(self):
    187         self.dl = self.dls.train
--> 188         self._with_events(self.all_batches, 'train', CancelTrainException)
    189 
    190     def _do_epoch_validate(self, ds_idx=1, dl=None):

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    158 
    159     def _with_events(self, f, event_type, ex, final=noop):
--> 160         try: self(f'before_{event_type}');  f()
    161         except ex: self(f'after_cancel_{event_type}')
    162         self(f'after_{event_type}');  final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in all_batches(self)
    164     def all_batches(self):
    165         self.n_iter = len(self.dl)
--> 166         for o in enumerate(self.dl): self.one_batch(*o)
    167 
    168     def _do_one_batch(self):

~/.local/lib/python3.8/site-packages/fastai/learner.py in one_batch(self, i, b)
    182         self.iter = i
    183         self._split(b)
--> 184         self._with_events(self._do_one_batch, 'batch', CancelBatchException)
    185 
    186     def _do_epoch_train(self):

~/.local/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    158 
    159     def _with_events(self, f, event_type, ex, final=noop):
--> 160         try: self(f'before_{event_type}');  f()
    161         except ex: self(f'after_cancel_{event_type}')
    162         self(f'after_{event_type}');  final()

~/.local/lib/python3.8/site-packages/fastai/learner.py in _do_one_batch(self)
    171         if len(self.yb):
    172             self.loss_grad = self.loss_func(self.pred, *self.yb)
--> 173             self.loss = self.loss_grad.clone()
    174         self('after_loss')
    175         if not self.training or not len(self.yb): return

AttributeError: 'float' object has no attribute 'clone'

idraja · July 29, 2021, 6:58pm

You could also try a Resize transform before moving things to the GPU so that you might fit a larger batch with your memory constraints.