NaN error with timm library and fp16 training (seresnet152d_320)

Hi, after some epochs (sometimes more or less directly after the start of training) I get the error below:

It does not happen with all models; for example, I never get this error with an efficientnet_b2a from the timm library. I am using pretrained models and then fine-tuning them.

Any suggestions on what I could do differently? I will update fastai and fastcore first, but that is probably not the problem?
(I am using fastai's to_fp16().)

Edit: I am also using a PyTorch nightly build, torch-1.8.0.dev.
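
For context, this is roughly how the Learner is set up (a minimal sketch reconstructed from the code in the traceback below; dls, n_epochs, wd and mixup stand in for my actual data pipeline and hyperparameters):

import timm
from fastai.vision.all import *

# pretrained timm backbone, fine-tuned on my data
model = timm.create_model('seresnet152d_320', pretrained=True, num_classes=dls.c)

learn = Learner(dls, model, loss_func=LabelSmoothingCrossEntropy(),
                metrics=[accuracy, RocAuc()]).to_fp16()  # mixed precision
learn.freeze()
learn.fine_tune(n_epochs, wd=wd, cbs=[mixup])  # mixup = MixUp()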

epoch  train_loss  valid_loss  accuracy  roc_auc_score  time
0      1.654639    1.221462    0.476364  0.649210       02:23

epoch  train_loss  valid_loss  accuracy  roc_auc_score  time
0      1.196939    1.325333    0.380000  0.565354       02:22
1      1.093605    1.095448    0.407273  0.627587       02:22

epoch  train_loss  valid_loss  accuracy  roc_auc_score  time
0      1.595023    4.053803    0.298182  0.472930       02:21

epoch  train_loss  valid_loss  accuracy  roc_auc_score  time
0      1.255504                                         02:21
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-...> in <module>
     57 learn = Learner(dls, model, loss_func=LabelSmoothingCrossEntropy(), metrics=[accuracy,RocAuc()]).to_fp16()
     58 learn.freeze()
---> 59 learn.fine_tune(n_epochs, wd=wd, cbs=[mixup])
     60
     61 a = MetricsPerCase()

~/fastai2/lib/python3.7/site-packages/fastai/callback/schedule.py in fine_tune(self, epochs, base_lr, freeze_epochs, lr_mult, pct_start, div, **kwargs)
    158     base_lr /= 2
    159     self.unfreeze()
--> 160     self.fit_one_cycle(epochs, slice(base_lr/lr_mult, base_lr), pct_start=pct_start, div=div, **kwargs)
    161 
    162 # Cell

~/fastai2/lib/python3.7/site-packages/fastai/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
    110     scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
    111               'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 112     self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
    113 
    114 # Cell

~/fastai2/lib/python3.7/site-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
    204             self.opt.set_hypers(lr=self.lr if lr is None else lr)
    205             self.n_epoch = n_epoch
--> 206             self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
    207 
    208     def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

~/fastai2/lib/python3.7/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~/fastai2/lib/python3.7/site-packages/fastai/learner.py in _do_fit(self)
    195         for epoch in range(self.n_epoch):
    196             self.epoch=epoch
--> 197             self._with_events(self._do_epoch, 'epoch', CancelEpochException)
    198 
    199     def fit(self, n_epoch, lr=None, wd=None, cbs=None, reset_opt=False):

~/fastai2/lib/python3.7/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

~/fastai2/lib/python3.7/site-packages/fastai/learner.py in _do_epoch(self)
    190     def _do_epoch(self):
    191         self._do_epoch_train()
--> 192         self._do_epoch_validate()
    193 
    194     def _do_fit(self):

~/fastai2/lib/python3.7/site-packages/fastai/learner.py in _do_epoch_validate(self, ds_idx, dl)
    186         if dl is None: dl = self.dls[ds_idx]
    187         self.dl = dl
--> 188         with torch.no_grad(): self._with_events(self.all_batches, 'validate', CancelValidException)
    189 
    190     def _do_epoch(self):

~/fastai2/lib/python3.7/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
--> 157         finally:   self(f'after_{event_type}')        ;final()
    158 
    159     def all_batches(self):

~/fastai2/lib/python3.7/site-packages/fastai/learner.py in __call__(self, event_name)
    131     def ordered_cbs(self, event): return [cb for cb in sort_by_run(self.cbs) if hasattr(cb, event)]
    132 
--> 133     def __call__(self, event_name): L(event_name).map(self._call_one)
    134 
    135     def _call_one(self, event_name):

~/fastai2/lib/python3.7/site-packages/fastcore/foundation.py in map(self, f, gen, *args, **kwargs)
    152     def range(cls, a, b=None, step=None): return cls(range_of(a, b=b, step=step))
    153 
--> 154     def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))
    155     def argwhere(self, f, negate=False, **kwargs): return self._new(argwhere(self, f, negate, **kwargs))
    156     def filter(self, f=noop, negate=False, gen=False, **kwargs):

~/fastai2/lib/python3.7/site-packages/fastcore/basics.py in map_ex(iterable, f, gen, *args, **kwargs)
    639     res = map(g, iterable)
    640     if gen: return res
--> 641     return list(res)
    642 
    643 # Cell

~/fastai2/lib/python3.7/site-packages/fastcore/basics.py in __call__(self, *args, **kwargs)
    629             if isinstance(v,_Arg): kwargs[k] = args.pop(v.i)
    630         fargs = [args[x.i] if isinstance(x, _Arg) else x for x in self.pargs] + args[self.maxi+1:]
--> 631         return self.func(*fargs, **kwargs)
    632 
    633 # Cell

~/fastai2/lib/python3.7/site-packages/fastai/learner.py in _call_one(self, event_name)
    135     def _call_one(self, event_name):
    136         assert hasattr(event, event_name), event_name
--> 137         [cb(event_name) for cb in sort_by_run(self.cbs)]
    138 
    139     def _bn_bias_state(self, with_bias): return norm_bias_params(self.model, with_bias).map(self.opt.state)

~/fastai2/lib/python3.7/site-packages/fastai/learner.py in <listcomp>(.0)
    135     def _call_one(self, event_name):
    136         assert hasattr(event, event_name), event_name
--> 137         [cb(event_name) for cb in sort_by_run(self.cbs)]
    138 
    139     def _bn_bias_state(self, with_bias): return norm_bias_params(self.model, with_bias).map(self.opt.state)

~/fastai2/lib/python3.7/site-packages/fastai/callback/core.py in __call__(self, event_name)
     42                (self.run_valid and not getattr(self, 'training', False)))
     43         res = None
---> 44         if self.run and _run: res = getattr(self, event_name, noop)()
     45         if event_name=='after_fit': self.run=True #Reset self.run to True at each end of fit
     46         return res

~/fastai2/lib/python3.7/site-packages/fastai/learner.py in after_validate(self)
    471     def before_validate(self): self._valid_mets.map(Self.reset())
    472     def after_train   (self): self.log += self._train_mets.map(_maybe_item)
--> 473     def after_validate(self): self.log += self._valid_mets.map(_maybe_item)
    474     def after_cancel_train(self):    self.cancel_train = True
    475     def after_cancel_validate(self): self.cancel_valid = True

~/fastai2/lib/python3.7/site-packages/fastcore/foundation.py in map(self, f, gen, *args, **kwargs)
    152     def range(cls, a, b=None, step=None): return cls(range_of(a, b=b, step=step))
    153 
--> 154     def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))
    155     def argwhere(self, f, negate=False, **kwargs): return self._new(argwhere(self, f, negate, **kwargs))
    156     def filter(self, f=noop, negate=False, gen=False, **kwargs):

~/fastai2/lib/python3.7/site-packages/fastcore/basics.py in map_ex(iterable, f, gen, *args, **kwargs)
    639     res = map(g, iterable)
    640     if gen: return res
--> 641     return list(res)
    642 
    643 # Cell

~/fastai2/lib/python3.7/site-packages/fastcore/basics.py in __call__(self, *args, **kwargs)
    629             if isinstance(v,_Arg): kwargs[k] = args.pop(v.i)
    630         fargs = [args[x.i] if isinstance(x, _Arg) else x for x in self.pargs] + args[self.maxi+1:]
--> 631         return self.func(*fargs, **kwargs)
    632 
    633 # Cell

~/fastai2/lib/python3.7/site-packages/fastai/learner.py in _maybe_item(t)
    425 # Cell
    426 def _maybe_item(t):
--> 427     t = t.value
    428     try: return t.item()
    429     except: return t

~/fastai2/lib/python3.7/site-packages/fastai/metrics.py in value(self)
     67         preds,targs = torch.cat(self.preds),torch.cat(self.targs)
     68         if self.to_np: preds,targs = preds.numpy(),targs.numpy()
---> 69         return self.func(targs, preds, **self.kwargs) if self.invert_args else self.func(preds, targs, **self.kwargs)
     70 
     71     @property

~/fastai2/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64 
     65             # extra_args > 0

~/fastai2/lib/python3.7/site-packages/sklearn/metrics/_ranking.py in roc_auc_score(y_true, y_score, average, sample_weight, max_fpr, multi_class, labels)
    522     y_type = type_of_target(y_true)
    523     y_true = check_array(y_true, ensure_2d=False, dtype=None)
--> 524     y_score = check_array(y_score, ensure_2d=False)
    525 
    526     if y_type == "multiclass" or (y_type == "binary" and

~/fastai2/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64 
     65             # extra_args > 0

~/fastai2/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
    662         if force_all_finite:
    663             _assert_all_finite(array,
--> 664                                allow_nan=force_all_finite == 'allow-nan')
    665 
    666     if ensure_min_samples > 0:

~/fastai2/lib/python3.7/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype)
    104                     msg_err.format
    105                     (type_err,
--> 106                      msg_dtype if msg_dtype is not None else X.dtype)
    107             )
    108     # for object dtype data, we only check for NaNs (GH-13254)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').
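
As far as I can tell, the ValueError itself is raised by sklearn's input validation inside roc_auc_score, not by fastai: a single NaN or inf among the accumulated predictions (which fp16 overflow can easily produce) is enough to fail the whole validation pass. A minimal reproduction of just that last step:

import numpy as np
from sklearn.metrics import roc_auc_score

y_true  = np.array([0, 1, 1, 0])
y_score = np.array([0.2, np.nan, 0.9, 0.1], dtype=np.float32)  # one NaN prediction

roc_auc_score(y_true, y_score)
# ValueError: Input contains NaN, infinity or a value too large for dtype('float32').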

Edit: I switched to the “timm learner” built in the walkwithfastai notebook: https://github.com/walkwithfastai/walkwithfastai.github.io/blob/master/nbs/02_vision.external.timm.ipynb

I also updated fastai and fastcore, and everything is working now!
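
In case it helps anyone, the switch looked roughly like this (a sketch; I am assuming the timm_learner helper exported by that notebook, so check it for the exact name and signature):

from fastai.vision.all import *
from wwf.vision.timm import timm_learner

# builds the timm model and the fastai Learner in one step
learn = timm_learner(dls, 'seresnet152d_320',
                     loss_func=LabelSmoothingCrossEntropy(),
                     metrics=[accuracy, RocAuc()]).to_fp16()
learn.fine_tune(n_epochs, wd=wd, cbs=[mixup])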

Your learning rate is probably too high.
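
If that's the case, running the LR finder and passing a smaller base_lr to fine_tune should help, e.g. (a sketch; the right value depends on the model and data):

learn.lr_find()  # plot loss vs. learning rate and pick a value before the loss blows up

# fine_tune's default base_lr is 2e-3; try an order of magnitude lower
learn.fine_tune(n_epochs, base_lr=2e-4, wd=wd, cbs=[mixup])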