Hi, so after some epochs (sometimes more or less directly after the beginning of training) I get the following error:
It does not happen for all models. For example, I never get this error with an efficientnet b2a from the timm library. I am using pretrained models and then fine-tuning them.
Any suggestions on what I can do differently? I will first update fastai and fastcore, but this is probably not the problem.
(I am using fastai’s `to_fp16()`.)
Edit: I am also using a PyTorch nightly version (torch-1.8.0.dev).
epoch train_loss valid_loss accuracy roc_auc_score time
0 1.654639 1.221462 0.476364 0.649210 02:23
epoch train_loss valid_loss accuracy roc_auc_score time
0 1.196939 1.325333 0.380000 0.565354 02:22
1 1.093605 1.095448 0.407273 0.627587 02:22
epoch train_loss valid_loss accuracy roc_auc_score time
0 1.595023 4.053803 0.298182 0.472930 02:21
epoch train_loss valid_loss accuracy roc_auc_score time
0 1.255504 02:21
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
in
57 learn = Learner(dls, model, loss_func=LabelSmoothingCrossEntropy(), metrics=[accuracy,RocAuc()]).to_fp16()
58 learn.freeze()
--> 59 learn.fine_tune(n_epochs, wd=wd, cbs=[mixup])
60
61 a = MetricsPerCase()
~/fastai2/lib/python3.7/site-packages/fastai/callback/schedule.py in fine_tune(self, epochs, base_lr, freeze_epochs, lr_mult, pct_start, div, **kwargs)
158 base_lr /= 2
159 self.unfreeze()
--> 160 self.fit_one_cycle(epochs, slice(base_lr/lr_mult, base_lr), pct_start=pct_start, div=div, **kwargs)
161
162 # Cell
~/fastai2/lib/python3.7/site-packages/fastai/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
110 scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
111 'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 112 self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
113
114 # Cell
~/fastai2/lib/python3.7/site-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
204 self.opt.set_hypers(lr=self.lr if lr is None else lr)
205 self.n_epoch = n_epoch
--> 206 self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
207
208 def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None
~/fastai2/lib/python3.7/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
153
154 def _with_events(self, f, event_type, ex, final=noop):
--> 155 try: self(f'before_{event_type}') ;f()
156 except ex: self(f'after_cancel_{event_type}')
157 finally: self(f'after_{event_type}') ;final()
~/fastai2/lib/python3.7/site-packages/fastai/learner.py in _do_fit(self)
195 for epoch in range(self.n_epoch):
196 self.epoch=epoch
--> 197 self._with_events(self._do_epoch, 'epoch', CancelEpochException)
198
199 def fit(self, n_epoch, lr=None, wd=None, cbs=None, reset_opt=False):
~/fastai2/lib/python3.7/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
153
154 def _with_events(self, f, event_type, ex, final=noop):
--> 155 try: self(f'before_{event_type}') ;f()
156 except ex: self(f'after_cancel_{event_type}')
157 finally: self(f'after_{event_type}') ;final()
~/fastai2/lib/python3.7/site-packages/fastai/learner.py in _do_epoch(self)
190 def _do_epoch(self):
191 self._do_epoch_train()
--> 192 self._do_epoch_validate()
193
194 def _do_fit(self):
~/fastai2/lib/python3.7/site-packages/fastai/learner.py in _do_epoch_validate(self, ds_idx, dl)
186 if dl is None: dl = self.dls[ds_idx]
187 self.dl = dl
--> 188 with torch.no_grad(): self._with_events(self.all_batches, 'validate', CancelValidException)
189
190 def _do_epoch(self):
~/fastai2/lib/python3.7/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
155 try: self(f'before_{event_type}') ;f()
156 except ex: self(f'after_cancel_{event_type}')
--> 157 finally: self(f'after_{event_type}') ;final()
158
159 def all_batches(self):
~/fastai2/lib/python3.7/site-packages/fastai/learner.py in __call__(self, event_name)
131 def ordered_cbs(self, event): return [cb for cb in sort_by_run(self.cbs) if hasattr(cb, event)]
132
--> 133 def __call__(self, event_name): L(event_name).map(self._call_one)
134
135 def _call_one(self, event_name):
~/fastai2/lib/python3.7/site-packages/fastcore/foundation.py in map(self, f, gen, *args, **kwargs)
152 def range(cls, a, b=None, step=None): return cls(range_of(a, b=b, step=step))
153
--> 154 def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))
155 def argwhere(self, f, negate=False, **kwargs): return self._new(argwhere(self, f, negate, **kwargs))
156 def filter(self, f=noop, negate=False, gen=False, **kwargs):
~/fastai2/lib/python3.7/site-packages/fastcore/basics.py in map_ex(iterable, f, gen, *args, **kwargs)
639 res = map(g, iterable)
640 if gen: return res
--> 641 return list(res)
642
643 # Cell
~/fastai2/lib/python3.7/site-packages/fastcore/basics.py in __call__(self, *args, **kwargs)
629 if isinstance(v,_Arg): kwargs[k] = args.pop(v.i)
630 fargs = [args[x.i] if isinstance(x, _Arg) else x for x in self.pargs] + args[self.maxi+1:]
--> 631 return self.func(*fargs, **kwargs)
632
633 # Cell
~/fastai2/lib/python3.7/site-packages/fastai/learner.py in _call_one(self, event_name)
135 def _call_one(self, event_name):
136 assert hasattr(event, event_name), event_name
--> 137 [cb(event_name) for cb in sort_by_run(self.cbs)]
138
139 def _bn_bias_state(self, with_bias): return norm_bias_params(self.model, with_bias).map(self.opt.state)
~/fastai2/lib/python3.7/site-packages/fastai/learner.py in <listcomp>(.0)
135 def _call_one(self, event_name):
136 assert hasattr(event, event_name), event_name
--> 137 [cb(event_name) for cb in sort_by_run(self.cbs)]
138
139 def _bn_bias_state(self, with_bias): return norm_bias_params(self.model, with_bias).map(self.opt.state)
~/fastai2/lib/python3.7/site-packages/fastai/callback/core.py in __call__(self, event_name)
42 (self.run_valid and not getattr(self, 'training', False)))
43 res = None
---> 44 if self.run and _run: res = getattr(self, event_name, noop)()
45 if event_name=='after_fit': self.run=True #Reset self.run to True at each end of fit
46 return res
~/fastai2/lib/python3.7/site-packages/fastai/learner.py in after_validate(self)
471 def before_validate(self): self._valid_mets.map(Self.reset())
472 def after_train (self): self.log += self._train_mets.map(_maybe_item)
--> 473 def after_validate(self): self.log += self._valid_mets.map(_maybe_item)
474 def after_cancel_train(self): self.cancel_train = True
475 def after_cancel_validate(self): self.cancel_valid = True
~/fastai2/lib/python3.7/site-packages/fastcore/foundation.py in map(self, f, gen, *args, **kwargs)
152 def range(cls, a, b=None, step=None): return cls(range_of(a, b=b, step=step))
153
--> 154 def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))
155 def argwhere(self, f, negate=False, **kwargs): return self._new(argwhere(self, f, negate, **kwargs))
156 def filter(self, f=noop, negate=False, gen=False, **kwargs):
~/fastai2/lib/python3.7/site-packages/fastcore/basics.py in map_ex(iterable, f, gen, *args, **kwargs)
639 res = map(g, iterable)
640 if gen: return res
--> 641 return list(res)
642
643 # Cell
~/fastai2/lib/python3.7/site-packages/fastcore/basics.py in __call__(self, *args, **kwargs)
629 if isinstance(v,_Arg): kwargs[k] = args.pop(v.i)
630 fargs = [args[x.i] if isinstance(x, _Arg) else x for x in self.pargs] + args[self.maxi+1:]
--> 631 return self.func(*fargs, **kwargs)
632
633 # Cell
~/fastai2/lib/python3.7/site-packages/fastai/learner.py in _maybe_item(t)
425 # Cell
426 def _maybe_item(t):
--> 427 t = t.value
428 try: return t.item()
429 except: return t
~/fastai2/lib/python3.7/site-packages/fastai/metrics.py in value(self)
67 preds,targs = torch.cat(self.preds),torch.cat(self.targs)
68 if self.to_np: preds,targs = preds.numpy(),targs.numpy()
---> 69 return self.func(targs, preds, **self.kwargs) if self.invert_args else self.func(preds, targs, **self.kwargs)
70
71 @property
~/fastai2/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~/fastai2/lib/python3.7/site-packages/sklearn/metrics/_ranking.py in roc_auc_score(y_true, y_score, average, sample_weight, max_fpr, multi_class, labels)
522 y_type = type_of_target(y_true)
523 y_true = check_array(y_true, ensure_2d=False, dtype=None)
--> 524 y_score = check_array(y_score, ensure_2d=False)
525
526 if y_type == "multiclass" or (y_type == "binary" and
~/fastai2/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~/fastai2/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
662 if force_all_finite:
663 _assert_all_finite(array,
--> 664 allow_nan=force_all_finite == 'allow-nan')
665
666 if ensure_min_samples > 0:
~/fastai2/lib/python3.7/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype)
104 msg_err.format
105 (type_err,
--> 106 msg_dtype if msg_dtype is not None else X.dtype)
107 )
108 # for object dtype data, we only check for NaNs (GH-13254)
ValueError: Input contains NaN, infinity or a value too large for dtype('float32').
Edit: I switched to the built “timm learner” (https://github.com/walkwithfastai/walkwithfastai.github.io/blob/master/nbs/02_vision.external.timm.ipynb),
and I also updated fastai and fastcore, and everything is working now!