Issue with loading a learner

Feras · April 17, 2021, 5:45pm

I’ve trained a model using mixed precision and saved the learner with learn.save("model").

Later, when I loaded the learner with learn.load("model") to continue training, I got the following error:

RuntimeError: grid_sampler(): expected input and grid to have same dtype, but input has float and grid has c10::Half

Before loading, I rebuilt the learner exactly as it was before training.

Any idea what I am missing here?

the full traceback:

RuntimeError                              Traceback (most recent call last)
<ipython-input-36-354de1b5d9f4> in <module>()
----> 1 learn.fit_flat_cos(50, 1e-3, wd=1e-2, pct_start=0.1,div_final=10)

27 frames
/usr/local/lib/python3.7/dist-packages/fastai/callback/schedule.py in fit_flat_cos(self, n_epoch, lr, div_final, pct_start, wd, cbs, reset_opt)
    133     lr = np.array([h['lr'] for h in self.opt.hypers])
    134     scheds = {'lr': combined_cos(pct_start, lr, lr, lr/div_final)}
--> 135     self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
    136 
    137 # Cell

/usr/local/lib/python3.7/dist-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
    210             self.opt.set_hypers(lr=self.lr if lr is None else lr)
    211             self.n_epoch = n_epoch
--> 212             self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
    213 
    214     def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

/usr/local/lib/python3.7/dist-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    158 
    159     def _with_events(self, f, event_type, ex, final=noop):
--> 160         try: self(f'before_{event_type}');  f()
    161         except ex: self(f'after_cancel_{event_type}')
    162         self(f'after_{event_type}');  final()

/usr/local/lib/python3.7/dist-packages/fastai/learner.py in _do_fit(self)
    201         for epoch in range(self.n_epoch):
    202             self.epoch=epoch
--> 203             self._with_events(self._do_epoch, 'epoch', CancelEpochException)
    204 
    205     def fit(self, n_epoch, lr=None, wd=None, cbs=None, reset_opt=False):

/usr/local/lib/python3.7/dist-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    158 
    159     def _with_events(self, f, event_type, ex, final=noop):
--> 160         try: self(f'before_{event_type}');  f()
    161         except ex: self(f'after_cancel_{event_type}')
    162         self(f'after_{event_type}');  final()

/usr/local/lib/python3.7/dist-packages/fastai/learner.py in _do_epoch(self)
    196     def _do_epoch(self):
    197         self._do_epoch_train()
--> 198         self._do_epoch_validate()
    199 
    200     def _do_fit(self):

/usr/local/lib/python3.7/dist-packages/fastai/learner.py in _do_epoch_validate(self, ds_idx, dl)
    192         if dl is None: dl = self.dls[ds_idx]
    193         self.dl = dl
--> 194         with torch.no_grad(): self._with_events(self.all_batches, 'validate', CancelValidException)
    195 
    196     def _do_epoch(self):

/usr/local/lib/python3.7/dist-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    158 
    159     def _with_events(self, f, event_type, ex, final=noop):
--> 160         try: self(f'before_{event_type}');  f()
    161         except ex: self(f'after_cancel_{event_type}')
    162         self(f'after_{event_type}');  final()

/usr/local/lib/python3.7/dist-packages/fastai/learner.py in all_batches(self)
    164     def all_batches(self):
    165         self.n_iter = len(self.dl)
--> 166         for o in enumerate(self.dl): self.one_batch(*o)
    167 
    168     def _do_one_batch(self):

/usr/local/lib/python3.7/dist-packages/fastai/data/load.py in __iter__(self)
    111             if self.device is not None and multiprocessing.get_start_method().lower() == "fork":
    112                 b = to_device(b, self.device)
--> 113             yield self.after_batch(b)
    114         self.after_iter()
    115         if hasattr(self, 'it'): del(self.it)

/usr/local/lib/python3.7/dist-packages/fastcore/transform.py in __call__(self, o)
    196         self.fs.append(t)
    197 
--> 198     def __call__(self, o): return compose_tfms(o, tfms=self.fs, split_idx=self.split_idx)
    199     def __repr__(self): return f"Pipeline: {' -> '.join([f.name for f in self.fs if f.name != 'noop'])}"
    200     def __getitem__(self,i): return self.fs[i]

/usr/local/lib/python3.7/dist-packages/fastcore/transform.py in compose_tfms(x, tfms, is_enc, reverse, **kwargs)
    148     for f in tfms:
    149         if not is_enc: f = f.decode
--> 150         x = f(x, **kwargs)
    151     return x
    152 

/usr/local/lib/python3.7/dist-packages/fastai/vision/augment.py in __call__(self, b, split_idx, **kwargs)
     33     def __call__(self, b, split_idx=None, **kwargs):
     34         self.before_call(b, split_idx=split_idx)
---> 35         return super().__call__(b, split_idx=split_idx, **kwargs) if self.do else b
     36 
     37 # Cell

/usr/local/lib/python3.7/dist-packages/fastcore/transform.py in __call__(self, x, **kwargs)
     71     @property
     72     def name(self): return getattr(self, '_name', _get_name(self))
---> 73     def __call__(self, x, **kwargs): return self._call('encodes', x, **kwargs)
     74     def decode  (self, x, **kwargs): return self._call('decodes', x, **kwargs)
     75     def __repr__(self): return f'{self.name}:\nencodes: {self.encodes}decodes: {self.decodes}'

/usr/local/lib/python3.7/dist-packages/fastcore/transform.py in _call(self, fn, x, split_idx, **kwargs)
     81     def _call(self, fn, x, split_idx=None, **kwargs):
     82         if split_idx!=self.split_idx and self.split_idx is not None: return x
---> 83         return self._do_call(getattr(self, fn), x, **kwargs)
     84 
     85     def _do_call(self, f, x, **kwargs):

/usr/local/lib/python3.7/dist-packages/fastcore/transform.py in _do_call(self, f, x, **kwargs)
     88             ret = f.returns(x) if hasattr(f,'returns') else None
     89             return retain_type(f(x, **kwargs), x, ret)
---> 90         res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
     91         return retain_type(res, x)
     92 

/usr/local/lib/python3.7/dist-packages/fastcore/transform.py in <genexpr>(.0)
     88             ret = f.returns(x) if hasattr(f,'returns') else None
     89             return retain_type(f(x, **kwargs), x, ret)
---> 90         res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
     91         return retain_type(res, x)
     92 

/usr/local/lib/python3.7/dist-packages/fastcore/transform.py in _do_call(self, f, x, **kwargs)
     87             if f is None: return x
     88             ret = f.returns(x) if hasattr(f,'returns') else None
---> 89             return retain_type(f(x, **kwargs), x, ret)
     90         res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
     91         return retain_type(res, x)

/usr/local/lib/python3.7/dist-packages/fastcore/dispatch.py in __call__(self, *args, **kwargs)
    116         elif self.inst is not None: f = MethodType(f, self.inst)
    117         elif self.owner is not None: f = MethodType(f, self.owner)
--> 118         return f(*args, **kwargs)
    119 
    120     def __get__(self, inst, owner):

/usr/local/lib/python3.7/dist-packages/fastai/vision/augment.py in encodes(self, x)
    397         return x.affine_coord(self.mat, coord_func, sz=self.size, mode=mode, pad_mode=self.pad_mode, align_corners=self.align_corners)
    398 
--> 399     def encodes(self, x:TensorImage): return self._encode(x, self.mode)
    400     def encodes(self, x:TensorMask):  return self._encode(x, self.mode_mask)
    401     def encodes(self, x:(TensorPoint, TensorBBox)): return self._encode(x, self.mode, reverse=True)

/usr/local/lib/python3.7/dist-packages/fastai/vision/augment.py in _encode(self, x, mode, reverse)
    395     def _encode(self, x, mode, reverse=False):
    396         coord_func = None if len(self.coord_fs)==0 or self.split_idx else partial(compose_tfms, tfms=self.coord_fs, reverse=reverse)
--> 397         return x.affine_coord(self.mat, coord_func, sz=self.size, mode=mode, pad_mode=self.pad_mode, align_corners=self.align_corners)
    398 
    399     def encodes(self, x:TensorImage): return self._encode(x, self.mode)

/usr/local/lib/python3.7/dist-packages/fastai/vision/augment.py in affine_coord(x, mat, coord_tfm, sz, mode, pad_mode, align_corners)
    319     coords = affine_grid(mat, x.shape[:2] + size, align_corners=align_corners)
    320     if coord_tfm is not None: coords = coord_tfm(coords)
--> 321     return TensorImage(_grid_sample(x, coords, mode=mode, padding_mode=pad_mode, align_corners=align_corners))
    322 
    323 @patch

/usr/local/lib/python3.7/dist-packages/fastai/vision/augment.py in _grid_sample(x, coords, mode, padding_mode, align_corners)
    304             else:
    305                 x = F.interpolate(x, scale_factor=1/d, mode='area')
--> 306     return F.grid_sample(x, coords, mode=mode, padding_mode=padding_mode, align_corners=align_corners)
    307 
    308 # Cell

/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in grid_sample(input, grid, mode, padding_mode, align_corners)
   3361             return handle_torch_function(
   3362                 grid_sample, tens_ops, input, grid, mode=mode, padding_mode=padding_mode,
-> 3363                 align_corners=align_corners)
   3364     if mode != 'bilinear' and mode != 'nearest':
   3365         raise ValueError("nn.functional.grid_sample(): expected mode to be "

/usr/local/lib/python3.7/dist-packages/torch/overrides.py in handle_torch_function(public_api, relevant_args, *args, **kwargs)
   1058         # Use `public_api` instead of `implementation` so __torch_function__
   1059         # implementations can do equality/identity comparisons.
-> 1060         result = overloaded_arg.__torch_function__(public_api, types, args, kwargs)
   1061 
   1062         if result is not NotImplemented:

/usr/local/lib/python3.7/dist-packages/fastai/torch_core.py in __torch_function__(self, func, types, args, kwargs)
    327         convert=False
    328         if _torch_handled(args, self._opt, func): convert,types = type(self),(torch.Tensor,)
--> 329         res = super().__torch_function__(func, types, args=args, kwargs=kwargs)
    330         if convert: res = convert(res)
    331         if isinstance(res, TensorBase): res.set_meta(self, as_copy=True)

/usr/local/lib/python3.7/dist-packages/torch/tensor.py in __torch_function__(cls, func, types, args, kwargs)
    993 
    994         with _C.DisableTorchFunction():
--> 995             ret = func(*args, **kwargs)
    996             return _convert(ret, cls)
    997 

/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in grid_sample(input, grid, mode, padding_mode, align_corners)
   3389         align_corners = False
   3390 
-> 3391     return torch.grid_sampler(input, grid, mode_enum, padding_mode_enum, align_corners)
   3392 
   3393 

RuntimeError: grid_sampler(): expected input and grid to have same dtype, but input has float and grid has c10::Half

muellerzr · April 17, 2021, 5:47pm

Did you use mixed precision at all?

Feras · April 17, 2021, 5:47pm

yup, I did

muellerzr · April 17, 2021, 6:19pm

You need to either save the model as non-fp16 to load it back in, or call learn.to_fp16(). (Hence the Half error.)

Feras · April 17, 2021, 6:32pm

Thank you Zach.

I tried calling learn.to_fp16() before loading, but it didn’t help. I also tried converting it back to 32-bit after loading. Nothing seems to work.

I guess I have to retrain and save as 32-bit.