Hello.
@muellerzr Congrats on the https://github.com/muellerzr/Practical-Deep-Learning-for-Coders-2.0/blob/master/Computer%20Vision/06_Hybridizing_Models.ipynb notebook!
I have a question related to model hybridisation (which I already touched on in a previous post).
I want to create a DynamicUnet model that is able to handle 2 inputs (image + image/mask).
So I’m defining a DataBlock that loads the CamVid data and has a secondary input, which is the mask we want to predict. This doesn’t make much sense, since it is equivalent to “cheating” (we give the target as an input), but let’s ignore that for now.
# CamVid DataBlock declared with three blocks (image, mask, mask).
# `n_inp=2` is passed so that the first two blocks count as inputs.
# NOTE(review): only `get_y` is supplied here -- confirm which file the
# second (mask) input block is actually built from.
camvid = DataBlock(
    blocks=(ImageBlock, MaskBlock(codes), MaskBlock(codes)),
    get_items=get_image_files,
    splitter=ListSplitter(valid_fnames),
    # Label path: same stem as the image with a `_P` suffix, under `labels/`.
    get_y=lambda o: path/'labels'/f'{o.stem}_P{o.suffix}',
    batch_tfms=[
        *aug_transforms(size=(360,480)),
        Normalize.from_stats(*imagenet_stats),
    ],
    n_inp=2,
)
Then I’m creating a class inheriting from SequentialEx that can handle 2 inputs. The second input is ignored for the moment.
class CustomSequentialEx(SequentialEx):
    "`SequentialEx` subclass whose `forward` takes a second input `x2` (not used yet)."

    def forward(self, x, x2):
        """Feed `x` through every layer in `self.layers` and return the final output.

        `x2` is accepted so the model can be called with two inputs; it is
        currently ignored.
        """
        current = x
        for layer in self.layers:
            # Attach the original input to the tensor handed to the layer.
            current.orig = x
            out = layer(current)
            # We have to remove `orig` afterwards to avoid hanging refs and
            # therefore memory leaks.
            current.orig = None
            current = out
        return current
I create a new DynamicUnetBis class that inherits from CustomSequentialEx and whose implementation is otherwise identical to DynamicUnet:
class DynamicUnetBis(CustomSequentialEx):
    ...  # the exact implementation from DynamicUnet
And then something similar to what you did:
def custom_unet(dls, arch, loss_func=None, pretrained=True, cut=None, splitter=None, config=None, n_in=3, n_out=None,
                normalize=True, **kwargs):
    """Build a unet learner from `dls` and `arch`, using `DynamicUnetBis` as the model.

    The body comes from `create_body(arch, n_in, pretrained, cut)`, where `cut`
    falls back to the architecture's metadata entry. The size handed to the
    model is the last two dimensions of the first element of one batch from
    `dls`. If `n_out` is not given it is obtained with `get_c(dls)`; a falsy
    result trips the assertion. Remaining `kwargs` are forwarded to `Learner`.
    The learner is frozen when `pretrained` is true.
    """
    unet_cfg = unet_config() if config is None else config
    arch_meta = model_meta.get(arch, _default_meta)
    encoder = create_body(arch, n_in, pretrained, ifnone(cut, arch_meta['cut']))
    # Spatial size is read off an actual batch rather than passed in.
    img_size = dls.one_batch()[0].shape[-2:]
    if n_out is None:
        n_out = get_c(dls)
    assert n_out, "`n_out` is not defined, and could not be infered from data, set `dls.c` or pass `n_out`"
    if normalize:
        _add_norm(dls, arch_meta, pretrained)
    # The custom two-input unet replaces the stock model here.
    model = DynamicUnetBis(encoder, n_out, img_size, **unet_cfg)
    learn = Learner(dls, model, loss_func=loss_func, splitter=ifnone(splitter, arch_meta['split']), **kwargs)
    if pretrained:
        learn.freeze()
    return learn
Then I create a custom_unet learner instance:
# Adam with explicit `lr` and `wd`; the `eps` override is left commented out.
opt_func = partial(Adam, lr=3e-3, wd=0.01)  # , eps=1e-8)

# Build the learner on a non-pretrained resnet18.
learn = custom_unet(
    dls,
    resnet18,
    pretrained=False,
    opt_func=opt_func,
    path=path,
    metrics=acc_camvid,
    config=unet_config(norm_type=None),
    wd_bn_bias=True,
)
When I’m calling:
# Grab one batch and call the model directly; the first batch element is
# passed for both `x` and `x2`.
bs = dls.one_batch()
output = learn.model(bs[0], bs[0])
# Inspect the shape of the model output.
output.shape
I get the output:
torch.Size([2, 32, 360, 480]).
Everything good till now !
But then when I call:
learn.lr_find()
I get this lovely error. Any idea why this is happening? @sgugger
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
~/workspace/fastai2/fastai2/learner.py in one_batch(self, i, b)
136 if not self.training: return
--> 137 self.loss.backward(); self('after_backward')
138 self.opt.step(); self('after_step')
/opt/conda/lib/python3.6/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
194 """
--> 195 torch.autograd.backward(self, gradient, retain_graph, create_graph)
196
/opt/conda/lib/python3.6/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
98 tensors, grad_tensors, retain_graph, create_graph,
---> 99 allow_unreachable=True) # allow_unreachable flag
100
RuntimeError: cuda runtime error (710) : device-side assert triggered at /pytorch/aten/src/THC/generic/THCTensorMath.cu:26
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
~/workspace/fastai2/fastai2/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
175 self.epoch=epoch; self('begin_epoch')
--> 176 self._do_epoch_train()
177 self._do_epoch_validate()
~/workspace/fastai2/fastai2/learner.py in _do_epoch_train(self)
148 self.dl = self.dls.train; self('begin_train')
--> 149 self.all_batches()
150 except CancelTrainException: self('after_cancel_train')
~/workspace/fastai2/fastai2/learner.py in all_batches(self)
126 self.n_iter = len(self.dl)
--> 127 for o in enumerate(self.dl): self.one_batch(*o)
128
~/workspace/fastai2/fastai2/learner.py in one_batch(self, i, b)
140 except CancelBatchException: self('after_cancel_batch')
--> 141 finally: self('after_batch')
142
~/workspace/fastai2/fastai2/learner.py in __call__(self, event_name)
107
--> 108 def __call__(self, event_name): L(event_name).map(self._call_one)
109 def _call_one(self, event_name):
~/workspace/fastcore/fastcore/foundation.py in map(self, f, *args, **kwargs)
361 else f.__getitem__)
--> 362 return self._new(map(g, self))
363
~/workspace/fastcore/fastcore/foundation.py in _new(self, items, *args, **kwargs)
314 def _xtra(self): return None
--> 315 def _new(self, items, *args, **kwargs): return type(self)(items, *args, use_list=None, **kwargs)
316 def __getitem__(self, idx): return self._get(idx) if is_indexer(idx) else L(self._get(idx), use_list=None)
~/workspace/fastcore/fastcore/foundation.py in __call__(cls, x, *args, **kwargs)
40
---> 41 res = super().__call__(*((x,) + args), **kwargs)
42 res._newchk = 0
~/workspace/fastcore/fastcore/foundation.py in __init__(self, items, use_list, match, *rest)
305 if (use_list is not None) or not _is_array(items):
--> 306 items = list(items) if use_list else _listify(items)
307 if match is not None:
~/workspace/fastcore/fastcore/foundation.py in _listify(o)
241 if isinstance(o, str) or _is_array(o): return [o]
--> 242 if is_iter(o): return list(o)
243 return [o]
~/workspace/fastcore/fastcore/foundation.py in __call__(self, *args, **kwargs)
207 fargs = [args[x.i] if isinstance(x, _Arg) else x for x in self.pargs] + args[self.maxi+1:]
--> 208 return self.fn(*fargs, **kwargs)
209
~/workspace/fastai2/fastai2/learner.py in _call_one(self, event_name)
110 assert hasattr(event, event_name)
--> 111 [cb(event_name) for cb in sort_by_run(self.cbs)]
112
~/workspace/fastai2/fastai2/learner.py in <listcomp>(.0)
110 assert hasattr(event, event_name)
--> 111 [cb(event_name) for cb in sort_by_run(self.cbs)]
112
~/workspace/fastai2/fastai2/callback/core.py in __call__(self, event_name)
22 (self.run_valid and not getattr(self, 'training', False)))
---> 23 if self.run and _run: getattr(self, event_name, noop)()
24 if event_name=='after_fit': self.run=True #Reset self.run to True at each end of fit
~/workspace/fastai2/fastai2/learner.py in after_batch(self)
388 mets = self._train_mets if self.training else self._valid_mets
--> 389 for met in mets: met.accumulate(self.learn)
390 if not self.training: return
~/workspace/fastai2/fastai2/learner.py in accumulate(self, learn)
351 self.count += 1
--> 352 self.val = torch.lerp(to_detach(learn.loss.mean(), gather=False), self.val, self.beta)
353 @property
RuntimeError: CUDA error: device-side assert triggered
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
<ipython-input-14-d81c6bd29d71> in <module>
----> 1 learn.lr_find()
~/workspace/fastai2/fastai2/callback/schedule.py in lr_find(self, start_lr, end_lr, num_it, stop_div, show_plot, suggestions)
195 n_epoch = num_it//len(self.dls.train) + 1
196 cb=LRFinder(start_lr=start_lr, end_lr=end_lr, num_it=num_it, stop_div=stop_div)
--> 197 with self.no_logging(): self.fit(n_epoch, cbs=cb)
198 if show_plot: self.recorder.plot_lr_find()
199 if suggestions:
~/workspace/fastai2/fastai2/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
180
181 except CancelFitException: self('after_cancel_fit')
--> 182 finally: self('after_fit')
183
184 def validate(self, ds_idx=1, dl=None, cbs=None):
~/workspace/fastai2/fastai2/learner.py in __call__(self, event_name)
106 def ordered_cbs(self, cb_func): return [cb for cb in sort_by_run(self.cbs) if hasattr(cb, cb_func)]
107
--> 108 def __call__(self, event_name): L(event_name).map(self._call_one)
109 def _call_one(self, event_name):
110 assert hasattr(event, event_name)
~/workspace/fastcore/fastcore/foundation.py in map(self, f, *args, **kwargs)
360 else f.format if isinstance(f,str)
361 else f.__getitem__)
--> 362 return self._new(map(g, self))
363
364 def filter(self, f, negate=False, **kwargs):
~/workspace/fastcore/fastcore/foundation.py in _new(self, items, *args, **kwargs)
313 @property
314 def _xtra(self): return None
--> 315 def _new(self, items, *args, **kwargs): return type(self)(items, *args, use_list=None, **kwargs)
316 def __getitem__(self, idx): return self._get(idx) if is_indexer(idx) else L(self._get(idx), use_list=None)
317 def copy(self): return self._new(self.items.copy())
~/workspace/fastcore/fastcore/foundation.py in __call__(cls, x, *args, **kwargs)
39 return x
40
---> 41 res = super().__call__(*((x,) + args), **kwargs)
42 res._newchk = 0
43 return res
~/workspace/fastcore/fastcore/foundation.py in __init__(self, items, use_list, match, *rest)
304 if items is None: items = []
305 if (use_list is not None) or not _is_array(items):
--> 306 items = list(items) if use_list else _listify(items)
307 if match is not None:
308 if is_coll(match): match = len(match)
~/workspace/fastcore/fastcore/foundation.py in _listify(o)
240 if isinstance(o, list): return o
241 if isinstance(o, str) or _is_array(o): return [o]
--> 242 if is_iter(o): return list(o)
243 return [o]
244
~/workspace/fastcore/fastcore/foundation.py in __call__(self, *args, **kwargs)
206 if isinstance(v,_Arg): kwargs[k] = args.pop(v.i)
207 fargs = [args[x.i] if isinstance(x, _Arg) else x for x in self.pargs] + args[self.maxi+1:]
--> 208 return self.fn(*fargs, **kwargs)
209
210 # Cell
~/workspace/fastai2/fastai2/learner.py in _call_one(self, event_name)
109 def _call_one(self, event_name):
110 assert hasattr(event, event_name)
--> 111 [cb(event_name) for cb in sort_by_run(self.cbs)]
112
113 def _bn_bias_state(self, with_bias): return bn_bias_params(self.model, with_bias).map(self.opt.state)
~/workspace/fastai2/fastai2/learner.py in <listcomp>(.0)
109 def _call_one(self, event_name):
110 assert hasattr(event, event_name)
--> 111 [cb(event_name) for cb in sort_by_run(self.cbs)]
112
113 def _bn_bias_state(self, with_bias): return bn_bias_params(self.model, with_bias).map(self.opt.state)
~/workspace/fastai2/fastai2/callback/core.py in __call__(self, event_name)
21 _run = (event_name not in _inner_loop or (self.run_train and getattr(self, 'training', True)) or
22 (self.run_valid and not getattr(self, 'training', False)))
---> 23 if self.run and _run: getattr(self, event_name, noop)()
24 if event_name=='after_fit': self.run=True #Reset self.run to True at each end of fit
25
~/workspace/fastai2/fastai2/callback/schedule.py in after_fit(self)
168 tmp_f = self.path/self.model_dir/'_tmp.pth'
169 if tmp_f.exists():
--> 170 self.learn.load('_tmp')
171 os.remove(tmp_f)
172
~/workspace/fastai2/fastai2/learner.py in load(self, file, with_opt, device, strict)
254 distrib_barrier()
255 file = join_path_file(file, self.path/self.model_dir, ext='.pth')
--> 256 load_model(file, self.model, self.opt, with_opt=with_opt, device=device, strict=strict)
257 return self
258
~/workspace/fastai2/fastai2/learner.py in load_model(file, model, opt, with_opt, device, strict)
39 if isinstance(device, int): device = torch.device('cuda', device)
40 elif device is None: device = 'cpu'
---> 41 state = torch.load(file, map_location=device)
42 hasopt = set(state)=={'model', 'opt'}
43 model_state = state['model'] if hasopt else state
/opt/conda/lib/python3.6/site-packages/torch/serialization.py in load(f, map_location, pickle_module, **pickle_load_args)
527 with _open_zipfile_reader(f) as opened_zipfile:
528 return _load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
--> 529 return _legacy_load(opened_file, map_location, pickle_module, **pickle_load_args)
530
531
/opt/conda/lib/python3.6/site-packages/torch/serialization.py in _legacy_load(f, map_location, pickle_module, **pickle_load_args)
707 for key in deserialized_storage_keys:
708 assert key in deserialized_objects
--> 709 deserialized_objects[key]._set_from_file(f, offset, f_should_read_directly)
710 if offset is not None:
711 offset = f.tell()
RuntimeError: cuda runtime error (710) : device-side assert triggered at /pytorch/torch/csrc/generic/serialization.cpp:148