Dataset description: I have a segmentation dataset with binary labels.
When I execute:
learn = unet_learner(data, models.resnet34, metrics=metrics, wd=wd)
it throws an error like this: CUDA error: device-side assert triggered
How can I fix this?
RuntimeError Traceback (most recent call last)
in
----> 1 lr_find(learn)
2 learn.recorder.plot()
/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/train.py in lr_find(learn, start_lr, end_lr, num_it, stop_div, wd)
30 cb = LRFinder(learn, start_lr, end_lr, num_it, stop_div)
31 a = int(np.ceil(num_it/len(learn.data.train_dl)))
—> 32 learn.fit(a, start_lr, callbacks=[cb], wd=wd)
33
34 def to_fp16(learn:Learner, loss_scale:float=None, max_noskip:int=1000, dynamic:bool=False, clip:float=None,
/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
176 callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
177 fit(epochs, self.model, self.loss_func, opt=self.opt, data=self.data, metrics=self.metrics,
–> 178 callbacks=self.callbacks+callbacks)
179
180 def create_opt(self, lr:Floats, wd:Floats=0.)->None:
/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/utils/mem.py in wrapper(*args, **kwargs)
85 type, val, tb = get_ref_free_exc_info() # must!
86 gc.collect()
—> 87 raise type(val).with_traceback(tb) from None
88 else: raise # re-raises the exact last exception
89 return wrapper
/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/utils/mem.py in wrapper(*args, **kwargs)
78
79 try:
—> 80 return func(*args, **kwargs)
81 except Exception as e:
82 if (“CUDA out of memory” in str(e) or
/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/basic_train.py in fit(failed resolving arguments)
99 exception = e
100 raise
–> 101 finally: cb_handler.on_train_end(exception)
102
103 loss_func_name2activ = {‘cross_entropy_loss’: F.softmax, ‘nll_loss’: torch.exp, ‘poisson_nll_loss’: torch.exp,
/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/callback.py in on_train_end(failed resolving arguments)
275 def on_train_end(self, exception:Union[bool,Exception])->None:
276 "Handle end of training, `exception` is an `Exception` or False if no exceptions during training."
–> 277 self(‘train_end’, exception=exception)
278
279 class AverageMetric(Callback):
/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/callback.py in __call__(failed resolving arguments)
199 "Call through to all of the `CallbakHandler` functions."
200 if call_mets: [getattr(met, f’on_{cb_name}’)(**self.state_dict, **kwargs) for met in self.metrics]
–> 201 return [getattr(cb, f’on_{cb_name}’)(**self.state_dict, **kwargs) for cb in self.callbacks]
202
203 def set_dl(self, dl:DataLoader):
/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/callback.py in <listcomp>(failed resolving arguments)
199 "Call through to all of the `CallbakHandler` functions."
200 if call_mets: [getattr(met, f’on_{cb_name}’)(**self.state_dict, **kwargs) for met in self.metrics]
–> 201 return [getattr(cb, f’on_{cb_name}’)(**self.state_dict, **kwargs) for cb in self.callbacks]
202
203 def set_dl(self, dl:DataLoader):
/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/callbacks/lr_finder.py in on_train_end(failed resolving arguments)
43 # restore the valid_dl we turned off on __init__
44 self.data.valid_dl = self.valid_dl
—> 45 self.learn.load(‘tmp’)
46 if hasattr(self.learn.model, ‘reset’): self.learn.model.reset()
47 print(‘LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.’)
/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/basic_train.py in load(failed resolving arguments)
237 if purge: self.purge(clear_opt = ifnone(with_opt, False))
238 if device is None: device = self.data.device
–> 239 state = torch.load(self.path/self.model_dir/f’{name}.pth’, map_location=device)
240 if set(state.keys()) == {‘model’, ‘opt’}:
241 get_model(self.model).load_state_dict(state[‘model’], strict=strict)
/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/torch/serialization.py in load(failed resolving arguments)
365 f = open(f, ‘rb’)
366 try:
–> 367 return _load(f, map_location, pickle_module)
368 finally:
369 if new_fd:
/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/torch/serialization.py in _load(failed resolving arguments)
536 unpickler = pickle_module.Unpickler(f)
537 unpickler.persistent_load = persistent_load
–> 538 result = unpickler.load()
539
540 deserialized_storage_keys = pickle_module.load(f)
/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/torch/serialization.py in persistent_load(failed resolving arguments)
502 if root_key not in deserialized_objects:
503 deserialized_objects[root_key] = restore_location(
–> 504 data_type(size), location)
505 storage = deserialized_objects[root_key]
506 if view_metadata is not None:
/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/torch/serialization.py in restore_location(failed resolving arguments)
385 elif isinstance(map_location, torch.device):
386 def restore_location(storage, location):
–> 387 return default_restore_location(storage, str(map_location))
388 else:
389 def restore_location(storage, location):
/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/torch/serialization.py in default_restore_location(failed resolving arguments)
111 def default_restore_location(storage, location):
112 for _, _, fn in _package_registry:
–> 113 result = fn(storage, location)
114 if result is not None:
115 return result
/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/torch/serialization.py in _cuda_deserialize(failed resolving arguments)
93 if location.startswith(‘cuda’):
94 device = validate_cuda_device(location)
—> 95 return obj.cuda(device)
96
97
/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/torch/_utils.py in cuda(failed resolving arguments)
74 else:
75 new_type = getattr(torch.cuda, self.__class__.__name__)
—> 76 return new_type(self.size()).copy_(self, non_blocking)
77
78
RuntimeError: CUDA error: device-side assert triggered