Cuda error in Segmentation

sanketg · March 8, 2019, 8:51pm

Dataset description: I have a segmentation dataset with binary labels.
When I execute:
learn = unet_learner(data, models.resnet34, metrics=metrics, wd=wd)
it throws an error like this: CUDA error: device-side assert triggered
How can I fix this?
RuntimeError Traceback (most recent call last)
in
----> 1 lr_find(learn)
2 learn.recorder.plot()

/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/train.py in lr_find(learn, start_lr, end_lr, num_it, stop_div, wd)
30 cb = LRFinder(learn, start_lr, end_lr, num_it, stop_div)
31 a = int(np.ceil(num_it/len(learn.data.train_dl)))
—> 32 learn.fit(a, start_lr, callbacks=[cb], wd=wd)
33
34 def to_fp16(learn:Learner, loss_scale:float=None, max_noskip:int=1000, dynamic:bool=False, clip:float=None,

/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
176 callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
177 fit(epochs, self.model, self.loss_func, opt=self.opt, data=self.data, metrics=self.metrics,
–> 178 callbacks=self.callbacks+callbacks)
179
180 def create_opt(self, lr:Floats, wd:Floats=0.)->None:

/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/utils/mem.py in wrapper(*args, **kwargs)
85 type, val, tb = get_ref_free_exc_info() # must!
86 gc.collect()
—> 87 raise type(val).with_traceback(tb) from None
88 else: raise # re-raises the exact last exception
89 return wrapper

/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/utils/mem.py in wrapper(*args, **kwargs)
78
79 try:
—> 80 return func(*args, **kwargs)
81 except Exception as e:
82 if (“CUDA out of memory” in str(e) or

/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/basic_train.py in fit(failed resolving arguments)
99 exception = e
100 raise
–> 101 finally: cb_handler.on_train_end(exception)
102
103 loss_func_name2activ = {‘cross_entropy_loss’: F.softmax, ‘nll_loss’: torch.exp, ‘poisson_nll_loss’: torch.exp,

/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/callback.py in on_train_end(failed resolving arguments)
275 def on_train_end(self, exception:Union[bool,Exception])->None:
276 “Handle end of training, exception is an Exception or False if no exceptions during training.”
–> 277 self(‘train_end’, exception=exception)
278
279 class AverageMetric(Callback):

/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/callback.py in __call__(failed resolving arguments)
199 “Call through to all of the CallbakHandler functions.”
200 if call_mets: [getattr(met, f’on_{cb_name}’)(**self.state_dict, **kwargs) for met in self.metrics]
–> 201 return [getattr(cb, f’on_{cb_name}’)(**self.state_dict, **kwargs) for cb in self.callbacks]
202
203 def set_dl(self, dl:DataLoader):

/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/callback.py in <listcomp>(failed resolving arguments)
199 “Call through to all of the CallbakHandler functions.”
200 if call_mets: [getattr(met, f’on_{cb_name}’)(**self.state_dict, **kwargs) for met in self.metrics]
–> 201 return [getattr(cb, f’on_{cb_name}’)(**self.state_dict, **kwargs) for cb in self.callbacks]
202
203 def set_dl(self, dl:DataLoader):

/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/callbacks/lr_finder.py in on_train_end(failed resolving arguments)
43 # restore the valid_dl we turned off on __init__
44 self.data.valid_dl = self.valid_dl
—> 45 self.learn.load(‘tmp’)
46 if hasattr(self.learn.model, ‘reset’): self.learn.model.reset()
47 print(‘LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.’)

/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/fastai/basic_train.py in load(failed resolving arguments)
237 if purge: self.purge(clear_opt = ifnone(with_opt, False))
238 if device is None: device = self.data.device
–> 239 state = torch.load(self.path/self.model_dir/f’{name}.pth’, map_location=device)
240 if set(state.keys()) == {‘model’, ‘opt’}:
241 get_model(self.model).load_state_dict(state[‘model’], strict=strict)

/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/torch/serialization.py in load(failed resolving arguments)
365 f = open(f, ‘rb’)
366 try:
–> 367 return _load(f, map_location, pickle_module)
368 finally:
369 if new_fd:

/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/torch/serialization.py in _load(failed resolving arguments)
536 unpickler = pickle_module.Unpickler(f)
537 unpickler.persistent_load = persistent_load
–> 538 result = unpickler.load()
539
540 deserialized_storage_keys = pickle_module.load(f)

/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/torch/serialization.py in persistent_load(failed resolving arguments)
502 if root_key not in deserialized_objects:
503 deserialized_objects[root_key] = restore_location(
–> 504 data_type(size), location)
505 storage = deserialized_objects[root_key]
506 if view_metadata is not None:

/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/torch/serialization.py in restore_location(failed resolving arguments)
385 elif isinstance(map_location, torch.device):
386 def restore_location(storage, location):
–> 387 return default_restore_location(storage, str(map_location))
388 else:
389 def restore_location(storage, location):

/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/torch/serialization.py in default_restore_location(failed resolving arguments)
111 def default_restore_location(storage, location):
112 for _, _, fn in _package_registry:
–> 113 result = fn(storage, location)
114 if result is not None:
115 return result

/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/torch/serialization.py in _cuda_deserialize(failed resolving arguments)
93 if location.startswith(‘cuda’):
94 device = validate_cuda_device(location)
—> 95 return obj.cuda(device)
96
97

/new_data/gpu/sanketg/anaconda3/envs/pytorch1/lib/python3.6/site-packages/torch/_utils.py in cuda(failed resolving arguments)
74 else:
75 new_type = getattr(torch.cuda, self.__class__.__name__)
—> 76 return new_type(self.size()).copy_(self, non_blocking)
77
78

RuntimeError: CUDA error: device-side assert triggered

SapirGershov · March 11, 2019, 6:38am

I’ve also struggled with this error. Eventually, the cause turned out to be my labels.
How exactly did you create them? Do they by any chance have values of 255?
How did you set your classes? Did you use a text file?

dhoa · March 11, 2019, 7:39am

The error might be caused by the values in your mask images, as @SapirGershov mentioned. Can you reassign your mask values so that they fall within the range of the number of classes, and then check the result? For example, if you have 4 classes, use only the values 0, 1, 2, 3 in your masks.

sanketg · March 11, 2019, 9:54am

The labels are just black-and-white images; black and white are the two classes, and I used a text file of codes to represent these two classes.

sanketg · March 11, 2019, 10:36am

How can I reassign the mask values, given that I am supplying images as labels? I have a codes file that lists the two classes.

dhoa · March 11, 2019, 10:43am

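I’m guessing your black value is 0 and your white value is 255. With two classes the mask may only contain the class indices 0 and 1, so a pixel value of 255 is out of range and triggers the device-side assert. You could do something like img[img == 255] = 1 in numpy and save the result as the new mask.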

Do a test and tell us what you see.

Hope that helps,

sanketg · March 11, 2019, 12:29pm

It worked, thanks.

OlgaUW · October 20, 2019, 5:00am

Hi dhoa,

I have the same error. I tried renaming classes to 0, 1, 2, 3 but it did not help.

RuntimeError Traceback (most recent call last)
in
----> 1 learn = unet_learner(data, models.resnet34, metrics=metrics, wd=wd)

/opt/anaconda3/lib/python3.7/site-packages/fastai/vision/learner.py in unet_learner(data, arch, pretrained, blur_final, norm_type, split_on, blur, self_attention, y_range, last_cross, bottle, cut, **learn_kwargs)
118 model = to_device(models.unet.DynamicUnet(body, n_classes=data.c, img_size=size, blur=blur, blur_final=blur_final,
119 self_attention=self_attention, y_range=y_range, norm_type=norm_type, last_cross=last_cross,
–> 120 bottle=bottle), data.device)
121 learn = Learner(data, model, **learn_kwargs)
122 learn.split(ifnone(split_on, meta[‘split’]))

/opt/anaconda3/lib/python3.7/site-packages/fastai/torch_core.py in to_device(b, device)
121 “Recursively put b on device.”
122 device = ifnone(device, defaults.device)
–> 123 return recurse(lambda x: x.to(device, non_blocking=True), b)
124
125 def data_collate(batch:ItemsList)->Tensor:

/opt/anaconda3/lib/python3.7/site-packages/fastai/core.py in recurse(func, x, *args, **kwargs)
78 if is_listy(x): return [recurse(func, o, *args, **kwargs) for o in x]
79 if is_dict(x): return {k: recurse(func, v, *args, **kwargs) for k,v in x.items()}
—> 80 return func(x, *args, **kwargs)
81
82 def first_el(x: Any)->Any:

/opt/anaconda3/lib/python3.7/site-packages/fastai/torch_core.py in <lambda>(x)
121 “Recursively put b on device.”
122 device = ifnone(device, defaults.device)
–> 123 return recurse(lambda x: x.to(device, non_blocking=True), b)
124
125 def data_collate(batch:ItemsList)->Tensor:

/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in to(self, *args, **kwargs)
430 return t.to(device, dtype if t.is_floating_point() else None, non_blocking)
431
–> 432 return self._apply(convert)
433
434 def register_backward_hook(self, hook):

/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in _apply(self, fn)
206 def _apply(self, fn):
207 for module in self.children():
–> 208 module._apply(fn)
209
210 def compute_should_use_set_data(tensor, tensor_applied):

/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in _apply(self, fn)
228 # with torch.no_grad():
229 with torch.no_grad():
–> 230 param_applied = fn(param)
231 should_use_set_data = compute_should_use_set_data(param, param_applied)
232 if should_use_set_data:

/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in convert(t)
428
429 def convert(t):
–> 430 return t.to(device, dtype if t.is_floating_point() else None, non_blocking)
431
432 return self._apply(convert)

RuntimeError: CUDA error: device-side assert triggered