Hi, I have been getting a series of errors, where I think there were some previous inconsistencies between my install and what S. Gugger has been able to replicate. However, I am using a fairly standard setup. I even re-did my AMIs and startup scripts to now have the latest version of the Amazon Deep Learning AMI (27.0) and fastai2 0.0.11 by default. Yet, I still keep running into CUDA / fastai2 errors. I will try to just jump to the "latest" error, where I got stuck.
- I use camvid as pre-training
- I load a learner:
learn = load_learner('/efs/models/resnet50-camvid', cpu=False)
- give it a dl:
learn.dls = dls
- set up a dl:
dls = SegmentationDataLoaders.from_label_func(path, bs=2,
fnames = imgs,
item_tfms=RandomResizedCrop(512),
label_func = label_fcn,
codes = codes,
batch_tfms=[*aug_transforms(size=(256,360)),
Normalize.from_stats(*imagenet_stats)])
- and then I try to call
learn.fit_one_cycle(1)
or learn.fit(1)
but I get
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in one_batch(self, i, b)
136 if not self.training: return
--> 137 self.loss.backward(); self('after_backward')
138 self.opt.step(); self('after_step')
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
194 """
--> 195 torch.autograd.backward(self, gradient, retain_graph, create_graph)
196
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
98 tensors, grad_tensors, retain_graph, create_graph,
---> 99 allow_unreachable=True) # allow_unreachable flag
100
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
<ipython-input-12-251d4239ccd9> in <module>()
1 learn.dls = dls
----> 2 learn.fit_one_cycle(1)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
110 scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
111 'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 112 self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
113
114 # Cell
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
174 try:
175 self.epoch=epoch; self('begin_epoch')
--> 176 self._do_epoch_train()
177 self._do_epoch_validate()
178 except CancelEpochException: self('after_cancel_epoch')
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in _do_epoch_train(self)
147 try:
148 self.dl = self.dls.train; self('begin_train')
--> 149 self.all_batches()
150 except CancelTrainException: self('after_cancel_train')
151 finally: self('after_train')
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in all_batches(self)
125 def all_batches(self):
126 self.n_iter = len(self.dl)
--> 127 for o in enumerate(self.dl): self.one_batch(*o)
128
129 def one_batch(self, i, b):
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in one_batch(self, i, b)
139 self.opt.zero_grad()
140 except CancelBatchException: self('after_cancel_batch')
--> 141 finally: self('after_batch')
142
143 def _do_begin_fit(self, n_epoch):
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in __call__(self, event_name)
106 def ordered_cbs(self, cb_func): return [cb for cb in sort_by_run(self.cbs) if hasattr(cb, cb_func)]
107
--> 108 def __call__(self, event_name): L(event_name).map(self._call_one)
109 def _call_one(self, event_name):
110 assert hasattr(event, event_name)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/foundation.py in map(self, f, *args, **kwargs)
360 else f.format if isinstance(f,str)
361 else f.__getitem__)
--> 362 return self._new(map(g, self))
363
364 def filter(self, f, negate=False, **kwargs):
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/foundation.py in _new(self, items, *args, **kwargs)
313 @property
314 def _xtra(self): return None
--> 315 def _new(self, items, *args, **kwargs): return type(self)(items, *args, use_list=None, **kwargs)
316 def __getitem__(self, idx): return self._get(idx) if is_indexer(idx) else L(self._get(idx), use_list=None)
317 def copy(self): return self._new(self.items.copy())
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/foundation.py in __call__(cls, x, *args, **kwargs)
39 return x
40
---> 41 res = super().__call__(*((x,) + args), **kwargs)
42 res._newchk = 0
43 return res
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/foundation.py in __init__(self, items, use_list, match, *rest)
304 if items is None: items = []
305 if (use_list is not None) or not _is_array(items):
--> 306 items = list(items) if use_list else _listify(items)
307 if match is not None:
308 if is_coll(match): match = len(match)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/foundation.py in _listify(o)
240 if isinstance(o, list): return o
241 if isinstance(o, str) or _is_array(o): return [o]
--> 242 if is_iter(o): return list(o)
243 return [o]
244
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/foundation.py in __call__(self, *args, **kwargs)
206 if isinstance(v,_Arg): kwargs[k] = args.pop(v.i)
207 fargs = [args[x.i] if isinstance(x, _Arg) else x for x in self.pargs] + args[self.maxi+1:]
--> 208 return self.fn(*fargs, **kwargs)
209
210 # Cell
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in _call_one(self, event_name)
109 def _call_one(self, event_name):
110 assert hasattr(event, event_name)
--> 111 [cb(event_name) for cb in sort_by_run(self.cbs)]
112
113 def _bn_bias_state(self, with_bias): return bn_bias_params(self.model, with_bias).map(self.opt.state)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in <listcomp>(.0)
109 def _call_one(self, event_name):
110 assert hasattr(event, event_name)
--> 111 [cb(event_name) for cb in sort_by_run(self.cbs)]
112
113 def _bn_bias_state(self, with_bias): return bn_bias_params(self.model, with_bias).map(self.opt.state)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/callback/core.py in __call__(self, event_name)
21 _run = (event_name not in _inner_loop or (self.run_train and getattr(self, 'training', True)) or
22 (self.run_valid and not getattr(self, 'training', False)))
---> 23 if self.run and _run: getattr(self, event_name, noop)()
24 if event_name=='after_fit': self.run=True #Reset self.run to True at each end of fit
25
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in after_batch(self)
387 if len(self.yb) == 0: return
388 mets = self._train_mets if self.training else self._valid_mets
--> 389 for met in mets: met.accumulate(self.learn)
390 if not self.training: return
391 self.lrs.append(self.opt.hypers[-1]['lr'])
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai2/learner.py in accumulate(self, learn)
350 def accumulate(self, learn):
351 self.count += 1
--> 352 self.val = torch.lerp(to_detach(learn.loss.mean(), gather=False), self.val, self.beta)
353 @property
354 def value(self): return self.val/(1-self.beta**self.count)
RuntimeError: CUDA error: device-side assert triggered
I’ve been getting permutations of this for days. Any suggestions?