Hi there,
I ran into a CUDA problem in the Kaggle Humpback Whale Identification Challenge. I wrote the code following the steps of lessons 1 & 2. At the end of one epoch, it throws a CUDA error. I have tried setting metrics=None and tuning bs & sz, but neither worked.
For anyone bold enough to have read this far, any ideas on what I may have pooched?
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-15-21752d933788> in <module>()
----> 1 learn.fit(lr, 1)
~/kaggle/fastai/learner.py in fit(self, lrs, n_cycle, wds, **kwargs)
213 self.sched = None
214 layer_opt = self.get_layer_opt(lrs, wds)
--> 215 return self.fit_gen(self.model, self.data, layer_opt, n_cycle, **kwargs)
216
217 def warm_up(self, lr, wds=None):
~/kaggle/fastai/learner.py in fit_gen(self, model, data, layer_opt, n_cycle, cycle_len, cycle_mult, cycle_save_name, best_save_name, use_clr, metrics, callbacks, use_wd_sched, norm_wds, wds_sched_mult, **kwargs)
160 n_epoch = sum_geom(cycle_len if cycle_len else 1, cycle_mult, n_cycle)
161 return fit(model, data, n_epoch, layer_opt.opt, self.crit,
--> 162 metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, **kwargs)
163
164 def get_layer_groups(self): return self.models.get_layer_groups()
~/kaggle/fastai/model.py in fit(model, data, epochs, opt, crit, metrics, callbacks, stepper, **kwargs)
104 i += 1
105
--> 106 vals = validate(stepper, data.val_dl, metrics)
107 if epoch == 0: print(layout.format(*names))
108 print_stats(epoch, [debias_loss] + vals)
~/kaggle/fastai/model.py in validate(stepper, dl, metrics)
125 for (*x,y) in iter(dl):
126 preds,l = stepper.evaluate(VV(x), VV(y))
--> 127 loss.append(to_np(l))
128 res.append([f(preds.data,y) for f in metrics])
129 return [np.mean(loss)] + list(np.mean(np.stack(res),0))
~/kaggle/fastai/core.py in to_np(v)
38 if isinstance(v, (list,tuple)): return [to_np(o) for o in v]
39 if isinstance(v, Variable): v=v.data
---> 40 return v.cpu().numpy()
41
42 USE_GPU=True
~/src/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/tensor.py in cpu(self)
43 def cpu(self):
44 r"""Returns a CPU copy of this tensor if it's not already on the CPU"""
---> 45 return self.type(getattr(torch, self.__class__.__name__))
46
47 def double(self):
~/src/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/cuda/__init__.py in type(self, *args, **kwargs)
394 def type(self, *args, **kwargs):
395 with device(self.get_device()):
--> 396 return super(_CudaBase, self).type(*args, **kwargs)
397
398 __new__ = _lazy_new
~/src/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/_utils.py in _type(self, new_type, async)
36 if new_type.is_sparse:
37 raise RuntimeError("Cannot cast dense tensor to sparse tensor")
---> 38 return new_type(self.size()).copy_(self, async)
39
40
RuntimeError: cuda runtime error (59) : device-side assert triggered at /opt/conda/conda-bld/pytorch_1518244421288/work/torch/lib/THC/generic/THCTensorCopy.c:70
My code is below:
# Model architecture passed to tfms_from_model (in get_data) and to ConvLearner.pretrained.
arch = resnet34
# Batch size forwarded to ImageClassifierData.from_csv by get_data.
bs = 64
# Data root: first argument to from_csv and prefix of the train.csv path built in get_data.
PATH = 'data/whale'
def mvp5(preds, targs):
    """Return the mean of the five smallest exponentiated scores per row.

    `preds` is exponentiated element-wise, each row is sorted in ascending
    order (np.sort works along the last axis by default), the first five
    columns of the sorted result are kept, and the mean over all kept values
    is returned as a single scalar.

    `targs` is accepted to match the metric call signature but is not used.

    NOTE(review): presumably `preds` holds log-probabilities (hence the exp),
    and the `[:, :5]` slice requires a 2-D input -- confirm against the caller.
    """
    exponentiated = np.exp(preds)
    ascending_rows = np.sort(exponentiated)
    # Ascending order means the leading five columns are the smallest values.
    lowest_five = ascending_rows[:, :5]
    return np.mean(lowest_five)
# Metric functions handed to ConvLearner.pretrained via metrics=metrics.
metrics = [mvp5]
def get_data(sz):
    """Build the ImageClassifierData object for images of size `sz`.

    Transforms come from tfms_from_model using the module-level `arch`, with
    side-on augmentation and max_zoom=1.05. The dataset is created with
    ImageClassifierData.from_csv from the module-level `PATH`, `bs` and
    `val_idxs`.

    NOTE(review): `val_idxs` is not defined anywhere in the posted snippet;
    it must exist in the notebook before this is called.
    """
    transforms = tfms_from_model(
        arch, sz, aug_tfms=transforms_side_on, max_zoom=1.05
    )
    labels_csv = f'{PATH}/train.csv'
    return ImageClassifierData.from_csv(
        PATH,
        'train',
        labels_csv,
        bs=bs,
        val_idxs=val_idxs,
        tfms=transforms,
        test_name='test',
    )
# Image size passed to get_data (and from there to tfms_from_model).
sz = 224
data = get_data(sz)
# Sanity check: the validation set should have as many file names as labels.
len(data.val_ds.fnames), len(data.val_ds.y)
# The next line is presumably the pasted notebook output of the expression above.
(1970, 1970)
# Create the learner from `arch` and `data`, reporting the custom metrics list.
learn = ConvLearner.pretrained(arch, data, metrics=metrics)
# Run the learning-rate finder and plot its schedule.
learn.lr_find()
learn.sched.plot()
# Learning rate used for the fit call below.
lr = 0.4
# This is the call that raises the RuntimeError shown in the traceback above.
learn.fit(lr, 1) # ERROR