Error while running whale competition on dogbreeds notebook

Blanche · May 12, 2018, 12:29pm

I’m on lesson 3, so I’m trying to run various Kaggle competitions.
After some small adjustments, I tried to run the whale competition on the dog breed notebook, but I encountered an error when running learn.fit(1e-2, 5).

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-40-56b353e031cf> in <module>()
----> 1 learn.fit(1e-2,5) # 1e-2 == 10^(-2) == 0.01

/content/clouderizer/fast.ai/fastai/courses/dl1/fastai/learner.py in fit(self, lrs, n_cycle, wds, **kwargs)
    285         self.sched = None
    286         layer_opt = self.get_layer_opt(lrs, wds)
--> 287         return self.fit_gen(self.model, self.data, layer_opt, n_cycle, **kwargs)
    288 
    289     def warm_up(self, lr, wds=None):

/content/clouderizer/fast.ai/fastai/courses/dl1/fastai/learner.py in fit_gen(self, model, data, layer_opt, n_cycle, cycle_len, cycle_mult, cycle_save_name, best_save_name, use_clr, use_clr_beta, metrics, callbacks, use_wd_sched, norm_wds, wds_sched_mult, use_swa, swa_start, swa_eval_freq, **kwargs)
    232             metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, fp16=self.fp16,
    233             swa_model=self.swa_model if use_swa else None, swa_start=swa_start,
--> 234             swa_eval_freq=swa_eval_freq, **kwargs)
    235 
    236     def get_layer_groups(self): return self.models.get_layer_groups()

/content/clouderizer/fast.ai/fastai/courses/dl1/fastai/model.py in fit(model, data, n_epochs, opt, crit, metrics, callbacks, stepper, swa_model, swa_start, swa_eval_freq, **kwargs)
    124         if all_val: val_iter = IterBatch(cur_data.val_dl)
    125 
--> 126         for (*x,y) in t:
    127             batch_num += 1
    128             for cb in callbacks: cb.on_batch_begin()

/usr/local/lib/python3.6/dist-packages/tqdm/_tqdm.py in __iter__(self)
    925 """, fp_write=getattr(self.fp, 'write', sys.stderr.write))
    926 
--> 927             for obj in iterable:
    928                 yield obj
    929                 # Update and possibly print the progressbar.

/content/clouderizer/fast.ai/fastai/courses/dl1/fastai/dataloader.py in __iter__(self)
     87                 for c in chunk_iter(iter(self.batch_sampler), self.num_workers*10):
     88                     for batch in e.map(self.get_batch, c):
---> 89                         yield get_tensor(batch, self.pin_memory, self.half)
     90 

/content/clouderizer/fast.ai/fastai/courses/dl1/fastai/dataloader.py in get_tensor(batch, pin, half)
     18         return {k: get_tensor(sample, pin, half) for k, sample in batch.items()}
     19     elif isinstance(batch, collections.Sequence):
---> 20         return [get_tensor(sample, pin, half) for sample in batch]
     21     raise TypeError(f"batch must contain numbers, dicts or lists; found {type(batch)}")
     22 

/content/clouderizer/fast.ai/fastai/courses/dl1/fastai/dataloader.py in <listcomp>(.0)
     18         return {k: get_tensor(sample, pin, half) for k, sample in batch.items()}
     19     elif isinstance(batch, collections.Sequence):
---> 20         return [get_tensor(sample, pin, half) for sample in batch]
     21     raise TypeError(f"batch must contain numbers, dicts or lists; found {type(batch)}")
     22 

/content/clouderizer/fast.ai/fastai/courses/dl1/fastai/dataloader.py in get_tensor(batch, pin, half)
     12         batch = T(batch, half=half, cuda=False).contiguous()
     13         if pin: batch = batch.pin_memory()
---> 14         return to_gpu(batch)
     15     elif isinstance(batch, string_classes):
     16         return batch

/content/clouderizer/fast.ai/fastai/courses/dl1/fastai/core.py in to_gpu(x, *args, **kwargs)
     47 USE_GPU = torch.cuda.is_available()
     48 def to_gpu(x, *args, **kwargs):
---> 49     return x.cuda(*args, **kwargs) if USE_GPU else x
     50 
     51 def noop(*args, **kwargs): return

/usr/local/lib/python3.6/dist-packages/torch/_utils.py in _cuda(self, device, async)
     67         else:
     68             new_type = getattr(torch.cuda, self.__class__.__name__)
---> 69             return new_type(self.size()).copy_(self, async)
     70 
     71 

RuntimeError: cuda runtime error (59) : device-side assert triggered at /pytorch/torch/lib/THC/generic/THCTensorCopy.c:20

I’ve run this notebook successfully with the dogbreeds dataset (using Google Colab).
What can I do to fix this error?

Hadus · May 13, 2018, 2:31pm

What is learn.crit?

Hadus · May 13, 2018, 2:33pm

The cause of the error is not included in the error you posted. Posting some of the code might help others find the reason for the error. Could you tell us the small adjustments you made?