Hi all,
While training a clinical-notes language model on a large (7 GB) dataset, I got the error 'CUDA error: device-side assert triggered' after at least 12 hours of training, at least 38% of the way through the first epoch. The traceback is below.
This is my code:
path = datapath4file('/media/DataHD2/Notes_PHI_20190121/notes_dana_hp')
data_lm = load_data(path, fname='data_lm_hp_80pct_export.pkl', bs=32)
learn = language_model_learner(data_lm, AWD_LSTM)
learn.unfreeze()
learn.lr_find()
learn.recorder.plot(skip_end=15)
(The data file 'data_lm_hp_80pct_export.pkl' is 20,209,788,745 bytes, roughly 20 GB, on disk.)
moms = (0.8,0.7)
print(datetime.datetime.now(), 'Starting training 10 epochs', flush=True)
learn.fit_one_cycle(10, slice(1e-2), moms=moms)
print(datetime.datetime.now(), 'Finished training 10 epoch', flush=True)
learn.save('fit_10_hp_80pct')
learn.save_encoder('fit_10_hp_80pct_enc')
print(datetime.datetime.now(), 'Finished saving models')
# Start 2019-03-31; 12:03AM; estimated time 32 hrs / epoch = 320 hours; ETA 2019-04-13; 8AM
# GPU memory 4178 / 8114; util 81%; progress gauge 2209/572361
# 2019-03-31; 9:02AM; 26.83%; 153623 / 572361; Est done Epoch 1 in 24 hr 30 min; GPU same
# 2019-03-31; 12:45PM; 37.93%; 217108 / "; est done Epoch 1 in 20 hr 48 min; same GPU mem.
# 2019-04-01; 9:41AM; CRASHED (sometime before now; same error message)
Here is the traceback:
RuntimeError                              Traceback (most recent call last)
<ipython-input> in <module>
      2 moms = (0.8,0.7)
      3 print(datetime.datetime.now(), 'Starting training 10 epochs',flush=True)
----> 4 learn.fit_one_cycle(10, slice(1e-2), moms=moms)
      5 #learn.fit_one_cycle(1, 1e-2)
      6 print(datetime.datetime.now(), 'Finished training 10 epoch',flush=True)

~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/fastai/train.py in fit_one_cycle(learn, cyc_len, max_lr, moms, div_factor, pct_start, final_div, wd, callbacks, tot_epochs, start_epoch)
     20     callbacks.append(OneCycleScheduler(learn, max_lr, moms=moms, div_factor=div_factor, pct_start=pct_start,
     21                                        final_div=final_div, tot_epochs=tot_epochs, start_epoch=start_epoch))
---> 22     learn.fit(cyc_len, max_lr, wd=wd, callbacks=callbacks)
     23
     24 def lr_find(learn:Learner, start_lr:Floats=1e-7, end_lr:Floats=10, num_it:int=100, stop_div:bool=True, wd:float=None):

~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
    194         callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
    195         if defaults.extra_callbacks is not None: callbacks += defaults.extra_callbacks
--> 196         fit(epochs, self, metrics=self.metrics, callbacks=self.callbacks+callbacks)
    197
    198     def create_opt(self, lr:Floats, wd:Floats=0.)->None:

~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/fastai/basic_train.py in fit(epochs, learn, callbacks, metrics)
     98         for xb,yb in progress_bar(learn.data.train_dl, parent=pbar):
     99             xb, yb = cb_handler.on_batch_begin(xb, yb)
--> 100             loss = loss_batch(learn.model, xb, yb, learn.loss_func, learn.opt, cb_handler)
    101             if cb_handler.on_batch_end(loss): break
    102

~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/fastai/basic_train.py in loss_batch(model, xb, yb, loss_func, opt, cb_handler)
     23     if not is_listy(xb): xb = [xb]
     24     if not is_listy(yb): yb = [yb]
---> 25     out = model(*xb)
     26     out = cb_handler.on_loss_begin(out)
     27

~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/torch/nn/modules/container.py in forward(self, input)
     90     def forward(self, input):
     91         for module in self._modules.values():
---> 92             input = module(input)
     93         return input
     94

~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/fastai/text/models/awd_lstm.py in forward(self, input, from_embeddings)
    109             self.bs=bs
    110             self.reset()
--> 111         raw_output = self.input_dp(input if from_embeddings else self.encoder_dp(input))
    112         new_hidden,raw_outputs,outputs = [],[],[]
    113         for l, (rnn,hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):

~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/fastai/text/models/awd_lstm.py in forward(self, x)
     21     def forward(self, x:Tensor)->Tensor:
     22         if not self.training or self.p == 0.: return x
---> 23         m = dropout_mask(x.data, (x.size(0), 1, x.size(2)), self.p)
     24         return x * m
     25

~/anaconda3/envs/fastaiv1/lib/python3.6/site-packages/fastai/text/models/awd_lstm.py in dropout_mask(x, sz, p)
     10 def dropout_mask(x:Tensor, sz:Collection[int], p:float):
     11     "Return a dropout mask of the same type as `x`, size `sz`, with probability `p` to cancel an element."
---> 12     return x.new(*sz).bernoulli_(1-p).div_(1-p)
     13
     14 class RNNDropout(nn.Module):

RuntimeError: CUDA error: device-side assert triggered
During the first few hours of training, memory usage looked low and stable: GPU memory held at 4178 / 8114 MiB on a single NVIDIA 1080, and system RAM at 34.3 GB / 62.8 GB. Neither changed over the course of training, so I can think of two possible causes for the crash:
- Something in my data was unexpected. This was the first epoch, so I can't yet be sure the data is clean (a sketch of a check I'm considering is below).
- Training hit a bug at the end of epoch 1, though I don't know whether it even got that far.
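One check I'm planning, on the theory that a device-side assert during an embedding lookup often means a token id outside the embedding table: scan the training batches for out-of-range ids. This is only a sketch against the data_lm and learn objects above (run in a fresh session, since the GPU context is unusable after the assert), and I haven't confirmed it would catch my case:

# Rough sketch: look for token ids the embedding cannot index (fastai v1 names).
vocab_size = len(data_lm.vocab.itos)               # size of the LM vocabulary
emb_rows = learn.model[0].encoder.num_embeddings   # rows in the AWD_LSTM embedding table
print(f'vocab size: {vocab_size}, embedding rows: {emb_rows}')

for i, (xb, yb) in enumerate(data_lm.train_dl):
    bad = (xb < 0) | (xb >= emb_rows)
    if bad.any():
        print(f'batch {i} has out-of-range token ids:', xb[bad][:10].tolist())
        break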
So I figure I need to identify which batch (or sample) it crashed on. I used %debug, and the debugger itself came up fine, but whenever I tried to inspect the data at any level of the traceback stack I got another CUDA error:
*** RuntimeError: cuda runtime error (59) : device-side assert triggered at /opt/conda/conda-bld/pytorch_1544174967633/work/aten/src/THC/THCCachingHostAllocator.cpp:265
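My understanding is that CUDA errors are reported asynchronously, so both the original traceback and %debug may be pointing well past the kernel that actually failed, and the context is unusable once the assert fires. If it helps, my plan is to rerun with synchronous kernel launches so the error surfaces at the failing op. This is just how I intend to set it (the variable has to be set before torch touches the GPU):

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'   # make CUDA errors synchronous, at some speed cost

from fastai.text import *                  # then the rest of the training code as above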
Any thoughts on how I can approach debugging this mess?
I know the obvious first step is to try to reproduce this on smaller datasets, and I did, but the smaller sets did not reproduce the error.
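Since a small subset trains cleanly, the other thing I'm considering is keeping a CPU copy of the most recent batch so that, after a crash, I can inspect exactly what was being fed to the model. Below is only a sketch of the idea using a fastai v1 LearnerCallback; the kwarg names (last_input, last_target, iteration) are what I believe the v1 callback system passes, but I haven't verified them against my installed version:

from fastai.basic_train import LearnerCallback

class KeepLastBatch(LearnerCallback):
    "Hold a CPU copy of the batch currently being processed."
    def on_batch_begin(self, last_input, last_target, iteration, **kwargs):
        self.iteration = iteration
        self.last_input = last_input.detach().cpu()    # copy off the GPU so it survives a crash
        self.last_target = last_target.detach().cpu()

keeper = KeepLastBatch(learn)
learn.fit_one_cycle(10, slice(1e-2), moms=moms, callbacks=[keeper])
# After a crash, inspect keeper.iteration, keeper.last_input, keeper.last_target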