Out of CUDA memory when trying to train the NLP notebook from the quickstart

Hi all,

I am trying to learn fastai. I’ve watched lesson 1 and gone through most of the quickstart guide. The NLP quickstart, however, never finishes training; it always fails with a CUDA out-of-memory error (stack trace below).

RuntimeError                              Traceback (most recent call last)
Cell In [3], line 3
      1 dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test')
      2 learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
----> 3 learn.fine_tune(2, 1e-2)

File ~/ML/venv/lib/python3.10/site-packages/fastai/callback/schedule.py:168, in fine_tune(self, epochs, base_lr, freeze_epochs, lr_mult, pct_start, div, **kwargs)
    166 base_lr /= 2
    167 self.unfreeze()
--> 168 self.fit_one_cycle(epochs, slice(base_lr/lr_mult, base_lr), pct_start=pct_start, div=div, **kwargs)

File ~/ML/venv/lib/python3.10/site-packages/fastai/callback/schedule.py:119, in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt, start_epoch)
    116 lr_max = np.array([h['lr'] for h in self.opt.hypers])
    117 scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
    118           'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 119 self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)

File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:256, in Learner.fit(self, n_epoch, lr, wd, cbs, reset_opt, start_epoch)
    254 self.opt.set_hypers(lr=self.lr if lr is None else lr)
    255 self.n_epoch = n_epoch
--> 256 self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)

File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
    192 def _with_events(self, f, event_type, ex, final=noop):
--> 193     try: self(f'before_{event_type}');  f()
    194     except ex: self(f'after_cancel_{event_type}')
    195     self(f'after_{event_type}');  final()

File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:245, in Learner._do_fit(self)
    243 for epoch in range(self.n_epoch):
    244     self.epoch=epoch
--> 245     self._with_events(self._do_epoch, 'epoch', CancelEpochException)

File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
    192 def _with_events(self, f, event_type, ex, final=noop):
--> 193     try: self(f'before_{event_type}');  f()
    194     except ex: self(f'after_cancel_{event_type}')
    195     self(f'after_{event_type}');  final()

File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:239, in Learner._do_epoch(self)
    238 def _do_epoch(self):
--> 239     self._do_epoch_train()
    240     self._do_epoch_validate()

File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:231, in Learner._do_epoch_train(self)
    229 def _do_epoch_train(self):
    230     self.dl = self.dls.train
--> 231     self._with_events(self.all_batches, 'train', CancelTrainException)

File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
    192 def _with_events(self, f, event_type, ex, final=noop):
--> 193     try: self(f'before_{event_type}');  f()
    194     except ex: self(f'after_cancel_{event_type}')
    195     self(f'after_{event_type}');  final()

File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:199, in Learner.all_batches(self)
    197 def all_batches(self):
    198     self.n_iter = len(self.dl)
--> 199     for o in enumerate(self.dl): self.one_batch(*o)

File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:227, in Learner.one_batch(self, i, b)
    225 b = self._set_device(b)
    226 self._split(b)
--> 227 self._with_events(self._do_one_batch, 'batch', CancelBatchException)

File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
    192 def _with_events(self, f, event_type, ex, final=noop):
--> 193     try: self(f'before_{event_type}');  f()
    194     except ex: self(f'after_cancel_{event_type}')
    195     self(f'after_{event_type}');  final()

File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:205, in Learner._do_one_batch(self)
    204 def _do_one_batch(self):
--> 205     self.pred = self.model(*self.xb)
    206     self('after_pred')
    207     if len(self.yb):

File ~/ML/venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File ~/ML/venv/lib/python3.10/site-packages/torch/nn/modules/container.py:139, in Sequential.forward(self, input)
    137 def forward(self, input):
    138     for module in self:
--> 139         input = module(input)
    140     return input

File ~/ML/venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File ~/ML/venv/lib/python3.10/site-packages/fastai/text/models/core.py:98, in SentenceEncoder.forward(self, input)
     95 for i in range(0, sl, self.bptt):
     96     #Note: this expects that sequence really begins on a round multiple of bptt
     97     real_bs = (input[:,i] != self.pad_idx).long().sum()
---> 98     o = self.module(input[:real_bs,i: min(i+self.bptt, sl)])
     99     if self.max_len is None or sl-i <= self.max_len:
    100         outs.append(o)

File ~/ML/venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File ~/ML/venv/lib/python3.10/site-packages/fastai/text/models/awdlstm.py:125, in AWD_LSTM.forward(self, inp, from_embeds)
    122 bs,sl = inp.shape[:2] if from_embeds else inp.shape
    123 if bs!=self.bs: self._change_hidden(bs)
--> 125 output = self.input_dp(inp if from_embeds else self.encoder_dp(inp))
    126 new_hidden = []
    127 for l, (rnn,hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):

File ~/ML/venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File ~/ML/venv/lib/python3.10/site-packages/fastai/text/models/awdlstm.py:86, in EmbeddingDropout.forward(self, words, scale)
     84     size = (self.emb.weight.size(0),1)
     85     mask = dropout_mask(self.emb.weight.data, size, self.embed_p)
---> 86     masked_embed = self.emb.weight * mask
     87 else: masked_embed = self.emb.weight
     88 if scale: masked_embed.mul_(scale)

RuntimeError: CUDA out of memory. Tried to allocate 92.00 MiB (GPU 0; 5.93 GiB total capacity; 4.76 GiB already allocated; 58.00 MiB free; 4.87 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

Watching nvidia-smi, GPU memory usage sits at about 2 GB (out of 6 GB) right up until it errors, when it suddenly jumps to 6 GB. I’ve tried lower batch sizes, but they just error later. Either way it ends up taking around an hour, which from what the lesson said shouldn’t be the case; it should be fairly quick. I have a Ryzen 7 3700X, 64 GB of RAM, and an NVIDIA GTX 1060 6 GB.

I think the only thing you can try is bringing down the batch size and seeing what works for you. I don’t think the 1060 supports fp16 the way a 1070 Ti might, but you can try that and see if it gets you past the hurdle. With a smaller GPU, unfortunately, you sort of have to deal with such annoyances. I recall that cell running in a reasonable time on a 1070 Ti with to_fp16() and a batch size of 16 or maybe even 8. The 1070 Ti is 8 GB, btw.
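
To make that concrete, here is a sketch of the quickstart code with those two changes (bs= and to_fp16() are standard fastai, but I haven’t run this on a 1060, and 16 is just an example value):

from fastai.text.all import *

path = untar_data(URLs.IMDB)
# smaller batch size than the default 64, so each batch uses less GPU memory
dls = TextDataLoaders.from_folder(path, valid='test', bs=16)
# to_fp16() trains in mixed precision, which roughly halves activation memory
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy).to_fp16()
learn.fine_tune(2, 1e-2)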

Hi, yes, this can happen: memory use grows during training because the intermediate values needed to compute the derivatives are kept around.

PyTorch saves the activations of every layer in each iteration so it can compute the gradients in the backward pass, and this makes memory usage climb quickly until the out-of-memory error happens. If your GPU can’t handle the memory your model needs, try running it on a newer GPU, for example on Google Colab, and see if the error goes away. Afterwards you can check how much GPU memory Colab actually needed to process everything.
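
If it helps, you can check peak GPU memory from PyTorch itself (these are standard torch.cuda calls), e.g. right after fine_tune finishes on Colab:

import torch

# peak memory held by tensors since the start of the run, in GiB
print(torch.cuda.max_memory_allocated() / 1024**3)
# memory currently reserved by PyTorch's caching allocator (closer to what nvidia-smi shows)
print(torch.cuda.memory_reserved() / 1024**3)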

Or, as @mike.moloch said, you could try shrinking the batch size until the model fits on your GPU. The catch is that as you decrease the batch size, it’s usually better to increase the number of epochs, because the model will have a harder time converging.

1 Like

If you can’t figure out anything better than decreasing the batch size, you might want to use this gradient accumulation logic that Jeremy talks about in one of his videos: Lesson 7: Practical Deep Learning for Coders 2022 - YouTube

This allows you to train the model well even if your GPU can only handle a small batch size: the batch size you use to update the model doesn’t have to match the batch size you physically run on the GPU.
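
fastai ships this as the GradientAccumulation callback, so on the quickstart code it would look roughly like this (the bs and n_acc values are just examples to illustrate the idea):

from fastai.text.all import *  # also brings in GradientAccumulation

path = untar_data(URLs.IMDB)
# small physical batch so a single forward/backward pass fits in 6 GB
dls = TextDataLoaders.from_folder(path, valid='test', bs=8)
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
# accumulate gradients until 64 samples have been seen before each optimizer step,
# so the effective batch size is 64 while only 8 samples sit on the GPU at once
learn.fine_tune(2, 1e-2, cbs=GradientAccumulation(n_acc=64))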

2 Likes