CUDA: Out of Memory Error

Hello,
I am currently following lesson 4 of the fast.ai course (part 1, 2022).
I am trying to run every line of code found in the book (Chapter 10).
So I have this learner:

learn = language_model_learner(
    dls_lm, AWD_LSTM, drop_mult = 0.3,
    metrics = [accuracy, Perplexity()]).to_fp16()

but when I run this line:

learn.fit_one_cycle(1, 2e-2)

I get the following error:


OutOfMemoryError                          Traceback (most recent call last)
Cell In[26], line 1
----> 1 learn.fit_one_cycle(1, 2e-2)

File ~/mambaforge/lib/python3.10/site-packages/fastai/callback/schedule.py:119, in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt, start_epoch)
    116 lr_max = np.array([h['lr'] for h in self.opt.hypers])
    117 scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
    118           'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 119 self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)

File ~/mambaforge/lib/python3.10/site-packages/fastai/learner.py:256, in Learner.fit(self, n_epoch, lr, wd, cbs, reset_opt, start_epoch)
    254 self.opt.set_hypers(lr=self.lr if lr is None else lr)
    255 self.n_epoch = n_epoch
--> 256 self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)

File ~/mambaforge/lib/python3.10/site-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
    192 def _with_events(self, f, event_type, ex, final=noop):
--> 193     try: self(f'before_{event_type}'); f()
    194     except ex: self(f'after_cancel_{event_type}')
    195     self(f'after_{event_type}'); final()

File ~/mambaforge/lib/python3.10/site-packages/fastai/learner.py:245, in Learner._do_fit(self)
    243 for epoch in range(self.n_epoch):
    244     self.epoch=epoch
--> 245     self._with_events(self._do_epoch, 'epoch', CancelEpochException)

File ~/mambaforge/lib/python3.10/site-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
    192 def _with_events(self, f, event_type, ex, final=noop):
--> 193     try: self(f'before_{event_type}'); f()
    194     except ex: self(f'after_cancel_{event_type}')
    195     self(f'after_{event_type}'); final()

File ~/mambaforge/lib/python3.10/site-packages/fastai/learner.py:239, in Learner._do_epoch(self)
    238 def _do_epoch(self):
--> 239     self._do_epoch_train()
    240     self._do_epoch_validate()

File ~/mambaforge/lib/python3.10/site-packages/fastai/learner.py:231, in Learner._do_epoch_train(self)
    229 def _do_epoch_train(self):
    230     self.dl = self.dls.train
--> 231     self._with_events(self.all_batches, 'train', CancelTrainException)

File ~/mambaforge/lib/python3.10/site-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
    192 def _with_events(self, f, event_type, ex, final=noop):
--> 193     try: self(f'before_{event_type}'); f()
    194     except ex: self(f'after_cancel_{event_type}')
    195     self(f'after_{event_type}'); final()

File ~/mambaforge/lib/python3.10/site-packages/fastai/learner.py:199, in Learner.all_batches(self)
    197 def all_batches(self):
    198     self.n_iter = len(self.dl)
--> 199     for o in enumerate(self.dl): self.one_batch(*o)

File ~/mambaforge/lib/python3.10/site-packages/fastai/learner.py:227, in Learner.one_batch(self, i, b)
    225 b = self._set_device(b)
    226 self._split(b)
--> 227 self._with_events(self._do_one_batch, 'batch', CancelBatchException)

File ~/mambaforge/lib/python3.10/site-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
    192 def _with_events(self, f, event_type, ex, final=noop):
--> 193     try: self(f'before_{event_type}'); f()
    194     except ex: self(f'after_cancel_{event_type}')
    195     self(f'after_{event_type}'); final()

File ~/mambaforge/lib/python3.10/site-packages/fastai/learner.py:212, in Learner._do_one_batch(self)
    210 self('after_loss')
    211 if not self.training or not len(self.yb): return
--> 212 self._with_events(self._backward, 'backward', CancelBackwardException)
    213 self._with_events(self._step, 'step', CancelStepException)
    214 self.opt.zero_grad()

File ~/mambaforge/lib/python3.10/site-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
    192 def _with_events(self, f, event_type, ex, final=noop):
--> 193     try: self(f'before_{event_type}'); f()
    194     except ex: self(f'after_cancel_{event_type}')
    195     self(f'after_{event_type}'); final()

File ~/mambaforge/lib/python3.10/site-packages/fastai/learner.py:201, in Learner._backward(self)
--> 201 def _backward(self): self.loss_grad.backward()

File ~/mambaforge/lib/python3.10/site-packages/torch/_tensor.py:479, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
    432 r"""Computes the gradient of current tensor w.r.t. graph leaves.
    433
    434 The graph is differentiated using the chain rule. If the tensor is
   (...)
    476     used to compute the :attr:`tensors`.
    477 """
    478 if has_torch_function_unary(self):
--> 479     return handle_torch_function(
    480         Tensor.backward,
    481         (self,),
    482         self,
    483         gradient=gradient,
    484         retain_graph=retain_graph,
    485         create_graph=create_graph,
    486         inputs=inputs,
    487     )
    488 torch.autograd.backward(
    489     self, gradient, retain_graph, create_graph, inputs=inputs
    490 )

File ~/mambaforge/lib/python3.10/site-packages/torch/overrides.py:1534, in handle_torch_function(public_api, relevant_args, *args, **kwargs)
   1528     warnings.warn("Defining your `__torch_function__` as a plain method is deprecated and "
   1529                   "will be an error in future, please define it as a classmethod.",
   1530                   DeprecationWarning)
   1532 # Use `public_api` instead of `implementation` so __torch_function__
   1533 # implementations can do equality/identity comparisons.
-> 1534 result = torch_func_method(public_api, types, args, kwargs)
   1536 if result is not NotImplemented:
   1537     return result

File ~/mambaforge/lib/python3.10/site-packages/fastai/torch_core.py:378, in TensorBase.__torch_function__(cls, func, types, args, kwargs)
    376 if cls.debug and func.__name__ not in ('__str__','__repr__'): print(func, types, args, kwargs)
    377 if _torch_handled(args, cls._opt, func): types = (torch.Tensor,)
--> 378 res = super().__torch_function__(func, types, args, ifnone(kwargs, {}))
    379 dict_objs = _find_args(args) if args else _find_args(list(kwargs.values()))
    380 if issubclass(type(res),TensorBase) and dict_objs: res.set_meta(dict_objs[0],as_copy=True)

File ~/mambaforge/lib/python3.10/site-packages/torch/_tensor.py:1279, in Tensor.__torch_function__(cls, func, types, args, kwargs)
   1276     return NotImplemented
   1278 with _C.DisableTorchFunction():
-> 1279     ret = func(*args, **kwargs)
   1280     if func in get_default_nowrap_functions():
   1281         return ret

File ~/mambaforge/lib/python3.10/site-packages/torch/_tensor.py:488, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
    478 if has_torch_function_unary(self):
    479     return handle_torch_function(
    480         Tensor.backward,
    481         (self,),
   (...)
    486         inputs=inputs,
    487     )
--> 488 torch.autograd.backward(
    489     self, gradient, retain_graph, create_graph, inputs=inputs
    490 )

File ~/mambaforge/lib/python3.10/site-packages/torch/autograd/__init__.py:197, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    192     retain_graph = create_graph
    194 # The reason we repeat same the comment below is that
    195 # some Python versions print out the first line of a multi-line function
    196 # calls in the traceback and some print out the last line
--> 197 Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    198     tensors, grad_tensors_, retain_graph, create_graph, inputs,
    199     allow_unreachable=True, accumulate_grad=True)

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.29 GiB (GPU 0; 8.00 GiB total capacity; 5.34 GiB already allocated; 1.10 GiB free; 5.47 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

I have a laptop with an Nvidia 1070 (8 GB of VRAM).
I’ve already tried restarting my laptop.
I’ve also tried

import torch
torch.cuda.empty_cache()

and the problem is still there.
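(A quick way to double-check how much VRAM is actually free right before training, assuming a PyTorch version recent enough to have torch.cuda.mem_get_info:)

import torch

# mem_get_info returns (free, total) memory for the given CUDA device, in bytes
free, total = torch.cuda.mem_get_info(0)
print(f"free: {free / 1024**3:.2f} GiB / total: {total / 1024**3:.2f} GiB")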

Thank you for your help.

Can you try reducing the batch size of your dataloaders? I believe that should be sufficient to let you proceed with running the notebook. The relevant code/cell from the chapter 10 notebook is shown below:

dls_lm = DataBlock(
    blocks=TextBlock.from_folder(path, is_lm=True),
    get_items=get_imdb,
    splitter=RandomSplitter(0.1)
).dataloaders(path, path=path, bs=128, seq_len=80)

→ so you want to bring the value of the bs (batch size) parameter down.
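For example, something like this (bs=64 is just a starting point; on an 8 GB card you may need to go lower still):

dls_lm = DataBlock(
    blocks=TextBlock.from_folder(path, is_lm=True),
    get_items=get_imdb,
    splitter=RandomSplitter(0.1)
).dataloaders(path, path=path, bs=64, seq_len=80)  # bs lowered from 128 to fit in VRAM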


Thank you, it works! The batch size was too big for my GPU; I lowered it to 64 and training ran fine.
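Side note for anyone hitting the same limit: if a smaller batch size ever hurts training quality, fastai also ships a GradientAccumulation callback that keeps the effective batch size large while only holding a small batch on the GPU at a time. A minimal sketch, assuming the learner and imports from the chapter 10 notebook (the n_acc value here is just an example):

from fastai.text.all import *

# Only bs=64 samples sit on the GPU at once, but gradients are accumulated
# until n_acc=128 samples have been seen before each optimizer step.
learn.fit_one_cycle(1, 2e-2, cbs=GradientAccumulation(n_acc=128))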
