Hi all,
I am trying to learn fastai. I've watched lesson 1 and gone through most of the quickstart guide. The NLP quickstart, however, never finishes training (stack trace below).
RuntimeError Traceback (most recent call last)
Cell In [3], line 3
1 dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test')
2 learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
----> 3 learn.fine_tune(2, 1e-2)
File ~/ML/venv/lib/python3.10/site-packages/fastai/callback/schedule.py:168, in fine_tune(self, epochs, base_lr, freeze_epochs, lr_mult, pct_start, div, **kwargs)
166 base_lr /= 2
167 self.unfreeze()
--> 168 self.fit_one_cycle(epochs, slice(base_lr/lr_mult, base_lr), pct_start=pct_start, div=div, **kwargs)
File ~/ML/venv/lib/python3.10/site-packages/fastai/callback/schedule.py:119, in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt, start_epoch)
116 lr_max = np.array([h['lr'] for h in self.opt.hypers])
117 scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
118 'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 119 self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:256, in Learner.fit(self, n_epoch, lr, wd, cbs, reset_opt, start_epoch)
254 self.opt.set_hypers(lr=self.lr if lr is None else lr)
255 self.n_epoch = n_epoch
--> 256 self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
192 def _with_events(self, f, event_type, ex, final=noop):
--> 193 try: self(f'before_{event_type}'); f()
194 except ex: self(f'after_cancel_{event_type}')
195 self(f'after_{event_type}'); final()
File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:245, in Learner._do_fit(self)
243 for epoch in range(self.n_epoch):
244 self.epoch=epoch
--> 245 self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
192 def _with_events(self, f, event_type, ex, final=noop):
--> 193 try: self(f'before_{event_type}'); f()
194 except ex: self(f'after_cancel_{event_type}')
195 self(f'after_{event_type}'); final()
File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:239, in Learner._do_epoch(self)
238 def _do_epoch(self):
--> 239 self._do_epoch_train()
240 self._do_epoch_validate()
File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:231, in Learner._do_epoch_train(self)
229 def _do_epoch_train(self):
230 self.dl = self.dls.train
--> 231 self._with_events(self.all_batches, 'train', CancelTrainException)
File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
192 def _with_events(self, f, event_type, ex, final=noop):
--> 193 try: self(f'before_{event_type}'); f()
194 except ex: self(f'after_cancel_{event_type}')
195 self(f'after_{event_type}'); final()
File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:199, in Learner.all_batches(self)
197 def all_batches(self):
198 self.n_iter = len(self.dl)
--> 199 for o in enumerate(self.dl): self.one_batch(*o)
File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:227, in Learner.one_batch(self, i, b)
225 b = self._set_device(b)
226 self._split(b)
--> 227 self._with_events(self._do_one_batch, 'batch', CancelBatchException)
File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:193, in Learner._with_events(self, f, event_type, ex, final)
192 def _with_events(self, f, event_type, ex, final=noop):
--> 193 try: self(f'before_{event_type}'); f()
194 except ex: self(f'after_cancel_{event_type}')
195 self(f'after_{event_type}'); final()
File ~/ML/venv/lib/python3.10/site-packages/fastai/learner.py:205, in Learner._do_one_batch(self)
204 def _do_one_batch(self):
--> 205 self.pred = self.model(*self.xb)
206 self('after_pred')
207 if len(self.yb):
File ~/ML/venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File ~/ML/venv/lib/python3.10/site-packages/torch/nn/modules/container.py:139, in Sequential.forward(self, input)
137 def forward(self, input):
138 for module in self:
--> 139 input = module(input)
140 return input
File ~/ML/venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File ~/ML/venv/lib/python3.10/site-packages/fastai/text/models/core.py:98, in SentenceEncoder.forward(self, input)
95 for i in range(0, sl, self.bptt):
96 #Note: this expects that sequence really begins on a round multiple of bptt
97 real_bs = (input[:,i] != self.pad_idx).long().sum()
---> 98 o = self.module(input[:real_bs,i: min(i+self.bptt, sl)])
99 if self.max_len is None or sl-i <= self.max_len:
100 outs.append(o)
File ~/ML/venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File ~/ML/venv/lib/python3.10/site-packages/fastai/text/models/awdlstm.py:125, in AWD_LSTM.forward(self, inp, from_embeds)
122 bs,sl = inp.shape[:2] if from_embeds else inp.shape
123 if bs!=self.bs: self._change_hidden(bs)
--> 125 output = self.input_dp(inp if from_embeds else self.encoder_dp(inp))
126 new_hidden = []
127 for l, (rnn,hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):
File ~/ML/venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File ~/ML/venv/lib/python3.10/site-packages/fastai/text/models/awdlstm.py:86, in EmbeddingDropout.forward(self, words, scale)
84 size = (self.emb.weight.size(0),1)
85 mask = dropout_mask(self.emb.weight.data, size, self.embed_p)
---> 86 masked_embed = self.emb.weight * mask
87 else: masked_embed = self.emb.weight
88 if scale: masked_embed.mul_(scale)
RuntimeError: CUDA out of memory. Tried to allocate 92.00 MiB (GPU 0; 5.93 GiB total capacity; 4.76 GiB already allocated; 58.00 MiB free; 4.87 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Watching nvidia-smi, GPU memory sits at about 2 GB (of 6 GB) right up until the error, when it suddenly jumps to 6 GB. I've tried lower batch sizes (roughly as in the sketch below), but they just push the error later into training. With a small enough batch size it does complete, but it ends up taking around an hour. From what the lesson said this shouldn't be the case; it should be fairly quick. My hardware: Ryzen 7 3700X, 64 GB of RAM, and an NVIDIA GTX 1060 6GB.
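For reference, here is roughly what I've been running to try to work around it. This is just my own sketch, not anything from the lesson: bs=32 is a guess (the quickstart default is 64), and the max_split_size_mb value comes from the suggestion in the error message itself:

```python
import os
# Suggested by the OOM message itself; must be set before CUDA is initialized.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

from fastai.text.all import *

# Same as the quickstart, but with a smaller batch size (default is bs=64).
dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test', bs=32)
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
learn.fine_tune(2, 1e-2)
```

Even with this, I still end up with either the OOM or the hour-long run described above. Is there something else I should be changing for a 6 GB card?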