QRNN RuntimeError: CUDA out of memory

JakobV · November 11, 2020, 2:38pm

Works fine if I change AWD_QRNN to AWD_LSTM.

To reproduce:

path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')
dls = TextDataLoaders.from_df(df, path=path, text_col='text', label_col='label', valid_col='is_valid')
learn = text_classifier_learner(dls, AWD_QRNN)

Got error:
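---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153     def _with_events(self, f, event_type, ex, final=noop):
--> 154         try:       self(f'before_{event_type}')       ;f()
    155         except ex: self(f'after_cancel_{event_type}')

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _do_fit(self)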
    195             self.epoch=epoch
--> 196             self._with_events(self._do_epoch, 'epoch', CancelEpochException)
    197 

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153     def _with_events(self, f, event_type, ex, final=noop):
--> 154         try:       self(f'before_{event_type}')       ;f()
    155         except ex: self(f'after_cancel_{event_type}')

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _do_epoch(self)
    189     def _do_epoch(self):
--> 190         self._do_epoch_train()
    191         self._do_epoch_validate()

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _do_epoch_train(self)
    181         self.dl = self.dls.train
--> 182         self._with_events(self.all_batches, 'train', CancelTrainException)
    183 

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153     def _with_events(self, f, event_type, ex, final=noop):
--> 154         try:       self(f'before_{event_type}')       ;f()
    155         except ex: self(f'after_cancel_{event_type}')

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in all_batches(self)
    159         self.n_iter = len(self.dl)
--> 160         for o in enumerate(self.dl): self.one_batch(*o)
    161 

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in one_batch(self, i, b)
    177         self._split(b)
--> 178         self._with_events(self._do_one_batch, 'batch', CancelBatchException)
    179 

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153     def _with_events(self, f, event_type, ex, final=noop):
--> 154         try:       self(f'before_{event_type}')       ;f()
    155         except ex: self(f'after_cancel_{event_type}')

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _do_one_batch(self)
    162     def _do_one_batch(self):
--> 163         self.pred = self.model(*self.xb)
    164         self('after_pred')

/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(

/opt/conda/lib/python3.8/site-packages/torch/nn/modules/container.py in forward(self, input)
    116         for module in self:
--> 117             input = module(input)
    118         return input

/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(

/opt/conda/lib/python3.8/site-packages/fastai/text/models/core.py in forward(self, input)
     80             real_bs = (input[:,i] != self.pad_idx).long().sum()
---> 81             o = self.module(input[:real_bs,i: min(i+self.bptt, sl)])
     82             if self.max_len is None or sl-i <= self.max_len:

/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(

/opt/conda/lib/python3.8/site-packages/fastai/text/models/awdlstm.py in forward(self, inp, from_embeds)
    105         for l, (rnn,hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):
--> 106             output, new_h = rnn(output, self.hidden[l])
    107             new_hidden.append(new_h)

/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(

/opt/conda/lib/python3.8/site-packages/fastai/text/models/qrnn.py in forward(self, inp, hid)
    161         for i, layer in enumerate(self.layers):
--> 162             inp, h = layer(inp, None if hid is None else hid[2*i if self.bidirectional else i])
    163             new_hid.append(h)

/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(

/opt/conda/lib/python3.8/site-packages/fastai/text/models/qrnn.py in forward(self, inp, hid)
    101     def forward(self, inp, hid=None):
--> 102         y = self.linear(self._get_source(inp))
    103         if self.output_gate: z_gate,f_gate,o_gate = y.chunk(3, dim=2)

/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(

/opt/conda/lib/python3.8/site-packages/fastai/text/models/awdlstm.py in forward(self, *args)
     52             warnings.simplefilter("ignore", category=UserWarning)
---> 53             return self.module(*args)
     54 

/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(

/opt/conda/lib/python3.8/site-packages/torch/nn/modules/linear.py in forward(self, input)
     92     def forward(self, input: Tensor) -> Tensor:
---> 93         return F.linear(input, self.weight, self.bias)
     94 

/opt/conda/lib/python3.8/site-packages/torch/nn/functional.py in linear(input, weight, bias)
   1686         if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops):
-> 1687             return handle_torch_function(linear, tens_ops, input, weight, bias=bias)
   1688     if input.dim() == 2 and bias is not None:

/opt/conda/lib/python3.8/site-packages/torch/overrides.py in handle_torch_function(public_api, relevant_args, *args, **kwargs)
   1062         # implementations can do equality/identity comparisons.
-> 1063         result = overloaded_arg.__torch_function__(public_api, types, args, kwargs)
   1064 

/opt/conda/lib/python3.8/site-packages/fastai/torch_core.py in __torch_function__(self, func, types, args, kwargs)
    316     def __torch_function__(self, func, types, args=(), kwargs=None):
--> 317         with torch._C.DisableTorchFunction(): ret = _convert(func(*args, **(kwargs or {})), self.__class__)
    318         if isinstance(ret, TensorBase): ret.set_meta(self, as_copy=True)

/opt/conda/lib/python3.8/site-packages/torch/nn/functional.py in linear(input, weight, bias)
   1691     else:
-> 1692         output = input.matmul(weight.t())
   1693         if bias is not None:

RuntimeError: CUDA out of memory. Tried to allocate 82.00 MiB (GPU 0; 11.17 GiB total capacity; 10.43 GiB already allocated; 58.44 MiB free; 10.73 GiB reserved in total by PyTorch)

During handling of the above exception, another exception occurred:

RuntimeError                              Traceback (most recent call last)
<ipython-input-4-3bf775d5b609> in <module>
----> 1 lr_min,lr_steep = learn.lr_find()

/opt/conda/lib/python3.8/site-packages/fastai/callback/schedule.py in lr_find(self, start_lr, end_lr, num_it, stop_div, show_plot, suggestions)
    222     n_epoch = num_it//len(self.dls.train) + 1
    223     cb=LRFinder(start_lr=start_lr, end_lr=end_lr, num_it=num_it, stop_div=stop_div)
--> 224     with self.no_logging(): self.fit(n_epoch, cbs=cb)
    225     if show_plot: self.recorder.plot_lr_find()
    226     if suggestions:

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
    203             self.opt.set_hypers(lr=self.lr if lr is None else lr)
    204             self.n_epoch = n_epoch
--> 205             self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
    206 
    207     def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    154         try:       self(f'before_{event_type}')       ;f()
    155         except ex: self(f'after_cancel_{event_type}')
--> 156         finally:   self(f'after_{event_type}')        ;final()
    157 
    158     def all_batches(self):

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in __call__(self, event_name)
    130     def ordered_cbs(self, event): return [cb for cb in sort_by_run(self.cbs) if hasattr(cb, event)]
    131 
--> 132     def __call__(self, event_name): L(event_name).map(self._call_one)
    133 
    134     def _call_one(self, event_name):

/opt/conda/lib/python3.8/site-packages/fastcore/foundation.py in map(self, f, gen, *args, **kwargs)
    224     def range(cls, a, b=None, step=None): return cls(range_of(a, b=b, step=step))
    225 
--> 226     def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))
    227     def argwhere(self, f, negate=False, **kwargs): return self._new(argwhere(self, f, negate, **kwargs))
    228     def filter(self, f=noop, negate=False, gen=False, **kwargs):

/opt/conda/lib/python3.8/site-packages/fastcore/basics.py in map_ex(iterable, f, gen, *args, **kwargs)
    541     res = map(g, iterable)
    542     if gen: return res
--> 543     return list(res)
    544 
    545 # Cell

/opt/conda/lib/python3.8/site-packages/fastcore/basics.py in __call__(self, *args, **kwargs)
    531             if isinstance(v,_Arg): kwargs[k] = args.pop(v.i)
    532         fargs = [args[x.i] if isinstance(x, _Arg) else x for x in self.pargs] + args[self.maxi+1:]
--> 533         return self.func(*fargs, **kwargs)
    534 
    535 # Cell

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _call_one(self, event_name)
    134     def _call_one(self, event_name):
    135         assert hasattr(event, event_name), event_name
--> 136         [cb(event_name) for cb in sort_by_run(self.cbs)]
    137 
    138     def _bn_bias_state(self, with_bias): return norm_bias_params(self.model, with_bias).map(self.opt.state)

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in <listcomp>(.0)
    134     def _call_one(self, event_name):
    135         assert hasattr(event, event_name), event_name
--> 136         [cb(event_name) for cb in sort_by_run(self.cbs)]
    137 
    138     def _bn_bias_state(self, with_bias): return norm_bias_params(self.model, with_bias).map(self.opt.state)

/opt/conda/lib/python3.8/site-packages/fastai/callback/core.py in __call__(self, event_name)
     42                (self.run_valid and not getattr(self, 'training', False)))
     43         res = None
---> 44         if self.run and _run: res = getattr(self, event_name, noop)()
     45         if event_name=='after_fit': self.run=True #Reset self.run to True at each end of fit
     46         return res

/opt/conda/lib/python3.8/site-packages/fastai/callback/schedule.py in after_fit(self)
    192         tmp_f = self.path/self.model_dir/'_tmp.pth'
    193         if tmp_f.exists():
--> 194             self.learn.load('_tmp', with_opt=True)
    195             os.remove(tmp_f)
    196 

/opt/conda/lib/python3.8/site-packages/fastai/text/learner.py in load(self, file, with_opt, device, **kwargs)
    125         if self.opt is None: self.create_opt()
    126         file = join_path_file(file, self.path/self.model_dir, ext='.pth')
--> 127         load_model_text(file, self.model, self.opt, device=device, **kwargs)
    128         return self
    129 

/opt/conda/lib/python3.8/site-packages/fastai/text/learner.py in load_model_text(file, model, opt, with_opt, device, strict)
     71     if isinstance(device, int): device = torch.device('cuda', device)
     72     elif device is None: device = 'cpu'
---> 73     state = torch.load(file, map_location=device)
     74     hasopt = set(state)=={'model', 'opt'}
     75     model_state = state['model'] if hasopt else state

/opt/conda/lib/python3.8/site-packages/torch/serialization.py in load(f, map_location, pickle_module, **pickle_load_args)
    592                     opened_file.seek(orig_position)
    593                     return torch.jit.load(opened_file)
--> 594                 return _load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
    595         return _legacy_load(opened_file, map_location, pickle_module, **pickle_load_args)
    596 

/opt/conda/lib/python3.8/site-packages/torch/serialization.py in _load(zip_file, map_location, pickle_module, pickle_file, **pickle_load_args)
    851     unpickler = pickle_module.Unpickler(data_file, **pickle_load_args)
    852     unpickler.persistent_load = persistent_load
--> 853     result = unpickler.load()
    854 
    855     torch._utils._validate_loaded_sparse_tensors()

/opt/conda/lib/python3.8/site-packages/torch/serialization.py in persistent_load(saved_id)
    843         data_type, key, location, size = data
    844         if key not in loaded_storages:
--> 845             load_tensor(data_type, size, key, _maybe_decode_ascii(location))
    846         storage = loaded_storages[key]
    847         return storage

/opt/conda/lib/python3.8/site-packages/torch/serialization.py in load_tensor(data_type, size, key, location)
    832 
    833         storage = zip_file.get_storage_from_record(name, size, dtype).storage()
--> 834         loaded_storages[key] = restore_location(storage, location)
    835 
    836     def persistent_load(saved_id):

/opt/conda/lib/python3.8/site-packages/torch/serialization.py in restore_location(storage, location)
    812     elif isinstance(map_location, torch.device):
    813         def restore_location(storage, location):
--> 814             return default_restore_location(storage, str(map_location))
    815     else:
    816         def restore_location(storage, location):

/opt/conda/lib/python3.8/site-packages/torch/serialization.py in default_restore_location(storage, location)
    173 def default_restore_location(storage, location):
    174     for _, _, fn in _package_registry:
--> 175         result = fn(storage, location)
    176         if result is not None:
    177             return result

/opt/conda/lib/python3.8/site-packages/torch/serialization.py in _cuda_deserialize(obj, location)
    155                 return storage_type(obj.size())
    156         else:
--> 157             return obj.cuda(device)
    158 
    159 

/opt/conda/lib/python3.8/site-packages/torch/_utils.py in _cuda(self, device, non_blocking, **kwargs)
     77         else:
     78             new_type = getattr(torch.cuda, self.__class__.__name__)
---> 79             return new_type(self.size()).copy_(self, non_blocking)
     80 
     81 

/opt/conda/lib/python3.8/site-packages/torch/cuda/__init__.py in _lazy_new(cls, *args, **kwargs)
    460     # We may need to call lazy init again if we are a forked child
    461     # del _CudaBase.__new__
--> 462     return super(_CudaBase, cls).__new__(cls, *args, **kwargs)
    463 
    464 

RuntimeError: CUDA out of memory. Tried to allocate 28.00 MiB (GPU 0; 11.17 GiB total capacity; 10.48 GiB already allocated; 14.44 MiB free; 10.77 GiB reserved in total by PyTorch)

utkb · November 12, 2020, 9:52am

I think QRNN needs more GPU-mem than LSTM? So it’s just telling you that your GPU does not have enough mem for the model?

I haven’t used fastai.text in a while. Not sure whether you can use learn.to_fp16() (half precision) to reduce the GPU memory requirement?

Otherwise, I guess you’ll need to try a different GPU with more GPU-mem… Good luck.

Yijin

JakobV · November 12, 2020, 2:06pm

@utkb Thanks for replying!

I reduced the batch size in the dataloader, which solved it.

Then I ran into this instead hehe xD

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-4-ca61b3aa75fc> in <module>
----> 1 learn.fine_tune(1)

/opt/conda/lib/python3.8/site-packages/fastai/callback/schedule.py in fine_tune(self, epochs, base_lr, freeze_epochs, lr_mult, pct_start, div, **kwargs)
    158     base_lr /= 2
    159     self.unfreeze()
--> 160     self.fit_one_cycle(epochs, slice(base_lr/lr_mult, base_lr), pct_start=pct_start, div=div, **kwargs)
    161 
    162 # Cell

/opt/conda/lib/python3.8/site-packages/fastai/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
    110     scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
    111               'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 112     self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
    113 
    114 # Cell

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
    203             self.opt.set_hypers(lr=self.lr if lr is None else lr)
    204             self.n_epoch = n_epoch
--> 205             self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
    206 
    207     def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    152 
    153     def _with_events(self, f, event_type, ex, final=noop):
--> 154         try:       self(f'before_{event_type}')       ;f()
    155         except ex: self(f'after_cancel_{event_type}')
    156         finally:   self(f'after_{event_type}')        ;final()

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _do_fit(self)
    194         for epoch in range(self.n_epoch):
    195             self.epoch=epoch
--> 196             self._with_events(self._do_epoch, 'epoch', CancelEpochException)
    197 
    198     def fit(self, n_epoch, lr=None, wd=None, cbs=None, reset_opt=False):

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    152 
    153     def _with_events(self, f, event_type, ex, final=noop):
--> 154         try:       self(f'before_{event_type}')       ;f()
    155         except ex: self(f'after_cancel_{event_type}')
    156         finally:   self(f'after_{event_type}')        ;final()

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _do_epoch(self)
    188 
    189     def _do_epoch(self):
--> 190         self._do_epoch_train()
    191         self._do_epoch_validate()
    192 

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _do_epoch_train(self)
    180     def _do_epoch_train(self):
    181         self.dl = self.dls.train
--> 182         self._with_events(self.all_batches, 'train', CancelTrainException)
    183 
    184     def _do_epoch_validate(self, ds_idx=1, dl=None):

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    152 
    153     def _with_events(self, f, event_type, ex, final=noop):
--> 154         try:       self(f'before_{event_type}')       ;f()
    155         except ex: self(f'after_cancel_{event_type}')
    156         finally:   self(f'after_{event_type}')        ;final()

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in all_batches(self)
    158     def all_batches(self):
    159         self.n_iter = len(self.dl)
--> 160         for o in enumerate(self.dl): self.one_batch(*o)
    161 
    162     def _do_one_batch(self):

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in one_batch(self, i, b)
    176         self.iter = i
    177         self._split(b)
--> 178         self._with_events(self._do_one_batch, 'batch', CancelBatchException)
    179 
    180     def _do_epoch_train(self):

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    152 
    153     def _with_events(self, f, event_type, ex, final=noop):
--> 154         try:       self(f'before_{event_type}')       ;f()
    155         except ex: self(f'after_cancel_{event_type}')
    156         finally:   self(f'after_{event_type}')        ;final()

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _do_one_batch(self)
    167         if not self.training or not len(self.yb): return
    168         self('before_backward')
--> 169         self._backward()
    170         self('after_backward')
    171         self._step()

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _backward(self)
    149 
    150     def _step(self): self.opt.step()
--> 151     def _backward(self): self.loss.backward()
    152 
    153     def _with_events(self, f, event_type, ex, final=noop):

/opt/conda/lib/python3.8/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    212         from torch.overrides import has_torch_function, handle_torch_function
    213         if type(self) is not Tensor and has_torch_function(relevant_args):
--> 214             return handle_torch_function(
    215                 Tensor.backward,
    216                 relevant_args,

/opt/conda/lib/python3.8/site-packages/torch/overrides.py in handle_torch_function(public_api, relevant_args, *args, **kwargs)
   1061         # Use `public_api` instead of `implementation` so __torch_function__
   1062         # implementations can do equality/identity comparisons.
-> 1063         result = overloaded_arg.__torch_function__(public_api, types, args, kwargs)
   1064 
   1065         if result is not NotImplemented:

/opt/conda/lib/python3.8/site-packages/fastai/torch_core.py in __torch_function__(self, func, types, args, kwargs)
    315 
    316     def __torch_function__(self, func, types, args=(), kwargs=None):
--> 317         with torch._C.DisableTorchFunction(): ret = _convert(func(*args, **(kwargs or {})), self.__class__)
    318         if isinstance(ret, TensorBase): ret.set_meta(self, as_copy=True)
    319         return ret

/opt/conda/lib/python3.8/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    219                 retain_graph=retain_graph,
    220                 create_graph=create_graph)
--> 221         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    222 
    223     def register_hook(self, hook):

/opt/conda/lib/python3.8/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
    128         retain_graph = create_graph
    129 
--> 130     Variable._execution_engine.run_backward(
    131         tensors, grad_tensors_, retain_graph, create_graph,
    132         allow_unreachable=True)  # allow_unreachable flag

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [2, 40, 400]], which is output 0 of TanhBackward, is at version 3; expected version 2 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

To reproduce:

import fastbook
fastbook.setup_book()
from fastbook import *
from fastai.text.all import * 
import pandas as pd

path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')
dls = TextDataLoaders.from_df(df, path=path, text_col='text', label_col='label', valid_col='is_valid', bs=8)
learn = text_classifier_learner(dls, 
                                AWD_QRNN,
                                drop_mult=0.5,
                                metrics=accuracy )
del df

learn.fine_tune(1)

utkb · November 13, 2020, 9:06am

The RuntimeError message seems to suggest that it’s a PyTorch version mismatch. Check the versions in your installation (fastai, fastcore, fastbook, pytorch, torchvision) against the recommended/required ones on fastai’s GitHub repo.

Googling the error message returns a number of workarounds (when getting this error between different PyTorch versions), though it sounds like they will likely incur a performance penalty. Best sort out a software stack that is consistent with the fastai version you are using.

Yijin

JakobV · November 16, 2020, 11:20am

Thank you!