Fastai LSTM not running on CUDA/GPU

I’m following chapter 12 of fastbook, and once I switch the model to an LSTM, I can’t make it run on CUDA/GPU. I’m making only one change to the code from the book:

learn.dls.device = 'cuda'

When I run this code with that change:

class LMModel7(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers, p):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.drop = nn.Dropout(p)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h_o.weight = self.i_h.weight
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]
        
    def forward(self, x):
        raw,h = self.rnn(self.i_h(x), self.h)
        out = self.drop(raw)
        self.h = [h_.detach() for h_ in h]
        return self.h_o(out),raw,out
    
    def reset(self): 
        for h in self.h: h.zero_()

learn = Learner(dls, LMModel7(len(vocab), 64, 2, 0.5),
                loss_func=CrossEntropyLossFlat(), metrics=accuracy,
                cbs=[ModelResetter, RNNRegularizer(alpha=2, beta=1)])

learn.fit_one_cycle(15, 1e-2, wd=0.1)

I get the following error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-96-23912738f0d8> in <module>()
----> 1 learn.fit_one_cycle(15, 1e-2, wd=0.1)

17 frames
/usr/local/lib/python3.6/dist-packages/fastcore/logargs.py in _f(*args, **kwargs)
     54         init_args.update(log)
     55         setattr(inst, 'init_args', init_args)
---> 56         return inst if to_return else f(*args, **kwargs)
     57     return _f

/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
    111     scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
    112               'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 113     self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
    114 
    115 # Cell

/usr/local/lib/python3.6/dist-packages/fastcore/logargs.py in _f(*args, **kwargs)
     54         init_args.update(log)
     55         setattr(inst, 'init_args', init_args)
---> 56         return inst if to_return else f(*args, **kwargs)
     57     return _f

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
    205             self.opt.set_hypers(lr=self.lr if lr is None else lr)
    206             self.n_epoch = n_epoch
--> 207             self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
    208 
    209     def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _do_fit(self)
    195         for epoch in range(self.n_epoch):
    196             self.epoch=epoch
--> 197             self._with_events(self._do_epoch, 'epoch', CancelEpochException)
    198 
    199     @log_args(but='cbs')

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _do_epoch(self)
    189 
    190     def _do_epoch(self):
--> 191         self._do_epoch_train()
    192         self._do_epoch_validate()
    193 

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _do_epoch_train(self)
    181     def _do_epoch_train(self):
    182         self.dl = self.dls.train
--> 183         self._with_events(self.all_batches, 'train', CancelTrainException)
    184 
    185     def _do_epoch_validate(self, ds_idx=1, dl=None):

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in all_batches(self)
    159     def all_batches(self):
    160         self.n_iter = len(self.dl)
--> 161         for o in enumerate(self.dl): self.one_batch(*o)
    162 
    163     def _do_one_batch(self):

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in one_batch(self, i, b)
    177         self.iter = i
    178         self._split(b)
--> 179         self._with_events(self._do_one_batch, 'batch', CancelBatchException)
    180 
    181     def _do_epoch_train(self):

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    153 
    154     def _with_events(self, f, event_type, ex, final=noop):
--> 155         try:       self(f'before_{event_type}')       ;f()
    156         except ex: self(f'after_cancel_{event_type}')
    157         finally:   self(f'after_{event_type}')        ;final()

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _do_one_batch(self)
    162 
    163     def _do_one_batch(self):
--> 164         self.pred = self.model(*self.xb)
    165         self('after_pred')
    166         if len(self.yb): self.loss = self.loss_func(self.pred, *self.yb)

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

<ipython-input-90-653fa78dda44> in forward(self, x)
      9 
     10     def forward(self, x):
---> 11         raw,h = self.rnn(self.i_h(x), self.h)
     12         out = self.drop(raw)
     13         self.h = [h_.detach() for h_ in h]

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/rnn.py in forward(self, input, hx)
    575         if batch_sizes is None:
    576             result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
--> 577                               self.dropout, self.training, self.bidirectional, self.batch_first)
    578         else:
    579             result = _VF.lstm(input, batch_sizes, hx, self._flat_weights, self.bias,

RuntimeError: Input and hidden tensors are not at the same device, found input tensor at cuda:0 and hidden tensor at cpu

My hypothesis is that the hidden state, being stored in a plain Python list, is not registered as a parameter (or buffer) of the module and therefore is not moved to CUDA. Is this a bug? Do you know how to fix this?
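A quick way to confirm this (just a sketch, assuming the Learner has already placed the model on the GPU, as it does when training starts) is to compare the device of the registered parameters with the device of the stored hidden state:

model = learn.model
print(next(model.parameters()).device)   # cuda:0 -- registered parameters get moved
print(model.h[0].device)                 # cpu    -- tensors kept in a plain Python list do not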

Try doing:

net = LMModel7(len(vocab), 64, 2, 0.5).cuda()

And then pass net to the Learner.
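In context, that would look roughly like this (a sketch reusing the arguments from the original post; note that the hidden state in self.h is still created on the CPU, which is discussed further down the thread):

net = LMModel7(len(vocab), 64, 2, 0.5).cuda()
learn = Learner(dls, net,
                loss_func=CrossEntropyLossFlat(), metrics=accuracy,
                cbs=[ModelResetter, RNNRegularizer(alpha=2, beta=1)])
learn.fit_one_cycle(15, 1e-2, wd=0.1)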

Thanks Zach! Putting the model on cuda wasn’t sufficient, but I was finally able to make it work with the following line:
self.h = [torch.zeros(n_layers*2, BS, n_hidden).to('cuda') for _ in range(2)]
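A slightly more portable variant of the same idea (just a sketch, using fastai’s default_device helper instead of hard-coding 'cuda'; the shape here follows the original LMModel7 definition) would be to build the hidden state in __init__ like this:

self.h = [torch.zeros(n_layers, bs, n_hidden, device=default_device()) for _ in range(2)]

default_device() lives in fastai.torch_core; the cleaner device-agnostic fix using one_param is shown further down in the thread.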

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
in ()
----> 1 learn.model(x).shape

5 frames
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

/usr/local/lib/python3.6/dist-packages/fastai/text/models/awdlstm.py in forward(self, inp, from_embeds)
    104         new_hidden = []
    105         for l, (rnn,hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):
--> 106             output, new_h = rnn(output, self.hidden[l])
    107             new_hidden.append(new_h)
    108             if l != self.n_layers - 1: output = hid_dp(output)

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

/usr/local/lib/python3.6/dist-packages/fastai/text/models/awdlstm.py in forward(self, *args)
     51             # To avoid the warning that comes because the weights aren't flattened.
     52             warnings.simplefilter("ignore", category=UserWarning)
---> 53             return self.module(*args)
     54 
     55     def reset(self):

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/rnn.py in forward(self, input, hx)
    580         if batch_sizes is None:
    581             result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
--> 582                               self.dropout, self.training, self.bidirectional, self.batch_first)
    583         else:
    584             result = _VF.lstm(input, batch_sizes, hx, self._flat_weights, self.bias,

RuntimeError: Input and hidden tensors are not at the same device, found input tensor at cuda:0 and hidden tensor at cpu

Please help, I am getting this runtime error when calling learn.model(x) directly.

Good question. It appears that when running AWD_LSTM via fastai helpers like text_classifier_learner, the hidden state of the LSTM gets moved to cuda properly. But when we run AWD_LSTM on its own, the hidden state remains on the cpu. I haven’t found the magic that moves that hidden state to cuda. Maybe someone more experienced like @muellerzr has a clue?

I answered this question here: https://github.com/fastai/fastai/issues/3158
The insight is that the hidden states are not parameters of the model, so they are not moved to the GPU by the cuda helper.
So you need to do it yourself, by calling model.reset() or adding the ModelResetter callback to the Learner.
Indeed, when you use a text_learner, the ModelResetter callback is added automatically.
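For the standalone AWD_LSTM case above, that would look roughly like this (a sketch, assuming x is a batch that has already been moved to the GPU):

model = learn.model.cuda()   # registered parameters move to the GPU
model.reset()                # rebuilds the hidden state on the same device as the parameters
out = model(x.cuda())

During training the ModelResetter callback does the equivalent for you by calling reset() at the appropriate points.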
TL;DR

For your particular case of LMModel7, indeed, this model will not put the hidden state on the GPU when doing reset. The trick is to use the one_param function from fastai, which gives you a parameter of the model and therefore lets you create new tensors on the same device as the model’s parameters.
Replace your reset with:
def zero_h(self):
    return [one_param(self).new_zeros(*h_.shape) for h_ in self.h]

def reset(self):
    self.h = self.zero_h()
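Putting it together, the model from the original post would look roughly like this (a sketch; bs is the global batch size from the chapter, and one_param comes with the usual fastai imports):

class LMModel7(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers, p):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.drop = nn.Dropout(p)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h_o.weight = self.i_h.weight
        # created on the CPU here; reset() rebuilds it on the model's device
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]

    def forward(self, x):
        raw,h = self.rnn(self.i_h(x), self.h)
        out = self.drop(raw)
        self.h = [h_.detach() for h_ in h]
        return self.h_o(out),raw,out

    def zero_h(self):
        return [one_param(self).new_zeros(*h_.shape) for h_ in self.h]

    def reset(self):
        self.h = self.zero_h()

Since ModelResetter is in the callback list, reset() runs after the Learner has placed the model on learn.dls.device, so the hidden state ends up on the GPU without any hard-coded 'cuda'.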

Awesome, thanks for the explanation Thomas!

=)
I faced the same issue one year ago. Re-implementing AWD_LSTM for my own problem helped me understand all of this.
one_param is super handy!
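For anyone landing here later: one_param(m) just returns one of m’s registered parameters, so tensor-factory methods like new_zeros create tensors on the same device (and with the same dtype) as the model’s weights. A minimal sketch:

p = one_param(model)           # a registered parameter of the model
h0 = p.new_zeros(2, 64, 64)    # allocated on the same device/dtype as the model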
