I’m following chapter 12 of fastbook, and once I switch the model to an LSTM, I can’t get it to run on cuda/GPU. I’m making only one change to the code from the book:
learn.dls.device = 'cuda'
When I run this code with that change:
class LMModel7(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers, p):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.drop = nn.Dropout(p)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h_o.weight = self.i_h.weight
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]

    def forward(self, x):
        raw, h = self.rnn(self.i_h(x), self.h)
        out = self.drop(raw)
        self.h = [h_.detach() for h_ in h]
        return self.h_o(out), raw, out

    def reset(self):
        for h in self.h: h.zero_()

learn = Learner(dls, LMModel7(len(vocab), 64, 2, 0.5),
                loss_func=CrossEntropyLossFlat(), metrics=accuracy,
                cbs=[ModelResetter, RNNRegularizer(alpha=2, beta=1)])
learn.fit_one_cycle(15, 1e-2, wd=0.1)
I get the following error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-96-23912738f0d8> in <module>()
----> 1 learn.fit_one_cycle(15, 1e-2, wd=0.1)
17 frames
/usr/local/lib/python3.6/dist-packages/fastcore/logargs.py in _f(*args, **kwargs)
54 init_args.update(log)
55 setattr(inst, 'init_args', init_args)
---> 56 return inst if to_return else f(*args, **kwargs)
57 return _f
/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
111 scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
112 'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 113 self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
114
115 # Cell
/usr/local/lib/python3.6/dist-packages/fastcore/logargs.py in _f(*args, **kwargs)
54 init_args.update(log)
55 setattr(inst, 'init_args', init_args)
---> 56 return inst if to_return else f(*args, **kwargs)
57 return _f
/usr/local/lib/python3.6/dist-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
205 self.opt.set_hypers(lr=self.lr if lr is None else lr)
206 self.n_epoch = n_epoch
--> 207 self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
208
209 def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None
/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
153
154 def _with_events(self, f, event_type, ex, final=noop):
--> 155 try: self(f'before_{event_type}') ;f()
156 except ex: self(f'after_cancel_{event_type}')
157 finally: self(f'after_{event_type}') ;final()
/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _do_fit(self)
195 for epoch in range(self.n_epoch):
196 self.epoch=epoch
--> 197 self._with_events(self._do_epoch, 'epoch', CancelEpochException)
198
199 @log_args(but='cbs')
/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
153
154 def _with_events(self, f, event_type, ex, final=noop):
--> 155 try: self(f'before_{event_type}') ;f()
156 except ex: self(f'after_cancel_{event_type}')
157 finally: self(f'after_{event_type}') ;final()
/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _do_epoch(self)
189
190 def _do_epoch(self):
--> 191 self._do_epoch_train()
192 self._do_epoch_validate()
193
/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _do_epoch_train(self)
181 def _do_epoch_train(self):
182 self.dl = self.dls.train
--> 183 self._with_events(self.all_batches, 'train', CancelTrainException)
184
185 def _do_epoch_validate(self, ds_idx=1, dl=None):
/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
153
154 def _with_events(self, f, event_type, ex, final=noop):
--> 155 try: self(f'before_{event_type}') ;f()
156 except ex: self(f'after_cancel_{event_type}')
157 finally: self(f'after_{event_type}') ;final()
/usr/local/lib/python3.6/dist-packages/fastai/learner.py in all_batches(self)
159 def all_batches(self):
160 self.n_iter = len(self.dl)
--> 161 for o in enumerate(self.dl): self.one_batch(*o)
162
163 def _do_one_batch(self):
/usr/local/lib/python3.6/dist-packages/fastai/learner.py in one_batch(self, i, b)
177 self.iter = i
178 self._split(b)
--> 179 self._with_events(self._do_one_batch, 'batch', CancelBatchException)
180
181 def _do_epoch_train(self):
/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
153
154 def _with_events(self, f, event_type, ex, final=noop):
--> 155 try: self(f'before_{event_type}') ;f()
156 except ex: self(f'after_cancel_{event_type}')
157 finally: self(f'after_{event_type}') ;final()
/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _do_one_batch(self)
162
163 def _do_one_batch(self):
--> 164 self.pred = self.model(*self.xb)
165 self('after_pred')
166 if len(self.yb): self.loss = self.loss_func(self.pred, *self.yb)
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
720 result = self._slow_forward(*input, **kwargs)
721 else:
--> 722 result = self.forward(*input, **kwargs)
723 for hook in itertools.chain(
724 _global_forward_hooks.values(),
<ipython-input-90-653fa78dda44> in forward(self, x)
9
10 def forward(self, x):
---> 11 raw,h = self.rnn(self.i_h(x), self.h)
12 out = self.drop(raw)
13 self.h = [h_.detach() for h_ in h]
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
720 result = self._slow_forward(*input, **kwargs)
721 else:
--> 722 result = self.forward(*input, **kwargs)
723 for hook in itertools.chain(
724 _global_forward_hooks.values(),
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/rnn.py in forward(self, input, hx)
575 if batch_sizes is None:
576 result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
--> 577 self.dropout, self.training, self.bidirectional, self.batch_first)
578 else:
579 result = _VF.lstm(input, batch_sizes, hx, self._flat_weights, self.bias,
RuntimeError: Input and hidden tensors are not at the same device, found input tensor at cuda:0 and hidden tensor at cpu
My hypothesis is that the hidden state, because it is stored in a plain Python list, is not registered as a parameter (or buffer) of the module, so it never gets moved to cuda when the model does. Is this a bug? Does anyone know how to fix this?
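For reference, a minimal workaround I’ve been considering (I’m not sure it’s the idiomatic fastai fix; the explicit .to(x.device) call is my own addition, not something from the book) is to move the hidden tensors onto the input’s device at the start of forward:

    def forward(self, x):
        # self.h is a plain Python list, so Module.to()/.cuda() never touches it;
        # explicitly move the hidden tensors to whatever device the input is on
        self.h = [h_.to(x.device) for h_ in self.h]
        raw, h = self.rnn(self.i_h(x), self.h)
        out = self.drop(raw)
        self.h = [h_.detach() for h_ in h]
        return self.h_o(out), raw, out

That should make the device mismatch go away, but I’d still like to know whether fastai is supposed to handle this for me.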