Following steps from Lesson 1 won't train a model

TheNewGuy · October 17, 2020, 6:01pm

When I try to follow the steps from Lesson 1 to train the first model, I get pages of errors which I cannot interpret. The only difference I see is that the video for Lesson 1 shows “from fastai2.vision.all”, whereas the notebook says “from fastai.vision.all”. However, it doesn’t work if I change it to “fastai2”.
Did I somehow set up an old version of the notebook instead of the latest? But even if that were the case, should it not still run?

This is the code I try to run:

And I get these errors

epoch train_loss valid_loss time
0 3.513519 00:12

RuntimeError Traceback (most recent call last)
in
7
8 learn = unet_learner(dls, resnet34)
----> 9 learn.fine_tune(8)

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastcore/logargs.py in _f(*args, **kwargs)
54 init_args.update(log)
55 setattr(inst, ‘init_args’, init_args)
—> 56 return inst if to_return else f(*args, **kwargs)
57 return _f

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/callback/schedule.py in fine_tune(self, epochs, base_lr, freeze_epochs, lr_mult, pct_start, div, **kwargs)
159 “Fine tune with freeze for freeze_epochs then with unfreeze from epochs using discriminative LR”
160 self.freeze()
–> 161 self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
162 base_lr /= 2
163 self.unfreeze()

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastcore/logargs.py in _f(*args, **kwargs)
54 init_args.update(log)
55 setattr(inst, ‘init_args’, init_args)
—> 56 return inst if to_return else f(*args, **kwargs)
57 return _f

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
111 scheds = {‘lr’: combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
112 ‘mom’: combined_cos(pct_start, *(self.moms if moms is None else moms))}
–> 113 self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
114
115 # Cell

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastcore/logargs.py in _f(*args, **kwargs)
54 init_args.update(log)
55 setattr(inst, ‘init_args’, init_args)
—> 56 return inst if to_return else f(*args, **kwargs)
57 return _f

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
205 self.opt.set_hypers(lr=self.lr if lr is None else lr)
206 self.n_epoch = n_epoch
–> 207 self._with_events(self._do_fit, ‘fit’, CancelFitException, self._end_cleanup)
208
209 def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
153
154 def _with_events(self, f, event_type, ex, final=noop):
–> 155 try: self(f’before_{event_type}’) ;f()
156 except ex: self(f’after_cancel_{event_type}’)
157 finally: self(f’after_{event_type}’) ;final()

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/learner.py in _do_fit(self)
195 for epoch in range(self.n_epoch):
196 self.epoch=epoch
–> 197 self._with_events(self._do_epoch, ‘epoch’, CancelEpochException)
198
199 @log_args(but=‘cbs’)

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
153
154 def _with_events(self, f, event_type, ex, final=noop):
–> 155 try: self(f’before_{event_type}’) ;f()
156 except ex: self(f’after_cancel_{event_type}’)
157 finally: self(f’after_{event_type}’) ;final()

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/learner.py in _do_epoch(self)
189
190 def _do_epoch(self):
–> 191 self._do_epoch_train()
192 self._do_epoch_validate()
193

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/learner.py in _do_epoch_train(self)
181 def _do_epoch_train(self):
182 self.dl = self.dls.train
–> 183 self._with_events(self.all_batches, ‘train’, CancelTrainException)
184
185 def _do_epoch_validate(self, ds_idx=1, dl=None):

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
153
154 def _with_events(self, f, event_type, ex, final=noop):
–> 155 try: self(f’before_{event_type}’) ;f()
156 except ex: self(f’after_cancel_{event_type}’)
157 finally: self(f’after_{event_type}’) ;final()

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/learner.py in all_batches(self)
159 def all_batches(self):
160 self.n_iter = len(self.dl)
–> 161 for o in enumerate(self.dl): self.one_batch(*o)
162
163 def _do_one_batch(self):

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/learner.py in one_batch(self, i, b)
177 self.iter = i
178 self._split(b)
–> 179 self._with_events(self._do_one_batch, ‘batch’, CancelBatchException)
180
181 def _do_epoch_train(self):

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
153
154 def _with_events(self, f, event_type, ex, final=noop):
–> 155 try: self(f’before_{event_type}’) ;f()
156 except ex: self(f’after_cancel_{event_type}’)
157 finally: self(f’after_{event_type}’) ;final()

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/learner.py in _do_one_batch(self)
168 if not self.training or not len(self.yb): return
169 self(‘before_backward’)
–> 170 self._backward()
171 self(‘after_backward’)
172 self._step()

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/learner.py in _backward(self)
150
151 def _step(self): self.opt.step()
–> 152 def _backward(self): self.loss.backward()
153
154 def _with_events(self, f, event_type, ex, final=noop):

/opt/conda/envs/fastai/lib/python3.8/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
183 products. Defaults to False.
184 """
–> 185 torch.autograd.backward(self, gradient, retain_graph, create_graph)
186
187 def register_hook(self, hook):

/opt/conda/envs/fastai/lib/python3.8/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
123 retain_graph = create_graph
124
–> 125 Variable._execution_engine.run_backward(
126 tensors, grad_tensors, retain_graph, create_graph,
127 allow_unreachable=True) # allow_unreachable flag

/opt/conda/envs/fastai/lib/python3.8/site-packages/torch/utils/data/_utils/signal_handling.py in handler(signum, frame)
64 # This following call uses waitid with WNOHANG from C side. Therefore,
65 # Python can still get and update the process status successfully.
—> 66 _error_if_any_worker_fails()
67 if previous_handler is not None:
68 previous_handler(signum, frame)

RuntimeError: DataLoader worker (pid 4378) is killed by signal: Killed.

Blink · October 24, 2020, 9:12am

I was getting that same error. It turned out to be because I ran my notebook on a CPU instance instead of a GPU instance. If you’re running this on Paperspace Gradient, try switching to a GPU instance.

TheNewGuy · October 25, 2020, 10:38am

Thanks for the suggestion.
I checked, and I’m definitely running on a GPU.

TheNewGuy · October 25, 2020, 10:40am

This did start working but I have no idea why

JonathanSum · October 25, 2020, 1:35pm

Hi, if no one else is able to help you, you can share the link to the notebook here. I can take a look and possibly debug it in my free time, if you really just followed the steps from Lesson 1 on Colab.

Here is my guess. First, your notebook is a little bit different from the Lesson 1 notebook, and that causes the error.
Second, your number of workers should be set to 0 or 1.

To check whether or not you are using a GPU, you can try to run the code !nvidia-smi and post the picture of the output here.

TheNewGuy · October 26, 2020, 7:43am

Unfortunately, I have no idea how to share a link to a notebook.
But this problem has not persisted, and I cannot replicate it.
Running !nvidia-smi verifies that I am running on a GPU.

Following steps from Lesson 1 won't train a model

epoch train_loss valid_loss time 0 3.513519 00:12

epoch train_loss valid_loss time
0 3.513519 00:12