List index out of range with SaveModelCallback

learn1 = Learner(data, Fish, opt_func=optim.SGD, loss_func=FishLoss,
                 callback_fns=[partial(EarlyStoppingCallback, monitor='val_loss',
                                       min_delta=0.1, patience=3)])
learn1.load(SAVE_PATH+'stage1')
learn1.unfreeze()

rn50_stage2_callbacks = [
    CSVLogger(learn=learn1, filename=SAVE_PATH+'stage2-history'),
    ShowGraph(learn=learn1),
    SaveModelCallback(learn1, every='improvement', monitor='val_loss', name='best'),
]
lr = (1e-3)/2
learn1.fit_one_cycle(10, slice(lr), callbacks=rn50_stage2_callbacks)

error:

IndexError                                Traceback (most recent call last)
<ipython-input-30-4a6e9abf40fc> in <module>
      1 rn50_stage2_callbacks = [CSVLogger(learn=learn1, filename=SAVE_PATH+'stage2-history'), ShowGraph(learn=learn1), SaveModelCallback(learn1, every='improvement', monitor='val_loss', name='best')]
      2 lr = (1e-5)/2
----> 3 learn1.fit_one_cycle(10, slice(lr), callbacks=rn50_stage2_callbacks)

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/train.py in fit_one_cycle(learn, cyc_len, max_lr, moms, div_factor, pct_start, final_div, wd, callbacks, tot_epochs, start_epoch)
     20     callbacks.append(OneCycleScheduler(learn, max_lr, moms=moms, div_factor=div_factor, pct_start=pct_start,
     21                                        final_div=final_div, tot_epochs=tot_epochs, start_epoch=start_epoch))
---> 22     learn.fit(cyc_len, max_lr, wd=wd, callbacks=callbacks)
     23 
     24 def lr_find(learn:Learner, start_lr:Floats=1e-7, end_lr:Floats=10, num_it:int=100, stop_div:bool=True, wd:float=None):

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
    195         callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
    196         if defaults.extra_callbacks is not None: callbacks += defaults.extra_callbacks
--> 197         fit(epochs, self, metrics=self.metrics, callbacks=self.callbacks+callbacks)
    198 
    199     def create_opt(self, lr:Floats, wd:Floats=0.)->None:

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/basic_train.py in fit(epochs, learn, callbacks, metrics)
    106                                        cb_handler=cb_handler, pbar=pbar)
    107             else: val_loss=None
--> 108             if cb_handler.on_epoch_end(val_loss): break
    109     except Exception as e:
    110         exception = e

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/callback.py in on_epoch_end(self, val_loss)
    314         "Epoch is done, process `val_loss`."
    315         self.state_dict['last_metrics'] = [val_loss] if val_loss is not None else [None]
--> 316         self('epoch_end', call_mets = val_loss is not None)
    317         self.state_dict['epoch'] += 1
    318         return self.state_dict['stop_training']

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/callback.py in __call__(self, cb_name, call_mets, **kwargs)
    248         if call_mets:
    249             for met in self.metrics: self._call_and_update(met, cb_name, **kwargs)
--> 250         for cb in self.callbacks: self._call_and_update(cb, cb_name, **kwargs)
    251 
    252     def set_dl(self, dl:DataLoader):

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/callback.py in _call_and_update(self, cb, cb_name, **kwargs)
    238     def _call_and_update(self, cb, cb_name, **kwargs)->None:
    239         "Call `cb_name` on `cb` and update the inner state."
--> 240         new = ifnone(getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs), dict())
    241         for k,v in new.items():
    242             if k not in self.state_dict:

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/callbacks/tracker.py in on_epoch_end(self, epoch, **kwargs)
     65     def on_epoch_end(self, epoch, **kwargs:Any)->None:
     66         "Compare the value monitored to its best score and maybe stop training."
---> 67         current = self.get_monitor_value()
     68         if current is None: return
     69         if self.operator(current - self.min_delta, self.best):

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/callbacks/tracker.py in get_monitor_value(self)
     44                   'val_loss':self.learn.recorder.val_losses[-1]}
     45         if values['val_loss'] is None: return
---> 46         for m, n in zip(self.learn.recorder.metrics[-1],self.learn.recorder.names[3:-1]):
     47             values[n] = m
     48         if values.get(self.monitor) is None:

IndexError: list index out of range

It fails after the validation phase of the first epoch.
I haven’t updated my local package, so I’m still using val_loss as the monitor key at the moment.

I guess the problem here is that I didn’t specify any metrics, so self.learn.recorder.metrics[-1] fails?
But why do I need to add a metric to the Learner? And why is metrics used here in the TrackerCallback?

In my case I don’t really need any metrics; I have my loss modules. How can I fix this so that I can monitor val_loss? Thanks in advance.
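To illustrate in plain Python what I think is happening (just a sketch, not fastai code): with learn.metrics empty, the recorder never appends anything to its metrics list, so taking its last element raises:

# stand-in for learn.recorder.metrics when no metrics are registered
recorder_metrics = []
try:
    recorder_metrics[-1]   # the lookup that fails in tracker.py
except IndexError as e:
    print(e)               # -> list index out of range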

This bug was fixed a while ago; check that you have the latest version of fastai!

Thank you, I’ll install from source… currently using v1.0.51

It should be in v1.0.52
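A quick way to confirm which version you’re actually importing:

import fastai
print(fastai.__version__)   # expect '1.0.52' or later for this fix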

Hi sgugger, I’ve updated to 1.0.52 and am now getting a new error.

AttributeError                            Traceback (most recent call last)
<ipython-input-13-104e6d8eff7f> in <module>
      1 rn50_stage2_callbacks = [CSVLogger(learn=learn1, filename=SAVE_PATH+'stage2-history'), ShowGraph(learn=learn1), SaveModelCallback(learn1, every='improvement', monitor='val_loss', name='best')]
      2 lr = (1e-3)/2
----> 3 learn1.fit_one_cycle(10, slice(lr), callbacks=rn50_stage2_callbacks)

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/train.py in fit_one_cycle(learn, cyc_len, max_lr, moms, div_factor, pct_start, final_div, wd, callbacks, tot_epochs, start_epoch)
     20     callbacks.append(OneCycleScheduler(learn, max_lr, moms=moms, div_factor=div_factor, pct_start=pct_start,
     21                                        final_div=final_div, tot_epochs=tot_epochs, start_epoch=start_epoch))
---> 22     learn.fit(cyc_len, max_lr, wd=wd, callbacks=callbacks)
     23 
     24 def lr_find(learn:Learner, start_lr:Floats=1e-7, end_lr:Floats=10, num_it:int=100, stop_div:bool=True, wd:float=None):

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
    197         callbacks = [cb(self) for cb in self.callback_fns + listify(defaults.extra_callback_fns)] + listify(callbacks)
    198         if defaults.extra_callbacks is not None: callbacks += defaults.extra_callbacks
--> 199         fit(epochs, self, metrics=self.metrics, callbacks=self.callbacks+callbacks)
    200 
    201     def create_opt(self, lr:Floats, wd:Floats=0.)->None:

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/basic_train.py in fit(epochs, learn, callbacks, metrics)
    104             if not cb_handler.skip_validate and not learn.data.empty_val:
    105                 val_loss = validate(learn.model, learn.data.valid_dl, loss_func=learn.loss_func,
--> 106                                        cb_handler=cb_handler, pbar=pbar)
    107             else: val_loss=None
    108             if cb_handler.on_epoch_end(val_loss): break

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/basic_train.py in validate(model, dl, loss_func, cb_handler, pbar, average, n_batch)
     60             val_losses.append(val_loss)
     61             if not is_listy(yb): yb = [yb]
---> 62             nums.append(yb[0].shape[0])
     63             if cb_handler and cb_handler.on_batch_end(val_losses[-1]): break
     64             if n_batch and (len(nums)>=n_batch): break

AttributeError: 'list' object has no attribute 'shape'

Here is my data setup and batch transform:

data = dataa.databunch(path='.', bs=2, num_workers=8, collate_fn=bb_pad_collate)

def tfmmm(batch):
    # rescale box coordinates from [-1, 1] to [0, 1]
    batch[1][0] = [(x + 1) * 0.5 for x in batch[1][0]]
    # append a column of ones to each per-image box tensor
    batch[1][0] = [torch.cat((x, torch.ones((x.shape[0], 1)).float().cuda()), 1) for x in batch[1][0]]
    # zero out padded rows, which become [0.5, 0.5, 0.5, 0.5, 1.0] after the steps above
    batch[1][0] = [torch.stack([torch.zeros([5]).float().cuda() if (x == torch.tensor([0.5000, 0.5000, 0.5000, 0.5000, 1.0000]).cuda()).sum() == 5 else x for x in y]).float().cuda() for y in batch[1][0]]
    # batch[1][0] = torch.stack(batch[1][0])
    return batch

data.add_tfm(tfmmm)

I wonder what is expected here. I’m using bb_pad_collate, and the yb in my batch looks something like:

[[tensor([[0.2063, 0.4563, 0.2094, 0.4594, 1.0000],
           [0.2141, 0.4563, 0.2172, 0.4609, 1.0000],
           [0.1984, 0.4563, 0.2016, 0.4594, 1.0000],
           [0.8188, 0.4266, 0.8313, 0.4391, 1.0000]]),
   tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
           [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
           [0.4844, 0.6062, 0.5359, 0.6609, 1.0000],
           [0.6047, 0.7016, 0.6281, 0.7266, 1.0000]])],
  tensor([[1, 1, 1, 1],
          [0, 0, 1, 1]])]

and yb[0] corresponds to a list of tensors, I guess. I wonder what’s happening here and how to solve it.
I think it’s expecting batch[1][0] = torch.stack(batch[1][0]) (see the sketch below), so I’ve added that and started the experiment.
By the way, is there a way to skip the training process and directly debug the validation? Each of my training epochs takes 6 hours, so it’s really time-consuming to discover only after validation that the code fails. Thanks in advance.
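To illustrate why I think stacking helps (a small sketch with made-up shapes matching my batch): validate() counts samples via yb[0].shape[0], which exists on a tensor but not on a list:

import torch

boxes = [torch.rand(4, 5), torch.rand(4, 5)]      # per-image box tensors, as in my batch
yb_bad = [boxes, torch.ones(2, 4)]                # yb[0] is a list -> no .shape -> AttributeError
yb_ok = [torch.stack(boxes), torch.ones(2, 4)]    # yb[0] is a (2, 4, 5) tensor
print(yb_ok[0].shape[0])                          # 2, the batch size validate() needs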

It seems that batch[1][0] = torch.stack(batch[1][0]) is indeed what was needed; I see that it has made it to the second epoch :smile:
Thank you @sgugger !
I wonder if it’s possible to skip the training process and directly debug the validation in fastai?

You can call learn.validate() before fitting if you want to make sure it works (results won’t be good of course before training).
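For example (a minimal sketch with the learner above):

learn1.validate()                       # runs only the validation loop; returns [val_loss] plus any metrics
# or on an explicit DataLoader:
learn1.validate(learn1.data.valid_dl)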
