List index out of range with SaveModelCallback

learn1 = Learner(data, Fish, opt_func=optim.SGD, loss_func=FishLoss,
                 callback_fns=[partial(EarlyStoppingCallback, monitor='val_loss',
                                       min_delta=0.1, patience=3)])
learn1.load(SAVE_PATH+'stage1')
learn1.unfreeze()

rn50_stage2_callbacks = [
    CSVLogger(learn=learn1, filename=SAVE_PATH+'stage2-history'),
    ShowGraph(learn=learn1),
    SaveModelCallback(learn1, every='improvement', monitor='val_loss', name='best'),
]
lr = (1e-3)/2
learn1.fit_one_cycle(10, slice(lr), callbacks=rn50_stage2_callbacks)

error:

IndexError                                Traceback (most recent call last)
<ipython-input-30-4a6e9abf40fc> in <module>
      1 rn50_stage2_callbacks = [CSVLogger(learn=learn1, filename=SAVE_PATH+'stage2-history'), ShowGraph(learn=learn1), SaveModelCallback(learn1, every='improvement', monitor='val_loss', name='best')]
      2 lr = (1e-5)/2
----> 3 learn1.fit_one_cycle(10, slice(lr), callbacks=rn50_stage2_callbacks)

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/train.py in fit_one_cycle(learn, cyc_len, max_lr, moms, div_factor, pct_start, final_div, wd, callbacks, tot_epochs, start_epoch)
     20     callbacks.append(OneCycleScheduler(learn, max_lr, moms=moms, div_factor=div_factor, pct_start=pct_start,
     21                                        final_div=final_div, tot_epochs=tot_epochs, start_epoch=start_epoch))
---> 22     learn.fit(cyc_len, max_lr, wd=wd, callbacks=callbacks)
     23 
     24 def lr_find(learn:Learner, start_lr:Floats=1e-7, end_lr:Floats=10, num_it:int=100, stop_div:bool=True, wd:float=None):

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
    195         callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
    196         if defaults.extra_callbacks is not None: callbacks += defaults.extra_callbacks
--> 197         fit(epochs, self, metrics=self.metrics, callbacks=self.callbacks+callbacks)
    198 
    199     def create_opt(self, lr:Floats, wd:Floats=0.)->None:

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/basic_train.py in fit(epochs, learn, callbacks, metrics)
    106                                        cb_handler=cb_handler, pbar=pbar)
    107             else: val_loss=None
--> 108             if cb_handler.on_epoch_end(val_loss): break
    109     except Exception as e:
    110         exception = e

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/callback.py in on_epoch_end(self, val_loss)
    314         "Epoch is done, process `val_loss`."
    315         self.state_dict['last_metrics'] = [val_loss] if val_loss is not None else [None]
--> 316         self('epoch_end', call_mets = val_loss is not None)
    317         self.state_dict['epoch'] += 1
    318         return self.state_dict['stop_training']

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/callback.py in __call__(self, cb_name, call_mets, **kwargs)
    248         if call_mets:
    249             for met in self.metrics: self._call_and_update(met, cb_name, **kwargs)
--> 250         for cb in self.callbacks: self._call_and_update(cb, cb_name, **kwargs)
    251 
    252     def set_dl(self, dl:DataLoader):

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/callback.py in _call_and_update(self, cb, cb_name, **kwargs)
    238     def _call_and_update(self, cb, cb_name, **kwargs)->None:
    239         "Call `cb_name` on `cb` and update the inner state."
--> 240         new = ifnone(getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs), dict())
    241         for k,v in new.items():
    242             if k not in self.state_dict:

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/callbacks/tracker.py in on_epoch_end(self, epoch, **kwargs)
     65     def on_epoch_end(self, epoch, **kwargs:Any)->None:
     66         "Compare the value monitored to its best score and maybe stop training."
---> 67         current = self.get_monitor_value()
     68         if current is None: return
     69         if self.operator(current - self.min_delta, self.best):

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/callbacks/tracker.py in get_monitor_value(self)
     44                   'val_loss':self.learn.recorder.val_losses[-1]}
     45         if values['val_loss'] is None: return
---> 46         for m, n in zip(self.learn.recorder.metrics[-1],self.learn.recorder.names[3:-1]):
     47             values[n] = m
     48         if values.get(self.monitor) is None:

IndexError: list index out of range

It fails after the validation phase of the first epoch.
I haven’t updated my local package, so I’m still using val_loss as the monitor key at the moment.

I guess the problem here is that I didn’t specify any metrics, so self.learn.recorder.metrics[-1] fails?
But why do I need to add a metric to the Learner? And why is metrics used here in the TrackerCallback?

In my case I don’t really need any metrics; I have my loss modules. How can I fix this so that I can monitor val_loss? Thanks in advance.
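To illustrate in plain Python what I think is happening (just a sketch, not fastai code): with learn.metrics empty, the recorder never appends anything to its metrics list, so taking its last element raises:

# stand-in for learn.recorder.metrics when no metrics are registered
recorder_metrics = []
try:
    recorder_metrics[-1]   # the lookup that fails in tracker.py
except IndexError as e:
    print(e)               # -> list index out of range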

This bug was fixed a while ago; check that you have the latest version of fastai!

Thank you, I’ll install from source… currently using v1.0.51

It should be in v1.0.52
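A quick way to confirm which version you’re actually importing:

import fastai
print(fastai.__version__)   # expect '1.0.52' or later for this fix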

Hi sgugger, I’ve updated to 1.0.52 and am now getting a new error.

AttributeError                            Traceback (most recent call last)
<ipython-input-13-104e6d8eff7f> in <module>
      1 rn50_stage2_callbacks = [CSVLogger(learn=learn1, filename=SAVE_PATH+'stage2-history'), ShowGraph(learn=learn1), SaveModelCallback(learn1, every='improvement', monitor='val_loss', name='best')]
      2 lr = (1e-3)/2
----> 3 learn1.fit_one_cycle(10, slice(lr), callbacks=rn50_stage2_callbacks)

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/train.py in fit_one_cycle(learn, cyc_len, max_lr, moms, div_factor, pct_start, final_div, wd, callbacks, tot_epochs, start_epoch)
     20     callbacks.append(OneCycleScheduler(learn, max_lr, moms=moms, div_factor=div_factor, pct_start=pct_start,
     21                                        final_div=final_div, tot_epochs=tot_epochs, start_epoch=start_epoch))
---> 22     learn.fit(cyc_len, max_lr, wd=wd, callbacks=callbacks)
     23 
     24 def lr_find(learn:Learner, start_lr:Floats=1e-7, end_lr:Floats=10, num_it:int=100, stop_div:bool=True, wd:float=None):

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
    197         callbacks = [cb(self) for cb in self.callback_fns + listify(defaults.extra_callback_fns)] + listify(callbacks)
    198         if defaults.extra_callbacks is not None: callbacks += defaults.extra_callbacks
--> 199         fit(epochs, self, metrics=self.metrics, callbacks=self.callbacks+callbacks)
    200 
    201     def create_opt(self, lr:Floats, wd:Floats=0.)->None:

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/basic_train.py in fit(epochs, learn, callbacks, metrics)
    104             if not cb_handler.skip_validate and not learn.data.empty_val:
    105                 val_loss = validate(learn.model, learn.data.valid_dl, loss_func=learn.loss_func,
--> 106                                        cb_handler=cb_handler, pbar=pbar)
    107             else: val_loss=None
    108             if cb_handler.on_epoch_end(val_loss): break

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/fastai/basic_train.py in validate(model, dl, loss_func, cb_handler, pbar, average, n_batch)
     60             val_losses.append(val_loss)
     61             if not is_listy(yb): yb = [yb]
---> 62             nums.append(yb[0].shape[0])
     63             if cb_handler and cb_handler.on_batch_end(val_losses[-1]): break
     64             if n_batch and (len(nums)>=n_batch): break

AttributeError: 'list' object has no attribute 'shape'

Here is my data setup and batch transform:

data = dataa.databunch(path='.', bs=2, num_workers=8, collate_fn=bb_pad_collate)

def tfmmm(batch):
    # rescale box coordinates from [-1, 1] to [0, 1]
    batch[1][0] = [(x + 1) * 0.5 for x in batch[1][0]]
    # append a column of ones to each per-image box tensor
    batch[1][0] = [torch.cat((x, torch.ones((x.shape[0], 1)).float().cuda()), 1) for x in batch[1][0]]
    # zero out padded rows, which become [0.5, 0.5, 0.5, 0.5, 1.0] after the steps above
    batch[1][0] = [torch.stack([torch.zeros([5]).float().cuda() if (x == torch.tensor([0.5000, 0.5000, 0.5000, 0.5000, 1.0000]).cuda()).sum() == 5 else x for x in y]).float().cuda() for y in batch[1][0]]
    # batch[1][0] = torch.stack(batch[1][0])
    return batch

data.add_tfm(tfmmm)

I wonder what is expected here. I’m using bb_pad_collate, and the yb in my batch looks something like:

[[tensor([[0.2063, 0.4563, 0.2094, 0.4594, 1.0000],
           [0.2141, 0.4563, 0.2172, 0.4609, 1.0000],
           [0.1984, 0.4563, 0.2016, 0.4594, 1.0000],
           [0.8188, 0.4266, 0.8313, 0.4391, 1.0000]]),
   tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
           [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
           [0.4844, 0.6062, 0.5359, 0.6609, 1.0000],
           [0.6047, 0.7016, 0.6281, 0.7266, 1.0000]])],
  tensor([[1, 1, 1, 1],
          [0, 0, 1, 1]])]

and yb[0] corresponds to a list of tensors, I guess. I wonder what’s happening here and how to solve it.
I think it’s expecting batch[1][0] = torch.stack(batch[1][0]) (see the sketch below), so I’ve added that and started the experiment.
By the way, is there a way to skip the training process and directly debug the validation? Each of my training epochs takes 6 hours, so it’s really time-consuming to discover only after validation that the code fails. Thanks in advance.
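To illustrate why I think stacking helps (a small sketch with made-up shapes matching my batch): validate() counts samples via yb[0].shape[0], which exists on a tensor but not on a list:

import torch

boxes = [torch.rand(4, 5), torch.rand(4, 5)]      # per-image box tensors, as in my batch
yb_bad = [boxes, torch.ones(2, 4)]                # yb[0] is a list -> no .shape -> AttributeError
yb_ok = [torch.stack(boxes), torch.ones(2, 4)]    # yb[0] is a (2, 4, 5) tensor
print(yb_ok[0].shape[0])                          # 2, the batch size validate() needs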

It seems that batch[1][0] = torch.stack(batch[1][0]) is indeed what was needed; I see that it has made it to the second epoch :smile:
Thank you @sgugger !
I wonder if it’s possible to skip the training process and directly debug the validation in fastai?

You can call learn.validate() before fitting if you want to make sure it works (results won’t be good of course before training).
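For example (a minimal sketch with the learner above):

learn1.validate()                       # runs only the validation loop; returns [val_loss] plus any metrics
# or on an explicit DataLoader:
learn1.validate(learn1.data.valid_dl)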
