IndexError: index 3 is out of bounds for dimension 0 with size 3 Recurring Error

Hi guys!

I am training a metric learning model with MSLoss. Training runs fine through the 4th epoch, but every time, in the 5th epoch, the following error is raised.

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-24-6372abdb1c94> in <module>
      1 # Run for 5 epochs with model frozen except last layer
      2 learn.freeze_to(-1);
----> 3 learn.fit_one_cycle(5, 1e-2)

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
    110     scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
    111               'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 112     self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
    113 
    114 # Cell

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
    209             self.opt.set_hypers(lr=self.lr if lr is None else lr)
    210             self.n_epoch = n_epoch
--> 211             self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
    212 
    213     def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    158 
    159     def _with_events(self, f, event_type, ex, final=noop):
--> 160         try: self(f'before_{event_type}');  f()
    161         except ex: self(f'after_cancel_{event_type}')
    162         self(f'after_{event_type}');  final()

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in _do_fit(self)
    200         for epoch in range(self.n_epoch):
    201             self.epoch=epoch
--> 202             self._with_events(self._do_epoch, 'epoch', CancelEpochException)
    203 
    204     def fit(self, n_epoch, lr=None, wd=None, cbs=None, reset_opt=False):

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    158 
    159     def _with_events(self, f, event_type, ex, final=noop):
--> 160         try: self(f'before_{event_type}');  f()
    161         except ex: self(f'after_cancel_{event_type}')
    162         self(f'after_{event_type}');  final()

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in _do_epoch(self)
    194 
    195     def _do_epoch(self):
--> 196         self._do_epoch_train()
    197         self._do_epoch_validate()
    198 

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in _do_epoch_train(self)
    186     def _do_epoch_train(self):
    187         self.dl = self.dls.train
--> 188         self._with_events(self.all_batches, 'train', CancelTrainException)
    189 
    190     def _do_epoch_validate(self, ds_idx=1, dl=None):

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    158 
    159     def _with_events(self, f, event_type, ex, final=noop):
--> 160         try: self(f'before_{event_type}');  f()
    161         except ex: self(f'after_cancel_{event_type}')
    162         self(f'after_{event_type}');  final()

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in all_batches(self)
    164     def all_batches(self):
    165         self.n_iter = len(self.dl)
--> 166         for o in enumerate(self.dl): self.one_batch(*o)
    167 
    168     def _do_one_batch(self):

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in one_batch(self, i, b)
    182         self.iter = i
    183         self._split(b)
--> 184         self._with_events(self._do_one_batch, 'batch', CancelBatchException)
    185 
    186     def _do_epoch_train(self):

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    158 
    159     def _with_events(self, f, event_type, ex, final=noop):
--> 160         try: self(f'before_{event_type}');  f()
    161         except ex: self(f'after_cancel_{event_type}')
    162         self(f'after_{event_type}');  final()

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in __call__(self, event_name)
    139 
    140     def ordered_cbs(self, event): return [cb for cb in self.cbs.sorted('order') if hasattr(cb, event)]
--> 141     def __call__(self, event_name): L(event_name).map(self._call_one)
    142 
    143     def _call_one(self, event_name):

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastcore/foundation.py in map(self, f, gen, *args, **kwargs)
    152     def range(cls, a, b=None, step=None): return cls(range_of(a, b=b, step=step))
    153 
--> 154     def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))
    155     def argwhere(self, f, negate=False, **kwargs): return self._new(argwhere(self, f, negate, **kwargs))
    156     def filter(self, f=noop, negate=False, gen=False, **kwargs):

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastcore/basics.py in map_ex(iterable, f, gen, *args, **kwargs)
    664     res = map(g, iterable)
    665     if gen: return res
--> 666     return list(res)
    667 
    668 # Cell

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastcore/basics.py in __call__(self, *args, **kwargs)
    649             if isinstance(v,_Arg): kwargs[k] = args.pop(v.i)
    650         fargs = [args[x.i] if isinstance(x, _Arg) else x for x in self.pargs] + args[self.maxi+1:]
--> 651         return self.func(*fargs, **kwargs)
    652 
    653 # Cell

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in _call_one(self, event_name)
    143     def _call_one(self, event_name):
    144         if not hasattr(event, event_name): raise Exception(f'missing {event_name}')
--> 145         for cb in self.cbs.sorted('order'): cb(event_name)
    146 
    147     def _bn_bias_state(self, with_bias): return norm_bias_params(self.model, with_bias).map(self.opt.state)

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/callback/core.py in __call__(self, event_name)
     42                (self.run_valid and not getattr(self, 'training', False)))
     43         res = None
---> 44         if self.run and _run: res = getattr(self, event_name, noop)()
     45         if event_name=='after_fit': self.run=True #Reset self.run to True at each end of fit
     46         return res

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/callback/schedule.py in before_batch(self)
     84     def __init__(self, scheds): self.scheds = scheds
     85     def before_fit(self): self.hps = {p:[] for p in self.scheds.keys()}
---> 86     def before_batch(self): self._update_val(self.pct_train)
     87 
     88     def _update_val(self, pct):

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/callback/schedule.py in _update_val(self, pct)
     87 
     88     def _update_val(self, pct):
---> 89         for n,f in self.scheds.items(): self.opt.set_hyper(n, f(pct))
     90 
     91     def after_batch(self):

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/callback/schedule.py in _inner(pos)
     67         if int(pos) == 1: return scheds[-1](1.)
     68         idx = (pos >= pcts).nonzero().max()
---> 69         actual_pos = (pos-pcts[idx]) / (pcts[idx+1]-pcts[idx])
     70         return scheds[idx](actual_pos.item())
     71     return _inner

IndexError: index 3 is out of bounds for dimension 0 with size 3

@jeremy, you had mentioned in this post that this error has been resolved.

I am using the latest version, i.e. 2.2.7, in a SageMaker notebook instance environment, yet this issue still occurs.
image

What I am not able to figure out is how it runs perfectly fine for 4 epochs and only fails in the 5th epoch. That means there shouldn’t be any problem in the data or sampling, right?

Could anyone please help on how to resolve this issue?

Thanks & Regards,
Vinayak.

Hello guys!

This problem is happening in the combine_scheds routine in the schedule.py file. Here’s the definition of that function.

def combine_scheds(pcts, scheds):
    """Combine `scheds` according to `pcts` in one function.

    `pcts` are the fractions of training covered by each schedule; they must
    be non-negative and sum to 1. The returned function takes a position
    `pos`, picks the schedule whose segment contains it, and calls that
    schedule with the position rescaled relative to that segment.
    """
    assert sum(pcts) == 1.
    pcts = tensor([0] + L(pcts))
    assert torch.all(pcts >= 0)
    # Cumulative segment boundaries, starting at 0 (e.g. [0, 0.25, 1]).
    pcts = torch.cumsum(pcts, 0)
    # Highest index that still has a right-hand boundary at `idx + 1`.
    last_idx = len(pcts) - 2
    def _inner(pos):
        if int(pos) == 1: return scheds[-1](1.)
        # A `pos` marginally below 1 (e.g. 0.9999999999999982) is not caught
        # by the check above but can still compare as >= the final boundary.
        # Clamp so that `pcts[idx+1]` never indexes past the end.
        idx = min(int((pos >= pcts).nonzero().max()), last_idx)
        actual_pos = (pos-pcts[idx]) / (pcts[idx+1]-pcts[idx])
        return scheds[idx](actual_pos.item())
    return _inner

Although I am not aware of how this function is used during training, I tried printing the variables pcts, pos, and idx, which are local to this function. Here are the last few lines of that output, ending at the point where the error occurs:

tensor([0.0000, 0.2500, 1.0000]) 0.8611111111111098 tensor(1)
tensor([0.0000, 0.2500, 1.0000]) 0.8888888888888875 tensor(1)
tensor([0.0000, 0.2500, 1.0000]) 0.8888888888888875 tensor(1)
tensor([0.0000, 0.2500, 1.0000]) 0.9166666666666652 tensor(1)
tensor([0.0000, 0.2500, 1.0000]) 0.9166666666666652 tensor(1)
tensor([0.0000, 0.2500, 1.0000]) 0.9444444444444429 tensor(1)
tensor([0.0000, 0.2500, 1.0000]) 0.9444444444444429 tensor(1)
tensor([0.0000, 0.2500, 1.0000]) 0.9722222222222205 tensor(1)
tensor([0.0000, 0.2500, 1.0000]) 0.9722222222222205 tensor(1)
tensor([0.0000, 0.2500, 1.0000]) 0.9999999999999982 tensor(2)

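If you look at the final entry here, we can see that idx = tensor(2). Since pcts has only 3 entries (valid indices 0–2), the next line evaluates pcts[idx+1] = pcts[3], which is out of bounds. This happens because pos = 0.9999999999999982 is not caught by the `int(pos) == 1` check, yet it compares as >= pcts[2] = 1.0, presumably because of floating-point rounding.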

@jeremy, @sgugger, @muellerzr could you please help me with this issue? I am using a batch size of 21, 33, or another multiple of 3, because I am training for metric learning with multi-similarity loss with 3 classes and k images per class (7, 11, etc.). Is it required that the batch size be a power of 2?

Thanks & Regards,
Vinayak.