Hi guys!
I am training a metric learning model with MSLoss. Training runs fine through the 4th epoch, but every time it reaches the 5th epoch, the following error is raised.
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-24-6372abdb1c94> in <module>
1 # Run for 5 epochs with model frozen except last layer
2 learn.freeze_to(-1);
----> 3 learn.fit_one_cycle(5, 1e-2)
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
110 scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
111 'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 112 self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
113
114 # Cell
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
209 self.opt.set_hypers(lr=self.lr if lr is None else lr)
210 self.n_epoch = n_epoch
--> 211 self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
212
213 def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
158
159 def _with_events(self, f, event_type, ex, final=noop):
--> 160 try: self(f'before_{event_type}'); f()
161 except ex: self(f'after_cancel_{event_type}')
162 self(f'after_{event_type}'); final()
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in _do_fit(self)
200 for epoch in range(self.n_epoch):
201 self.epoch=epoch
--> 202 self._with_events(self._do_epoch, 'epoch', CancelEpochException)
203
204 def fit(self, n_epoch, lr=None, wd=None, cbs=None, reset_opt=False):
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
158
159 def _with_events(self, f, event_type, ex, final=noop):
--> 160 try: self(f'before_{event_type}'); f()
161 except ex: self(f'after_cancel_{event_type}')
162 self(f'after_{event_type}'); final()
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in _do_epoch(self)
194
195 def _do_epoch(self):
--> 196 self._do_epoch_train()
197 self._do_epoch_validate()
198
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in _do_epoch_train(self)
186 def _do_epoch_train(self):
187 self.dl = self.dls.train
--> 188 self._with_events(self.all_batches, 'train', CancelTrainException)
189
190 def _do_epoch_validate(self, ds_idx=1, dl=None):
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
158
159 def _with_events(self, f, event_type, ex, final=noop):
--> 160 try: self(f'before_{event_type}'); f()
161 except ex: self(f'after_cancel_{event_type}')
162 self(f'after_{event_type}'); final()
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in all_batches(self)
164 def all_batches(self):
165 self.n_iter = len(self.dl)
--> 166 for o in enumerate(self.dl): self.one_batch(*o)
167
168 def _do_one_batch(self):
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in one_batch(self, i, b)
182 self.iter = i
183 self._split(b)
--> 184 self._with_events(self._do_one_batch, 'batch', CancelBatchException)
185
186 def _do_epoch_train(self):
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
158
159 def _with_events(self, f, event_type, ex, final=noop):
--> 160 try: self(f'before_{event_type}'); f()
161 except ex: self(f'after_cancel_{event_type}')
162 self(f'after_{event_type}'); final()
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in __call__(self, event_name)
139
140 def ordered_cbs(self, event): return [cb for cb in self.cbs.sorted('order') if hasattr(cb, event)]
--> 141 def __call__(self, event_name): L(event_name).map(self._call_one)
142
143 def _call_one(self, event_name):
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastcore/foundation.py in map(self, f, gen, *args, **kwargs)
152 def range(cls, a, b=None, step=None): return cls(range_of(a, b=b, step=step))
153
--> 154 def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))
155 def argwhere(self, f, negate=False, **kwargs): return self._new(argwhere(self, f, negate, **kwargs))
156 def filter(self, f=noop, negate=False, gen=False, **kwargs):
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastcore/basics.py in map_ex(iterable, f, gen, *args, **kwargs)
664 res = map(g, iterable)
665 if gen: return res
--> 666 return list(res)
667
668 # Cell
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastcore/basics.py in __call__(self, *args, **kwargs)
649 if isinstance(v,_Arg): kwargs[k] = args.pop(v.i)
650 fargs = [args[x.i] if isinstance(x, _Arg) else x for x in self.pargs] + args[self.maxi+1:]
--> 651 return self.func(*fargs, **kwargs)
652
653 # Cell
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/learner.py in _call_one(self, event_name)
143 def _call_one(self, event_name):
144 if not hasattr(event, event_name): raise Exception(f'missing {event_name}')
--> 145 for cb in self.cbs.sorted('order'): cb(event_name)
146
147 def _bn_bias_state(self, with_bias): return norm_bias_params(self.model, with_bias).map(self.opt.state)
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/callback/core.py in __call__(self, event_name)
42 (self.run_valid and not getattr(self, 'training', False)))
43 res = None
---> 44 if self.run and _run: res = getattr(self, event_name, noop)()
45 if event_name=='after_fit': self.run=True #Reset self.run to True at each end of fit
46 return res
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/callback/schedule.py in before_batch(self)
84 def __init__(self, scheds): self.scheds = scheds
85 def before_fit(self): self.hps = {p:[] for p in self.scheds.keys()}
---> 86 def before_batch(self): self._update_val(self.pct_train)
87
88 def _update_val(self, pct):
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/callback/schedule.py in _update_val(self, pct)
87
88 def _update_val(self, pct):
---> 89 for n,f in self.scheds.items(): self.opt.set_hyper(n, f(pct))
90
91 def after_batch(self):
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/fastai/callback/schedule.py in _inner(pos)
67 if int(pos) == 1: return scheds[-1](1.)
68 idx = (pos >= pcts).nonzero().max()
---> 69 actual_pos = (pos-pcts[idx]) / (pcts[idx+1]-pcts[idx])
70 return scheds[idx](actual_pos.item())
71 return _inner
IndexError: index 3 is out of bounds for dimension 0 with size 3
@jeremy, you had mentioned in this post that this error has been resolved.
I am using the latest version, i.e. 2.2.7, in a SageMaker notebook instance environment, yet the issue still occurs.
What I am not able to figure out is how it runs perfectly fine for 4 epochs and then fails in the 5th epoch. That suggests there shouldn’t be any problem with the data or sampling, right?
Could anyone please help on how to resolve this issue?
Thanks & Regards,
Vinayak.