I’m going through the NLP documentation at http://docs.fast.ai/text.html, and I’m hoping to fine-tune pre-trained embeddings on a small custom dataset I have. I’ve replaced the IMDB data with my own dataset to generate `data_lm` and `data_clas`. My training data has about 60 examples, each with 100-200 words, and a binary response variable that is imbalanced.
I’ve gotten the first several code blocks to run, but when I run `learn.fit_one_cycle(1, 1e-2)` I get a `ZeroDivisionError`. I see that one other post here encountered a similar issue, but in that case, adjusting the number of epochs caused the error to go away. I’ve tried epoch values from 1 to 100, and I’ve played around with the second parameter too, to no avail.
Any advice on how to continue troubleshooting from here?
Here’s the traceback:
---------------------------------------------------------------------------
ZeroDivisionError Traceback (most recent call last)
<ipython-input-114-a07394634999> in <module>()
1 learn = RNNLearner.language_model(data_lm, pretrained_fnames=['lstm_wt103', 'itos_wt103'], drop_mult=0.5)
----> 2 learn.fit_one_cycle(10, 1e-2)
~/anaconda3/lib/python3.6/site-packages/fastai/train.py in fit_one_cycle(learn, cyc_len, max_lr, moms, div_factor, pct_start, wd, **kwargs)
16 cbs = [OneCycleScheduler(learn, max_lr, moms=moms, div_factor=div_factor,
17 pct_start=pct_start, **kwargs)]
---> 18 learn.fit(cyc_len, max_lr, wd=wd, callbacks=cbs)
19
20 def lr_find(learn:Learner, start_lr:float=1e-5, end_lr:float=10, num_it:int=100, **kwargs:Any):
~/anaconda3/lib/python3.6/site-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
131 callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
132 fit(epochs, self.model, self.loss_fn, opt=self.opt, data=self.data, metrics=self.metrics,
--> 133 callbacks=self.callbacks+callbacks)
134
135 def create_opt(self, lr:Floats, wd:Floats=0.)->None:
~/anaconda3/lib/python3.6/site-packages/fastai/basic_train.py in fit(epochs, model, loss_fn, opt, data, callbacks, metrics)
84 except Exception as e:
85 exception = e
---> 86 raise e
87 finally: cb_handler.on_train_end(exception)
88
~/anaconda3/lib/python3.6/site-packages/fastai/basic_train.py in fit(epochs, model, loss_fn, opt, data, callbacks, metrics)
75 if hasattr(data,'valid_dl') and data.valid_dl is not None:
76 *val_metrics,nums = validate(model, data.valid_dl, loss_fn=loss_fn,
---> 77 cb_handler=cb_handler, metrics=metrics,pbar=pbar)
78 nums = np.array(nums, dtype=np.float32)
79 val_metrics = [(to_np(torch.stack(val)) * nums).sum() / nums.sum()
~/anaconda3/lib/python3.6/site-packages/fastai/basic_train.py in validate(model, dl, loss_fn, metrics, cb_handler, pbar)
44 with torch.no_grad():
45 return zip(*[loss_batch(model, xb, yb, loss_fn, cb_handler=cb_handler, metrics=metrics)
---> 46 for xb,yb in progress_bar(dl, parent=pbar, leave=(pbar is not None))])
47
48 def train_epoch(model:Model, dl:DataLoader, opt:optim.Optimizer, loss_func:LossFunction)->None:
~/anaconda3/lib/python3.6/site-packages/fastprogress/fastprogress.py in __init__(self, gen, total, display, leave, parent, auto_update)
108 self.progress,self.text = IntProgress(min=0, max=len(gen) if total is None else total), HTML()
109 self.box = HBox([self.progress, self.text])
--> 110 super().__init__(gen, total, display, leave, parent, auto_update)
111
112 def on_iter_begin(self):
~/anaconda3/lib/python3.6/site-packages/fastprogress/fastprogress.py in __init__(self, gen, total, display, leave, parent, auto_update)
48 self.comment = ''
49 self.on_iter_begin()
---> 50 self.update(0)
51
52 def on_iter_begin(self): pass
~/anaconda3/lib/python3.6/site-packages/fastprogress/fastprogress.py in update(self, val)
72 self.pred_t = 0
73 self.last_v,self.wait_for = 0,1
---> 74 self.update_bar(0)
75 elif val >= self.last_v + self.wait_for or val == self.total:
76 cur_t = time()
~/anaconda3/lib/python3.6/site-packages/fastprogress/fastprogress.py in update_bar(self, val)
86 elapsed_t = format_time(elapsed_t)
87 end = '' if len(self.comment) == 0 else f' {self.comment}'
---> 88 self.on_update(val, f'{100 * val/self.total:.2f}% [{val}/{self.total} {elapsed_t}<{remaining_t}{end}]')
89
90
ZeroDivisionError: division by zero