Dataset created with split_by_fname_file is failing during validation

I am in the process of building a dataset for an object identification project, and I'm training a model incrementally as we accumulate data. I've actually used the initial models to speed up data labeling by creating generic classes that increase the efficiency of our SME labelers.

In the past I've been using split_by_rand_pct(valid_pct=0.2) for my databunch. I now want to apply some SME expertise in selecting the split, mostly because I have some classes with a fairly small number of samples and I want to make sure they get evenly distributed between training and validation.

I created a CSV file listing the desired validation filenames and updated my code to use split_by_fname_file. The databunch was created properly, and the output shows a Train and a Valid LabelList, each with the correct number of samples given the full dataset size and the validation file.
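For reference, the validation list is just one image filename per row with no header, written out roughly like this (a minimal sketch; the filenames below are placeholders, and the no-header layout is what I understood split_by_fname_file to expect):

import pandas as pd

# One filename per row, no header; the names match the fname column used by ObjectItemList.from_df.
valid_fnames = ['frame_000123.jpg', 'frame_000456.jpg']  # placeholder names chosen by our SMEs
pd.DataFrame(valid_fnames).to_csv(datapath.as_posix() + '/validation_files.csv', index=False, header=False)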

When I tried to train the model, though, fit_one_cycle failed during validation. The full error output is below along with my code. While investigating the dataset I found one oddity.
data.show_batch(rows=rows, ds_type='Train') and data.show_batch(rows=rows, ds_type='Valid') both return the same images, with the same labels. The validation file list has no duplicates in it, so I'm not sure how that can happen.
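To rule out a real overlap between the two splits, one check I'm planning is to compare the underlying item lists directly (a rough sketch):

# The raw filename lists behind each split should be disjoint.
train_files = set(str(o) for o in data.train_ds.x.items)
valid_files = set(str(o) for o in data.valid_ds.x.items)
print(len(train_files), len(valid_files), len(train_files & valid_files))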

Any suggestions on how to further debug this?
code:

train_images, train_lbl_bbox = get_annotations(datapath.as_posix()+'/annotations/6fpspore_master.json')
img2bbox = dict(zip(train_images, train_lbl_bbox))
df = pd.DataFrame(data=train_images)
bs = 16
size=(180,256)
#data = get_data(16,(180,256))
src = ObjectItemList.from_df(df, datapath.as_posix(), folder='images')
src = src.split_by_fname_file(datapath.as_posix()+'/validation_files.csv')
#src = src.split_by_rand_pct(valid_pct=0.15)
src = src.label_from_func(get_y_func)
src = src.transform(get_transforms(), size=size, resize_method=ResizeMethod.SQUISH, tfm_y=True)
data=src.databunch(path=outputpath,bs=bs,collate_fn=bb_pad_collate)
data.normalize(imagenet_stats)

returns:
ImageDataBunch;

Train: LabelList (2603 items)
x: ObjectItemList
Image (3, 180, 256),Image (3, 180, 256),Image (3, 180, 256),Image (3, 180, 256),Image (3, 180, 256)
y: ObjectCategoryList
ImageBBox (180, 256),ImageBBox (180, 256),ImageBBox (180, 256),ImageBBox (180, 256),ImageBBox (180, 256)
Path: /home/loki/workspace/TrainingData/spore;

Valid: LabelList (1227 items)
x: ObjectItemList
Image (3, 180, 256),Image (3, 180, 256),Image (3, 180, 256),Image (3, 180, 256),Image (3, 180, 256)
y: ObjectCategoryList
ImageBBox (180, 256),ImageBBox (180, 256),ImageBBox (180, 256),ImageBBox (180, 256),ImageBBox (180, 256)
Path: /home/loki/workspace/TrainingData/spore;

Test: None

code:
learn.fit_one_cycle(10, slice(1e-4,8e-4))

returns:
TypeError Traceback (most recent call last)
<ipython-input> in <module>
----> 1 learn.fit_one_cycle(10, slice(1e-4,8e-4))

~/Downloads/home/loki/workspace/anaconda3/envs/fastai/lib/python3.7/site-packages/fastai/train.py in fit_one_cycle(learn, cyc_len, max_lr, moms, div_factor, pct_start, final_div, wd, callbacks, tot_epochs, start_epoch)
20 callbacks.append(OneCycleScheduler(learn, max_lr, moms=moms, div_factor=div_factor, pct_start=pct_start,
21 final_div=final_div, tot_epochs=tot_epochs, start_epoch=start_epoch))
---> 22 learn.fit(cyc_len, max_lr, wd=wd, callbacks=callbacks)
23
24 def lr_find(learn:Learner, start_lr:Floats=1e-7, end_lr:Floats=10, num_it:int=100, stop_div:bool=True, wd:float=None):

~/Downloads/home/loki/workspace/anaconda3/envs/fastai/lib/python3.7/site-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
200 callbacks = [cb(self) for cb in self.callback_fns + listify(defaults.extra_callback_fns)] + listify(callbacks)
201 self.cb_fns_registered = True
---> 202 fit(epochs, self, metrics=self.metrics, callbacks=self.callbacks+callbacks)
203
204 def create_opt(self, lr:Floats, wd:Floats=0.)->None:

~/Downloads/home/loki/workspace/anaconda3/envs/fastai/lib/python3.7/site-packages/fastai/basic_train.py in fit(epochs, learn, callbacks, metrics)
104 if not cb_handler.skip_validate and not learn.data.empty_val:
105 val_loss = validate(learn.model, learn.data.valid_dl, loss_func=learn.loss_func,
---> 106 cb_handler=cb_handler, pbar=pbar)
107 else: val_loss=None
108 if cb_handler.on_epoch_end(val_loss): break

~/Downloads/home/loki/workspace/anaconda3/envs/fastai/lib/python3.7/site-packages/fastai/basic_train.py in validate(model, dl, loss_func, cb_handler, pbar, average, n_batch)
55 val_losses,nums = [],[]
56 if cb_handler: cb_handler.set_dl(dl)
---> 57 for xb,yb in progress_bar(dl, parent=pbar, leave=(pbar is not None)):
58 if cb_handler: xb, yb = cb_handler.on_batch_begin(xb, yb, train=False)
59 val_loss = loss_batch(model, xb, yb, loss_func, cb_handler=cb_handler)

~/Downloads/home/loki/workspace/anaconda3/envs/fastai/lib/python3.7/site-packages/fastprogress/fastprogress.py in __iter__(self)
70 self.update(0)
71 try:
---> 72 for i,o in enumerate(self._gen):
73 if i >= self.total: break
74 yield o

~/Downloads/home/loki/workspace/anaconda3/envs/fastai/lib/python3.7/site-packages/fastai/basic_data.py in __iter__(self)
73 def __iter__(self):
74 "Process and returns items from DataLoader."
---> 75 for b in self.dl: yield self.proc_batch(b)
76
77 @classmethod

~/Downloads/home/loki/workspace/anaconda3/envs/fastai/lib/python3.7/site-packages/torch/utils/data/dataloader.py in __next__(self)
799 if len(self._task_info[self._rcvd_idx]) == 2:
800 data = self._task_info.pop(self._rcvd_idx)[1]
---> 801 return self._process_data(data)
802
803 assert not self._shutdown and self._tasks_outstanding > 0

~/Downloads/home/loki/workspace/anaconda3/envs/fastai/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _process_data(self, data)
844 self._try_put_index()
845 if isinstance(data, ExceptionWrapper):
---> 846 data.reraise()
847 return data
848

~/Downloads/home/loki/workspace/anaconda3/envs/fastai/lib/python3.7/site-packages/torch/_utils.py in reraise(self)
383 # (https://bugs.python.org/issue2651), so we work around it.
384 msg = KeyErrorMessage(msg)
---> 385 raise self.exc_type(msg)

TypeError: Caught TypeError in DataLoader worker process 2.
Original Traceback (most recent call last):
File "/home/loki/Downloads/home/loki/workspace/anaconda3/envs/fastai/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
data = fetcher.fetch(index)
File "/home/loki/Downloads/home/loki/workspace/anaconda3/envs/fastai/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/home/loki/Downloads/home/loki/workspace/anaconda3/envs/fastai/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/home/loki/Downloads/home/loki/workspace/anaconda3/envs/fastai/lib/python3.7/site-packages/fastai/data_block.py", line 648, in __getitem__
if self.item is None: x,y = self.x[idxs],self.y[idxs]
File "/home/loki/Downloads/home/loki/workspace/anaconda3/envs/fastai/lib/python3.7/site-packages/fastai/data_block.py", line 118, in __getitem__
if isinstance(idxs, Integral): return self.get(idxs)
File "/home/loki/Downloads/home/loki/workspace/anaconda3/envs/fastai/lib/python3.7/site-packages/fastai/vision/data.py", line 355, in get
return ImageBBox.create(*_get_size(self.x,i), *self.items[i], classes=self.classes, pad_idx=self.pad_idx)
File "/home/loki/Downloads/home/loki/workspace/anaconda3/envs/fastai/lib/python3.7/site-packages/fastai/vision/image.py", line 358, in create
return cls(flow, labels=labels, classes=classes, pad_idx=pad_idx, y_first=True, scale=scale)
File "/home/loki/Downloads/home/loki/workspace/anaconda3/envs/fastai/lib/python3.7/site-packages/fastai/vision/image.py", line 340, in __init__
labels = array([Category(l,classes[l]) for l in labels])
File "/home/loki/Downloads/home/loki/workspace/anaconda3/envs/fastai/lib/python3.7/site-packages/fastai/vision/image.py", line 340, in <listcomp>
labels = array([Category(l,classes[l]) for l in labels])
TypeError: list indices must be integers or slices, not NoneType
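One thing I plan to try next is indexing the validation items one at a time, outside the DataLoader workers, so the failing item and its filename are reported directly (a quick sketch):

# Walk the validation LabelList item by item to find the index that blows up.
for i in range(len(data.valid_ds)):
    try:
        _ = data.valid_ds[i]
    except TypeError as e:
        print(i, data.valid_ds.x.items[i], e)
        break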

Quick update on the above. I found a typo in my data.show_batch code: I was passing an incorrect value for ds_type. If I run data.show_batch(rows=2, ds_type=DatasetType.Train) and data.show_batch(rows=2, ds_type=DatasetType.Valid) I get different results, as expected. Both batches show images with correct labels.

This only makes me more confused as to why the validation set is not working correctly during training.
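Since the error comes from classes[l] receiving None, my next step is to check whether any label appears only in the validation filenames, in case the class list is built from the training split alone. A minimal sketch, assuming validation_files.csv has one filename per row and no header:

import pandas as pd

# img2bbox maps filename -> [bboxes, labels], as built from get_annotations above.
valid_fnames = set(pd.read_csv(datapath.as_posix() + '/validation_files.csv', header=None)[0])
train_labels, valid_labels = set(), set()
for fname, (bboxes, labels) in img2bbox.items():
    (valid_labels if fname in valid_fnames else train_labels).update(labels)
print('labels only in validation:', valid_labels - train_labels)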