ImageDataBunch.from_df giving KeyError

I suspect that there are not enough samples per label to split effectively between the training and validation sets, but I haven't yet been able to locate the source for the `random_split_by_pct` function. Could anyone confirm whether it requires a certain sample size per label?

Full error stack:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~\AppData\Local\conda\conda\envs\machine-learning\lib\site-packages\fastai\data_block.py in process_one(self, item)
    277     def process_one(self,item):
--> 278         try: return self.c2i[item] if item is not None else None
    279         except:

KeyError: 'w_ac2df91'

During handling of the above exception, another exception occurred:

Exception                                 Traceback (most recent call last)
<ipython-input-10-18e461a1ec4f> in <module>
      1 path = './data'
----> 2 data = ImageDataBunch.from_df(path, df, ds_tfms=get_transforms(), size=224, num_workers=0).normalize(imagenet_stats)
      3 data.show_batch(rows=4)

~\AppData\Local\conda\conda\envs\machine-learning\lib\site-packages\fastai\vision\data.py in from_df(cls, path, df, folder, sep, valid_pct, fn_col, label_col, suffix, **kwargs)
    123         src = (ImageItemList.from_df(df, path=path, folder=folder, suffix=suffix, cols=fn_col)
    124                 .random_split_by_pct(valid_pct)
--> 125                 .label_from_df(sep=sep, cols=label_col))
    126         return cls.create_from_ll(src, **kwargs)
    127 

~\AppData\Local\conda\conda\envs\machine-learning\lib\site-packages\fastai\data_block.py in _inner(*args, **kwargs)
    391             self.valid = fv(*args, **kwargs)
    392             self.__class__ = LabelLists
--> 393             self.process()
    394             return self
    395         return _inner

~\AppData\Local\conda\conda\envs\machine-learning\lib\site-packages\fastai\data_block.py in process(self)
    438         "Process the inner datasets."
    439         xp,yp = self.get_processors()
--> 440         for i,ds in enumerate(self.lists): ds.process(xp, yp, filter_missing_y=i==0)
    441         return self
    442 

~\AppData\Local\conda\conda\envs\machine-learning\lib\site-packages\fastai\data_block.py in process(self, xp, yp, filter_missing_y)
    563     def process(self, xp=None, yp=None, filter_missing_y:bool=False):
    564         "Launch the processing on `self.x` and `self.y` with `xp` and `yp`."
--> 565         self.y.process(yp)
    566         if filter_missing_y and (getattr(self.x, 'filter_missing_y', None)):
    567             filt = array([o is None for o in self.y])

~\AppData\Local\conda\conda\envs\machine-learning\lib\site-packages\fastai\data_block.py in process(self, processor)
     66         if processor is not None: self.processor = processor
     67         self.processor = listify(self.processor)
---> 68         for p in self.processor: p.process(self)
     69         return self
     70 

~\AppData\Local\conda\conda\envs\machine-learning\lib\site-packages\fastai\data_block.py in process(self, ds)
    284         ds.classes = self.classes
    285         ds.c2i = self.c2i
--> 286         super().process(ds)
    287 
    288     def __getstate__(self): return {'classes':self.classes}

~\AppData\Local\conda\conda\envs\machine-learning\lib\site-packages\fastai\data_block.py in process(self, ds)
     36     def __init__(self, ds:Collection=None):  self.ref_ds = ds
     37     def process_one(self, item:Any):         return item
---> 38     def process(self, ds:Collection):        ds.items = array([self.process_one(item) for item in ds.items])
     39 
     40 class ItemList():

~\AppData\Local\conda\conda\envs\machine-learning\lib\site-packages\fastai\data_block.py in <listcomp>(.0)
     36     def __init__(self, ds:Collection=None):  self.ref_ds = ds
     37     def process_one(self, item:Any):         return item
---> 38     def process(self, ds:Collection):        ds.items = array([self.process_one(item) for item in ds.items])
     39 
     40 class ItemList():

~\AppData\Local\conda\conda\envs\machine-learning\lib\site-packages\fastai\data_block.py in process_one(self, item)
    278         try: return self.c2i[item] if item is not None else None
    279         except:
--> 280             raise Exception("Your validation data contains a label that isn't present in the training set, please fix your data.")
    281 
    282     def process(self, ds):

Exception: Your validation data contains a label that isn't present in the training set, please fix your data.

Update for anyone interested, here is the function:

def random_split_by_pct(self, valid_pct:float=0.2, seed:int=None)->'ItemLists':
    """Randomly hold out a fraction `valid_pct` of the items as the validation set.

    The indices of the whole item list are shuffled and the first
    `int(valid_pct * len(self))` of them are handed to `self.split_by_idx`.
    Labels are not consulted here, so the split is not stratified per label.

    If `valid_pct` is exactly 0, `self.no_split()` is returned instead.
    When `seed` is given, NumPy's global random state is seeded with it
    before shuffling.
    """
    # Nothing to hold out: take the no-split path.
    if valid_pct == 0.:
        return self.no_split()
    # Seeding is optional; when used it touches NumPy's global RNG.
    if seed is not None:
        np.random.seed(seed)
    shuffled = np.random.permutation(range_of(self))
    # Truncates toward zero, so very small lists can give a count of 0.
    n_valid = int(valid_pct * len(self))
    return self.split_by_idx(shuffled[:n_valid])

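So it appears that this can indeed happen for any label with only a few samples. The function shuffles the indices of the whole item list and sends the first `cut` of them to the validation set without looking at labels, so a rare label can end up only in the validation set and never in the training set. (Separately, with the default `valid_pct=0.2`, `cut` will be 0, giving an empty validation set, unless `len(self)` is at least 5.) I guess my next step is to somehow supplement this dataset.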