Issue with WeightedDL

Hello,

I am trying to use a weighted dataloader for an imbalanced dataset with 8 classes.

My datablock (which works fine with previous dataloader/models):

test_size=0.3
splitter = TrainTestSplitter(test_size=test_size, random_state=42, stratify=train_set.iloc[:,1])

snd_blk = DataBlock(blocks=(ImageBlock, CategoryBlock),
                    splitter=splitter,
                    get_x=ColReader(0, pref=Exp_),
                    get_y=ColReader(1),
                    item_tfms=Resize(854),
                    batch_tfms=aug_transforms())

and my Dataloader:

dls = snd_blk.dataloaders(train_set, num_workers=2, dl_type=WeightedDL, wgts=wgts, bs=16)

My train_set (which also worked fine previously) is a dataframe of length 5700, and my wgts list has the same length.

When I try:
dls.show_batch()

or
lr_min,lr_steep=learn.lr_find()

I get the following error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-113-d18828793d27> in <module>
----> 1 lr_min,lr_steep=learn.lr_find()
      2 print(f"Minimum/10: {lr_min:.2e}, steepest point: {lr_steep:.2e}")

~/miniconda3/envs/fastai_V2/lib/python3.8/site-packages/fastai/callback/schedule.py in lr_find(self, start_lr, end_lr, num_it, stop_div, show_plot, suggestions)
    220     n_epoch = num_it//len(self.dls.train) + 1
    221     cb=LRFinder(start_lr=start_lr, end_lr=end_lr, num_it=num_it, stop_div=stop_div)
--> 222     with self.no_logging(): self.fit(n_epoch, cbs=cb)
    223     if show_plot: self.recorder.plot_lr_find()
    224     if suggestions:

~/miniconda3/envs/fastai_V2/lib/python3.8/site-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
    210             self.opt.set_hypers(lr=self.lr if lr is None else lr)
    211             self.n_epoch = n_epoch
--> 212             self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
    213 
    214     def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

~/miniconda3/envs/fastai_V2/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    158 
    159     def _with_events(self, f, event_type, ex, final=noop):
--> 160         try: self(f'before_{event_type}');  f()
    161         except ex: self(f'after_cancel_{event_type}')
    162         self(f'after_{event_type}');  final()

~/miniconda3/envs/fastai_V2/lib/python3.8/site-packages/fastai/learner.py in _do_fit(self)
    201         for epoch in range(self.n_epoch):
    202             self.epoch=epoch
--> 203             self._with_events(self._do_epoch, 'epoch', CancelEpochException)
    204 
    205     def fit(self, n_epoch, lr=None, wd=None, cbs=None, reset_opt=False):

~/miniconda3/envs/fastai_V2/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    158 
    159     def _with_events(self, f, event_type, ex, final=noop):
--> 160         try: self(f'before_{event_type}');  f()
    161         except ex: self(f'after_cancel_{event_type}')
    162         self(f'after_{event_type}');  final()

~/miniconda3/envs/fastai_V2/lib/python3.8/site-packages/fastai/learner.py in _do_epoch(self)
    195 
    196     def _do_epoch(self):
--> 197         self._do_epoch_train()
    198         self._do_epoch_validate()
    199 

~/miniconda3/envs/fastai_V2/lib/python3.8/site-packages/fastai/learner.py in _do_epoch_train(self)
    187     def _do_epoch_train(self):
    188         self.dl = self.dls.train
--> 189         self._with_events(self.all_batches, 'train', CancelTrainException)
    190 
    191     def _do_epoch_validate(self, ds_idx=1, dl=None):

~/miniconda3/envs/fastai_V2/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    158 
    159     def _with_events(self, f, event_type, ex, final=noop):
--> 160         try: self(f'before_{event_type}');  f()
    161         except ex: self(f'after_cancel_{event_type}')
    162         self(f'after_{event_type}');  final()

~/miniconda3/envs/fastai_V2/lib/python3.8/site-packages/fastai/learner.py in all_batches(self)
    164     def all_batches(self):
    165         self.n_iter = len(self.dl)
--> 166         for o in enumerate(self.dl): self.one_batch(*o)
    167 
    168     def _do_one_batch(self):

~/miniconda3/envs/fastai_V2/lib/python3.8/site-packages/fastai/data/load.py in __iter__(self)
    106         self.randomize()
    107         self.before_iter()
--> 108         self.__idxs=self.get_idxs() # called in context of main process (not workers/subprocesses)
    109         for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
    110             # fix issue 2899. If the process start method isn't fork, the data will be copied to cuda in learner one_batch.

~/miniconda3/envs/fastai_V2/lib/python3.8/site-packages/fastai/callback/data.py in get_idxs(self)
     31         if self.n==0: return []
     32         if not self.shuffle: return super().get_idxs()
---> 33         return list(np.random.choice(self.n, self.n, p=self.wgts))
     34 
     35 # Cell

mtrand.pyx in numpy.random.mtrand.RandomState.choice()

ValueError: 'a' and 'p' must have same size

Any ideas as to where I am going wrong?

Cheers

I think the wgts in WeightedDL are applied only to the training set, not to your whole dataset (that makes sense, because you don't need to oversample your validation set).

Before creating your dataloaders, you can try creating a datasets object to get access to dsets.train, then calculate your wgts from there.
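For the setup in your first post, that could look something like this (a minimal sketch reusing your snd_blk and train_set from above):

dsets = snd_blk.datasets(train_set)
# the splitter has already been applied here, so dsets.train is only the
# training split -- your wgts must have len(dsets.train) entries, not len(train_set)
len(dsets.train)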

You can take a look at cell 15 of my notebook here: Healthy Lung Classification Spectrogram Fast.ai | Kaggle.

from collections import Counter

count = Counter(labels)  # class frequencies over the full list of string labels
# dsets.train yields (image, label_index) pairs; vocab maps the index back to the string
wgts = [1/count[dsets.vocab[label]] for img, label in dsets.train]
wgts[:10]
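Then pass those weights when building the dataloaders, for example (mirroring the call from your first post; only the shuffled training loader actually draws indices with these weights):

dls = snd_blk.dataloaders(train_set, dl_type=WeightedDL, wgts=wgts, bs=16)
# wgts now has one entry per training item, so the
# np.random.choice(self.n, self.n, p=self.wgts) call in get_idxs
# sees matching sizes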

Hope it helps.


Hi Dien-Hoa! :slight_smile:
What values should be stored in labels?

Would it be ['label_0', 'label_1']?

If that's the case, this is not working for me.

Here’s my code:

dblock = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_items=get_image_files,
    splitter=RandomSplitter(valid_pct=0.2, seed=42),
    get_y=parent_label,
    item_tfms=item_tfms,
    batch_tfms=batch_tfms)

dblock.dataloaders(path_imgs, bs=bs)
dls = dblock.dataloaders(path_imgs, dl_type=WeightedDL, wgts=wgts, bs=16)

# dls_patches_one.vocab = ['nodule', 'normal']

dblock.summary(path_patches_one, bs=bs, show_batch=True, unique=True)
# Found 245 items. 2 datasets of sizes 196,49

count = Counter(['nodule', 'normal'])
[1/count[dls_patches_one.vocab[l]] for i, l in dls_patches_one.train_ds]
# [1.0, 1.0, 1.0, ..., 1.0, 1.0, 1.0]

labels is the full list of labels in your data, one entry per item. Example: ['label_0', 'label_0', 'label_1', 'label_0', ...]. In your snippet, Counter(['nodule', 'normal']) only sees each class name once, so every count is 1 and every weight comes out as 1.0.
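A minimal sketch of the corrected computation, reusing the dls_patches_one dataloaders from your post:

from collections import Counter

# recover one string label per training item via the vocab
labels = [dls_patches_one.vocab[l] for _, l in dls_patches_one.train_ds]
count = Counter(labels)                  # real per-class counts
wgts = [1/count[lab] for lab in labels]  # rare classes get larger weights

You can then rebuild the dataloaders with dblock.dataloaders(path_imgs, dl_type=WeightedDL, wgts=wgts, bs=16) as in your snippet above.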