I also wanted to clean the (training + validation) datasets.
By following @lesscomfortable’s recommendation, what i did is:
db = (ImageItemList.from_folder(path)
.no_split()
.label_from_folder()
.databunch())
learn = create_cnn(db, models.resnet34, metrics=error_rate)
learn.load('stage-2')
ds, idxs = DatasetFormatter().from_toplosses(learn)
But it ended up with error as below:
RuntimeError Traceback (most recent call last)
<ipython-input-20-59787071ac27> in <module>
6 learn = create_cnn(db, models.resnet34, metrics=error_rate)
7 learn.load('stage-2')
----> 8 ds, idxs = DatasetFormatter().from_toplosses(learn)
/opt/anaconda3/lib/python3.7/site-packages/fastai/widgets/image_cleaner.py in from_toplosses(cls, learn, n_imgs, **kwargs)
17 def from_toplosses(cls, learn, n_imgs=None, **kwargs):
18 "Gets indices with top losses."
---> 19 train_ds, train_idxs = cls.get_toplosses_idxs(learn, n_imgs, **kwargs)
20 return train_ds, train_idxs
21
/opt/anaconda3/lib/python3.7/site-packages/fastai/widgets/image_cleaner.py in get_toplosses_idxs(cls, learn, n_imgs, **kwargs)
25 dl = learn.data.fix_dl
26 if not n_imgs: n_imgs = len(dl.dataset)
---> 27 _,_,top_losses = learn.get_preds(ds_type=DatasetType.Fix, with_loss=True)
28 idxs = torch.topk(top_losses, n_imgs)[1]
29 return cls.padded_ds(dl.dataset, **kwargs), idxs
/opt/anaconda3/lib/python3.7/site-packages/fastai/basic_train.py in get_preds(self, ds_type, with_loss, n_batch, pbar)
253 lf = self.loss_func if with_loss else None
254 return get_preds(self.model, self.dl(ds_type), cb_handler=CallbackHandler(self.callbacks),
--> 255 activ=_loss_func2activ(self.loss_func), loss_func=lf, n_batch=n_batch, pbar=pbar)
256
257 def pred_batch(self, ds_type:DatasetType=DatasetType.Valid, batch:Tuple=None, reconstruct:bool=False) -> List[Tensor]:
/opt/anaconda3/lib/python3.7/site-packages/fastai/basic_train.py in get_preds(model, dl, pbar, cb_handler, activ, loss_func, n_batch)
38 "Tuple of predictions and targets, and optional losses (if `loss_func`) using `dl`, max batches `n_batch`."
39 res = [torch.cat(o).cpu() for o in
---> 40 zip(*validate(model, dl, cb_handler=cb_handler, pbar=pbar, average=False, n_batch=n_batch))]
41 if loss_func is not None: res.append(calc_loss(res[0], res[1], loss_func))
42 if activ is not None: res[0] = activ(res[0])
/opt/anaconda3/lib/python3.7/site-packages/fastai/basic_train.py in validate(model, dl, loss_func, cb_handler, pbar, average, n_batch)
50 val_losses,nums = [],[]
51 if cb_handler: cb_handler.set_dl(dl)
---> 52 for xb,yb in progress_bar(dl, parent=pbar, leave=(pbar is not None)):
53 if cb_handler: xb, yb = cb_handler.on_batch_begin(xb, yb, train=False)
54 val_losses.append(loss_batch(model, xb, yb, loss_func, cb_handler=cb_handler))
/opt/anaconda3/lib/python3.7/site-packages/fastprogress/fastprogress.py in __iter__(self)
63 self.update(0)
64 try:
---> 65 for i,o in enumerate(self._gen):
66 yield o
67 if self.auto_update: self.update(i+1)
/opt/anaconda3/lib/python3.7/site-packages/fastai/basic_data.py in __iter__(self)
69 def __iter__(self):
70 "Process and returns items from `DataLoader`."
---> 71 for b in self.dl: yield self.proc_batch(b)
72
73 @classmethod
/opt/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py in __next__(self)
635 self.reorder_dict[idx] = batch
636 continue
--> 637 return self._process_next_batch(batch)
638
639 next = __next__ # Python 2 compatibility
/opt/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _process_next_batch(self, batch)
656 self._put_indices()
657 if isinstance(batch, ExceptionWrapper):
--> 658 raise batch.exc_type(batch.exc_msg)
659 return batch
660
RuntimeError: Traceback (most recent call last):
File "/opt/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 138, in _worker_loop
samples = collate_fn([dataset[i] for i in batch_indices])
File "/opt/anaconda3/lib/python3.7/site-packages/fastai/torch_core.py", line 110, in data_collate
return torch.utils.data.dataloader.default_collate(to_data(batch))
File "/opt/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 232, in default_collate
return [default_collate(samples) for samples in transposed]
File "/opt/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 232, in <listcomp>
return [default_collate(samples) for samples in transposed]
File "/opt/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 209, in default_collate
return torch.stack(batch, 0, out=out)
RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 333 and 282 in dimension 2 at /opt/conda/conda-bld/pytorch_1544202130060/work/aten/src/TH/generic/THTensorMoreMath.cpp:1333
look like it is looking for the losses
when calling DatasetFormatter().from_toplosses(learn)
, but actually I don’t have it because I only load the model’s weights, i didn’t run fit_one_cycle()
.
Can you please help here.