Hi experts,
I hope that this is the correct subcategory to post this issue in. I’ve been trying to recreate this notebook https://github.com/n-waves/multifit/blob/master/notebooks/MLDoc-JA-multifit_fp16.ipynb
on my own data for multiclass text classification. I’ve skipped pretraining and went right ahead with loading the pretrained German Multifit model. Finetuning the model went smoothly, with my dataset split into train.csv, dev.csv and test.csv. In these files, the first column contains the texts while the second column contains the labels.
I’m currently stuck at the next step of the notebook: exp.classifier.train_(seed=0)
with the following error message appearing:
TypeError: '<' not supported between instances of 'float' and 'str'
Full Traceback:
TypeError Traceback (most recent call last)
<ipython-input-45-78541ce330fe> in <module>
----> 1 exp.classifier.train_(seed=0)
/opt/anaconda3/lib/python3.7/site-packages/multifit/training.py in train_(self, dataset_or_path, **train_config)
403 base_tokenizer = self.base.tokenizer
404 dataset = self._set_dataset_(dataset_or_path, base_tokenizer)
--> 405 data_clas = dataset.load_clas_databunch(bs=self.bs)
406 learn = self.get_learner(data_clas=data_clas)
407 print(f"Training: {learn.path / learn.model_dir}")
/opt/anaconda3/lib/python3.7/site-packages/multifit/datasets/dataset.py in load_clas_databunch(self, bs)
234
235 args = dict(vocab=vocab, bunch_class=TextClasDataBunch, bs=bs)
--> 236 data_cls = self.load_n_cache_databunch(cls_name, data_loader=lambda: self.load_supervised_data()[:2], **args)
237 # Hack to load test dataset with labels
238 data_tst = self.load_n_cache_databunch('tst', data_loader=lambda: self.load_supervised_data()[1:], **args)
/opt/anaconda3/lib/python3.7/site-packages/multifit/datasets/dataset.py in load_n_cache_databunch(self, name, bunch_class, data_loader, bs, **args)
252 print(f"Running tokenization: '{name}' ...")
253 train_df, valid_df = data_loader()
--> 254 databunch = self.databunch_from_df(bunch_class, train_df, valid_df, **args)
255 databunch.save(name)
256 print(f"Data {name}, trn: {len(databunch.train_ds)}, val: {len(databunch.valid_ds)}")
/opt/anaconda3/lib/python3.7/site-packages/multifit/datasets/dataset.py in databunch_from_df(self, bunch_class, train_df, valid_df, **args)
265 mark_fields=True,
266 text_cols=list(train_df.columns.values)[1:],
--> 267 **args)
268 return databunch
269
/opt/anaconda3/lib/python3.7/site-packages/fastai_contrib/text_data.py in make_data_bunch_from_df(cls, path, train_df, valid_df, tokenizer, vocab, classes, text_cols, label_cols, label_delim, chunksize, max_vocab, min_freq, mark_fields, include_bos, include_eos, processor, **kwargs)
151 src = src.label_from_df(cols=label_cols, classes=classes, label_delim=label_delim)
152 else:
--> 153 src = src.label_from_df(cols=label_cols, classes=classes)
154 return src.databunch(**kwargs)
/opt/anaconda3/lib/python3.7/site-packages/fastai/data_block.py in _inner(*args, **kwargs)
478 self.valid = fv(*args, from_item_lists=True, **kwargs)
479 self.__class__ = LabelLists
--> 480 self.process()
481 return self
482 return _inner
/opt/anaconda3/lib/python3.7/site-packages/fastai/data_block.py in process(self)
532 "Process the inner datasets."
533 xp,yp = self.get_processors()
--> 534 for ds,n in zip(self.lists, ['train','valid','test']): ds.process(xp, yp, name=n)
535 #progress_bar clear the outputs so in some case warnings issued during processing disappear.
536 for ds in self.lists:
/opt/anaconda3/lib/python3.7/site-packages/fastai/data_block.py in process(self, xp, yp, name, max_warn_items)
698 def process(self, xp:PreProcessor=None, yp:PreProcessor=None, name:str=None, max_warn_items:int=5):
699 "Launch the processing on `self.x` and `self.y` with `xp` and `yp`."
--> 700 self.y.process(yp)
701 if getattr(self.y, 'filter_missing_y', False):
702 filt = array([o is None for o in self.y.items])
/opt/anaconda3/lib/python3.7/site-packages/fastai/data_block.py in process(self, processor)
82 if processor is not None: self.processor = processor
83 self.processor = listify(self.processor)
---> 84 for p in self.processor: p.process(self)
85 return self
86
/opt/anaconda3/lib/python3.7/site-packages/fastai/data_block.py in process(self, ds)
346
347 def process(self, ds):
--> 348 if self.classes is None: self.create_classes(self.generate_classes(ds.items))
349 ds.classes = self.classes
350 ds.c2i = self.c2i
/opt/anaconda3/lib/python3.7/site-packages/fastai/data_block.py in generate_classes(self, items)
337 def generate_classes(self, items):
338 "Generate classes from `items` by taking the sorted unique values."
--> 339 return uniqueify(items, sort=True)
340
341 def process_one(self,item):
/opt/anaconda3/lib/python3.7/site-packages/fastai/core.py in uniqueify(x, sort)
101 "Return sorted unique values of `x`."
102 res = list(OrderedDict.fromkeys(x).keys())
--> 103 if sort: res.sort()
104 return res
105
TypeError: '<' not supported between instances of 'float' and 'str'
I looked at the error message and the code, trying to figure out where the mistake in my data is.
So far I’ve tried:
- Converting the labels in my .csv files to type “int” to make 100% sure that they are not of any other type
- Switching the columns and making the first column contain the labels and the second column contain the texts (as it seems like the dataset.py module wants to have the text in the second column)
Any ideas on where I went wrong? I find it odd that exp.finetune_lm.train_('train')
works perfectly fine with the data saved in this format, while the very next step throws an error. Also, I’m a bit confused as to why dataset.py appears to require the first column to be the label column.
Let me know if you need more information on the error.
Many thanks!