Hello everyone,
I’m building a multi-label patent classification algorithm using both fastai v2 and transformers.
My issue is that when I try to create the dataloader I get an error. here is my code
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
#on crée l'objet données et on les sépare en training et validation
twipper_data = Twipper_data(training_dataset_path, training_column)
twipper_data.clean_data()
twipper_data.split_data()
#twipper_data.tokenize(tokenizer)
#https://walkwithfastai.com/nlp.external.transformers-glue#DataBlock-and-Transforms
dls_kwargs = {
'before_batch': TokBatchTransform(tokenizer = tokenizer, max_length=72),
'create_batch': fa_convert
}
text_block = TransformBlock(dl_type=SortedDL, dls_kwargs=dls_kwargs, batch_tfms=Undict(), )
#on crée le datablock
datablock = DataBlock(blocks = [text_block, CategoryBlock()],
get_x=ColReader(twipper_data.input_col),
get_y=ColReader('_Labels', label_delim=' '),
splitter=ColSplitter())
#print(datablock.summary(source = twipper_data.get_dataframe() ))
#on crée le dataloader
batch_size = 32
twipper_data.clean_data()
df = twipper_data.get_dataframe()
#print(df.columns)
dataloaders = datablock.dataloaders(source = twipper_data.get_dataframe(),
bs = batch_size,
seed = 0)
and here is the error stack:
TypeError Traceback (most recent call last)
Input In [2], in <cell line: 1>()
----> 1 trainer = trainer()
File ~/twipper/twipper_v3/src/governor.py:54, in trainer(model_name, training_column, checkpoint_path, save_path, option)
36 """
37 processus d'entrainement
38
(...)
47 finis par enregister le modèle à l'addresse prévue
48 """
50 #on charge le modèle et le tokenize associé
51 #AutoConfig/AutoTokenizer -> fonctions génériques pour pouvoir utiliser n'importe quel modèle sans avoir à importer les libraries spécifiques
52 #model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
53 #tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
---> 54 trainer = Trainer(model_name = "Roberta",
55 training_column = 'extrait',
56 checkpoint_path = "pretrained_models/roberta/large/",
57 save_path = "models/",
58 training_dataset_path = "data/02-2022_medium_training.csv")
File ~/twipper/twipper_v3/src/trainer.py:75, in Trainer.__init__(self, model_name, training_column, checkpoint_path, save_path, training_dataset_path, option)
73 df = twipper_data.get_dataframe()
74 #print(df.columns)
---> 75 dataloaders = datablock.dataloaders(source = twipper_data.get_dataframe(),
76 bs = batch_size,
77 seed = 0)
78 """dataloader = TextDataLoaders.from_df(df = twipper_data.get_dataframe(),
79 bs = batch_size,
80 text_col = training_column,
81 label_col = '_Labels',
82 label_delim=' ',
83 valid_pct = 0.5)"""
85 #on crée le learner
86 #objet qui va gérer le calcul du gradient, l'évolution des poids etc...
File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastai/data/block.py:113, in DataBlock.dataloaders(self, source, path, verbose, **kwargs)
112 def dataloaders(self, source, path='.', verbose=False, **kwargs):
--> 113 dsets = self.datasets(source, verbose=verbose)
114 kwargs = {**self.dls_kwargs, **kwargs, 'verbose': verbose}
115 return dsets.dataloaders(path=path, after_item=self.item_tfms, after_batch=self.batch_tfms, **kwargs)
File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastai/data/block.py:110, in DataBlock.datasets(self, source, verbose)
108 splits = (self.splitter or RandomSplitter())(items)
109 pv(f"{len(splits)} datasets of sizes {','.join([str(len(s)) for s in splits])}", verbose)
--> 110 return Datasets(items, tfms=self._combine_type_tfms(), splits=splits, dl_type=self.dl_type, n_inp=self.n_inp, verbose=verbose)
File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastai/data/core.py:334, in Datasets.__init__(self, items, tfms, tls, n_inp, dl_type, **kwargs)
332 def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
333 super().__init__(dl_type=dl_type)
--> 334 self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
335 self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastai/data/core.py:334, in <listcomp>(.0)
332 def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
333 super().__init__(dl_type=dl_type)
--> 334 self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
335 self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastcore/foundation.py:97, in _L_Meta.__call__(cls, x, *args, **kwargs)
95 def __call__(cls, x=None, *args, **kwargs):
96 if not args and not kwargs and x is not None and isinstance(x,cls): return x
---> 97 return super().__call__(x, *args, **kwargs)
File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastai/data/core.py:257, in TfmdLists.__init__(self, items, tfms, use_list, do_setup, split_idx, train_setup, splits, types, verbose, dl_type)
255 if do_setup:
256 pv(f"Setting up {self.tfms}", verbose)
--> 257 self.setup(train_setup=train_setup)
File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastai/data/core.py:276, in TfmdLists.setup(self, train_setup)
275 def setup(self, train_setup=True):
--> 276 self.tfms.setup(self, train_setup)
277 if len(self) != 0:
278 x = super().__getitem__(0) if self.splits is None else super().__getitem__(self.splits[0])[0]
File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastcore/transform.py:192, in Pipeline.setup(self, items, train_setup)
190 tfms = self.fs[:]
191 self.fs.clear()
--> 192 for t in tfms: self.add(t,items, train_setup)
File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastcore/transform.py:196, in Pipeline.add(self, ts, items, train_setup)
194 def add(self,ts, items=None, train_setup=False):
195 if not is_listy(ts): ts=[ts]
--> 196 for t in ts: t.setup(items, train_setup)
197 self.fs+=ts
198 self.fs = self.fs.sorted(key='order')
File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastcore/transform.py:79, in Transform.setup(self, items, train_setup)
77 def setup(self, items=None, train_setup=False):
78 train_setup = train_setup if self.train_setup is None else self.train_setup
---> 79 return self.setups(getattr(items, 'train', items) if train_setup else items)
File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastcore/dispatch.py:123, in TypeDispatch.__call__(self, *args, **kwargs)
121 elif self.inst is not None: f = MethodType(f, self.inst)
122 elif self.owner is not None: f = MethodType(f, self.owner)
--> 123 return f(*args, **kwargs)
File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastai/data/transforms.py:251, in Categorize.setups(self, dsets)
250 def setups(self, dsets):
--> 251 if self.vocab is None and dsets is not None: self.vocab = CategoryMap(dsets, sort=self.sort, add_na=self.add_na)
252 self.c = len(self.vocab)
File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastai/data/transforms.py:227, in CategoryMap.__init__(self, col, sort, add_na, strict)
225 if not hasattr(col,'unique'): col = L(col, use_list=True)
226 # `o==o` is the generalized definition of non-NaN used by Pandas
--> 227 items = L(o for o in col.unique() if o==o)
228 if sort: items = items.sorted()
229 self.items = '#na#' + items if add_na else items
File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastcore/foundation.py:163, in L.unique(self, sort, bidir, start)
--> 163 def unique(self, sort=False, bidir=False, start=None): return L(uniqueify(self, sort=sort, bidir=bidir, start=start))
File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastcore/basics.py:662, in uniqueify(x, sort, bidir, start)
660 def uniqueify(x, sort=False, bidir=False, start=None):
661 "Unique elements in `x`, optional `sort`, optional return reverse correspondence, optional prepend with elements."
--> 662 res = list(dict.fromkeys(x))
663 if start is not None: res = listify(start)+res
664 if sort: res.sort()
TypeError: unhashable type: 'list'
My dataset is composed of a column “extrait” ( that’s the input text) and a column “_Labels” ( which is a string of labels seperated by a space)