Error Setting up Pipeline: unhashable type: 'list'

Hello everyone,
I’m building a multi-label patent classification algorithm using both fastai v2 and transformers.
My issue is that I get an error when I try to create the dataloaders. Here is my code:

model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

        #on crée l'objet données et on les sépare en training et validation
        twipper_data = Twipper_data(training_dataset_path, training_column)
        twipper_data.clean_data()
        twipper_data.split_data()

        #twipper_data.tokenize(tokenizer)
        
        #https://walkwithfastai.com/nlp.external.transformers-glue#DataBlock-and-Transforms
        dls_kwargs = {
            'before_batch': TokBatchTransform(tokenizer = tokenizer, max_length=72),
            'create_batch': fa_convert
        }
        text_block = TransformBlock(dl_type=SortedDL, dls_kwargs=dls_kwargs, batch_tfms=Undict(), )
        
        #on crée le datablock
        datablock = DataBlock(blocks = [text_block, CategoryBlock()],
                   get_x=ColReader(twipper_data.input_col),
                   get_y=ColReader('_Labels', label_delim=' '),
                   splitter=ColSplitter())
        
        #print(datablock.summary(source = twipper_data.get_dataframe() ))
        
        #on crée le dataloader
        batch_size = 32
        twipper_data.clean_data()
        df = twipper_data.get_dataframe()
        #print(df.columns)
        dataloaders = datablock.dataloaders(source = twipper_data.get_dataframe(),
                                            bs = batch_size,
                                            seed = 0)

And here is the error stack trace:

TypeError                                 Traceback (most recent call last)
Input In [2], in <cell line: 1>()
----> 1 trainer = trainer()

File ~/twipper/twipper_v3/src/governor.py:54, in trainer(model_name, training_column, checkpoint_path, save_path, option)
     36 """
     37 processus d'entrainement
     38 
   (...)
     47 finis par enregister le modèle à l'addresse prévue
     48 """
     50 #on charge le modèle et le tokenize associé
     51 #AutoConfig/AutoTokenizer -> fonctions génériques pour pouvoir utiliser n'importe quel modèle sans avoir à importer les      libraries spécifiques
     52 #model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
     53 #tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
---> 54 trainer = Trainer(model_name = "Roberta",
     55                   training_column = 'extrait',
     56                   checkpoint_path = "pretrained_models/roberta/large/",
     57                   save_path = "models/",
     58                   training_dataset_path = "data/02-2022_medium_training.csv")

File ~/twipper/twipper_v3/src/trainer.py:75, in Trainer.__init__(self, model_name, training_column, checkpoint_path, save_path, training_dataset_path, option)
     73 df = twipper_data.get_dataframe()
     74 #print(df.columns)
---> 75 dataloaders = datablock.dataloaders(source = twipper_data.get_dataframe(),
     76                                     bs = batch_size,
     77                                     seed = 0)
     78 """dataloader = TextDataLoaders.from_df(df = twipper_data.get_dataframe(),
     79                                      bs = batch_size,
     80                                      text_col = training_column,
     81                                      label_col = '_Labels',
     82                                      label_delim=' ',
     83                                      valid_pct = 0.5)"""
     85 #on crée le learner
     86     #objet qui va gérer le calcul du gradient, l'évolution des poids etc...

File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastai/data/block.py:113, in DataBlock.dataloaders(self, source, path, verbose, **kwargs)
    112 def dataloaders(self, source, path='.', verbose=False, **kwargs):
--> 113     dsets = self.datasets(source, verbose=verbose)
    114     kwargs = {**self.dls_kwargs, **kwargs, 'verbose': verbose}
    115     return dsets.dataloaders(path=path, after_item=self.item_tfms, after_batch=self.batch_tfms, **kwargs)

File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastai/data/block.py:110, in DataBlock.datasets(self, source, verbose)
    108 splits = (self.splitter or RandomSplitter())(items)
    109 pv(f"{len(splits)} datasets of sizes {','.join([str(len(s)) for s in splits])}", verbose)
--> 110 return Datasets(items, tfms=self._combine_type_tfms(), splits=splits, dl_type=self.dl_type, n_inp=self.n_inp, verbose=verbose)

File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastai/data/core.py:334, in Datasets.__init__(self, items, tfms, tls, n_inp, dl_type, **kwargs)
    332 def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
    333     super().__init__(dl_type=dl_type)
--> 334     self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
    335     self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))

File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastai/data/core.py:334, in <listcomp>(.0)
    332 def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
    333     super().__init__(dl_type=dl_type)
--> 334     self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
    335     self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))

File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastcore/foundation.py:97, in _L_Meta.__call__(cls, x, *args, **kwargs)
     95 def __call__(cls, x=None, *args, **kwargs):
     96     if not args and not kwargs and x is not None and isinstance(x,cls): return x
---> 97     return super().__call__(x, *args, **kwargs)

File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastai/data/core.py:257, in TfmdLists.__init__(self, items, tfms, use_list, do_setup, split_idx, train_setup, splits, types, verbose, dl_type)
    255 if do_setup:
    256     pv(f"Setting up {self.tfms}", verbose)
--> 257     self.setup(train_setup=train_setup)

File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastai/data/core.py:276, in TfmdLists.setup(self, train_setup)
    275 def setup(self, train_setup=True):
--> 276     self.tfms.setup(self, train_setup)
    277     if len(self) != 0:
    278         x = super().__getitem__(0) if self.splits is None else super().__getitem__(self.splits[0])[0]

File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastcore/transform.py:192, in Pipeline.setup(self, items, train_setup)
    190 tfms = self.fs[:]
    191 self.fs.clear()
--> 192 for t in tfms: self.add(t,items, train_setup)

File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastcore/transform.py:196, in Pipeline.add(self, ts, items, train_setup)
    194 def add(self,ts, items=None, train_setup=False):
    195     if not is_listy(ts): ts=[ts]
--> 196     for t in ts: t.setup(items, train_setup)
    197     self.fs+=ts
    198     self.fs = self.fs.sorted(key='order')

File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastcore/transform.py:79, in Transform.setup(self, items, train_setup)
     77 def setup(self, items=None, train_setup=False):
     78     train_setup = train_setup if self.train_setup is None else self.train_setup
---> 79     return self.setups(getattr(items, 'train', items) if train_setup else items)

File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastcore/dispatch.py:123, in TypeDispatch.__call__(self, *args, **kwargs)
    121 elif self.inst is not None: f = MethodType(f, self.inst)
    122 elif self.owner is not None: f = MethodType(f, self.owner)
--> 123 return f(*args, **kwargs)

File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastai/data/transforms.py:251, in Categorize.setups(self, dsets)
    250 def setups(self, dsets):
--> 251     if self.vocab is None and dsets is not None: self.vocab = CategoryMap(dsets, sort=self.sort, add_na=self.add_na)
    252     self.c = len(self.vocab)

File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastai/data/transforms.py:227, in CategoryMap.__init__(self, col, sort, add_na, strict)
    225     if not hasattr(col,'unique'): col = L(col, use_list=True)
    226     # `o==o` is the generalized definition of non-NaN used by Pandas
--> 227     items = L(o for o in col.unique() if o==o)
    228     if sort: items = items.sorted()
    229 self.items = '#na#' + items if add_na else items

File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastcore/foundation.py:163, in L.unique(self, sort, bidir, start)
--> 163 def unique(self, sort=False, bidir=False, start=None): return L(uniqueify(self, sort=sort, bidir=bidir, start=start))

File ~/venvs/twipper_v3/lib64/python3.8/site-packages/fastcore/basics.py:662, in uniqueify(x, sort, bidir, start)
    660 def uniqueify(x, sort=False, bidir=False, start=None):
    661     "Unique elements in `x`, optional `sort`, optional return reverse correspondence, optional prepend with elements."
--> 662     res = list(dict.fromkeys(x))
    663     if start is not None: res = listify(start)+res
    664     if sort: res.sort()

TypeError: unhashable type: 'list'

My dataset is composed of a column “extrait” (the input text) and a column “_Labels” (a string of labels separated by spaces).

Hi @SaveYourLifes ,

Since you’re trying to solve a multi-label problem, you need to define your datablock accordingly.

datablock = DataBlock(blocks = [text_block, MultiCategoryBlock],
                   get_x=ColReader(twipper_data.input_col),
                   get_y=ColReader('_Labels', label_delim=' '),
                   splitter=ColSplitter())

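Your output type should be a MultiCategoryBlock instead of a CategoryBlock. With label_delim=' ', ColReader returns a list of labels for each row, and CategoryBlock's Categorize transform tries to build its vocab by hashing each target, which fails with "unhashable type: 'list'" (see Categorize.setups and CategoryMap in your traceback). MultiCategoryBlock expects a list of labels per item and one-hot encodes it. Hope this helps.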

Thanks,
Vinayak.