Hello, I want to take a pretrained HuggingFace Transformer and fine-tune it to my own use-case (let’s say IMDB) using the FastAI framework. Although there are other HF implementations of this, I want to do this to learn!
So far I am stuck on how to create a IMDB Text DataLoader for the HF model. I’m not sure how to create a proper TextBlock to feed into HF.
I saw there is a tutorial on the fastai website - fastai - Transformers - which uses a DataFrame of text. How would I do this when the text is like the imdb
folder in fastai - e.g. individual text files with the folder name as the label?
Any suggestions are greatly appreciated.
Edit: Here is my code
Code
class MyHFTokenTransform(ItemTransform):
def encodes(self, text_path: Path):
label = parent_label(text_path)
return tensor(self.tokenize(text_path)), label
def decodes(self, inputs):
return TitledStr(tokenizer.decode(inputs[0].cpu().numpy()))
def tokenize(self, text_path):
with open(text_path, 'rb') as f:
text = f.read()
return tokenizer(str(text), max_length=77)['input_ids']
dls = TfmdLists([text_path/'train/pos/2500_9.txt', text_path/'train/pos/8127_8.txt'], [MyHFTokenTransform, ToTensor()]).dataloaders(bs=2)
dls.show_batch(max_n=2)
Output
RecursionError Traceback (most recent call last)
Input In [14], in <cell line: 2>()
1 dls = TfmdLists([text_path/'train/pos/2500_9.txt', text_path/'train/pos/8127_8.txt'], [MyHFTokenTransform, ToTensor()]).dataloaders(bs=2)
----> 2 dls.show_batch(max_n=2)
File /usr/local/lib/python3.9/dist-packages/fastai/data/core.py:151, in TfmdDL.show_batch(self, b, max_n, ctxs, show, unique, **kwargs)
149 if b is None: b = self.one_batch()
150 if not show: return self._pre_show_batch(b, max_n=max_n)
--> 151 show_batch(*self._pre_show_batch(b, max_n=max_n), ctxs=ctxs, max_n=max_n, **kwargs)
152 if unique: self.get_idxs = old_get_idxs
File /usr/local/lib/python3.9/dist-packages/fastai/data/core.py:133, in TfmdDL._pre_show_batch(self, b, max_n)
131 b = self.decode(b)
132 if hasattr(b, 'show'): return b,None,None
--> 133 its = self._decode_batch(b, max_n, full=False)
134 if not is_listy(b): b,its = [b],L((o,) for o in its)
135 return detuplify(b[:self.n_inp]),detuplify(b[self.n_inp:]),its
File /usr/local/lib/python3.9/dist-packages/fastai/data/core.py:127, in TfmdDL._decode_batch(self, b, max_n, full)
125 f1 = self.before_batch.decode
126 f = compose(f1, f, partial(getcallable(self.dataset,'decode'), full = full))
--> 127 return L(batch_to_samples(b, max_n=max_n)).map(f)
File /usr/local/lib/python3.9/dist-packages/fastai/torch_core.py:670, in batch_to_samples(b, max_n)
668 if isinstance(b, Tensor): return retain_types(list(b[:max_n]), [b])
669 else:
--> 670 res = L(b).map(partial(batch_to_samples,max_n=max_n))
671 return retain_types(res.zip(), [b])
File /usr/local/lib/python3.9/dist-packages/fastcore/foundation.py:156, in L.map(self, f, gen, *args, **kwargs)
--> 156 def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))
File /usr/local/lib/python3.9/dist-packages/fastcore/basics.py:840, in map_ex(iterable, f, gen, *args, **kwargs)
838 res = map(g, iterable)
839 if gen: return res
--> 840 return list(res)
File /usr/local/lib/python3.9/dist-packages/fastcore/basics.py:825, in bind.__call__(self, *args, **kwargs)
823 if isinstance(v,_Arg): kwargs[k] = args.pop(v.i)
824 fargs = [args[x.i] if isinstance(x, _Arg) else x for x in self.pargs] + args[self.maxi+1:]
--> 825 return self.func(*fargs, **kwargs)
File /usr/local/lib/python3.9/dist-packages/fastai/torch_core.py:670, in batch_to_samples(b, max_n)
668 if isinstance(b, Tensor): return retain_types(list(b[:max_n]), [b])
669 else:
--> 670 res = L(b).map(partial(batch_to_samples,max_n=max_n))
671 return retain_types(res.zip(), [b])
File /usr/local/lib/python3.9/dist-packages/fastcore/foundation.py:156, in L.map(self, f, gen, *args, **kwargs)
--> 156 def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))
File /usr/local/lib/python3.9/dist-packages/fastcore/basics.py:840, in map_ex(iterable, f, gen, *args, **kwargs)
838 res = map(g, iterable)
839 if gen: return res
--> 840 return list(res)
File /usr/local/lib/python3.9/dist-packages/fastcore/basics.py:825, in bind.__call__(self, *args, **kwargs)
823 if isinstance(v,_Arg): kwargs[k] = args.pop(v.i)
824 fargs = [args[x.i] if isinstance(x, _Arg) else x for x in self.pargs] + args[self.maxi+1:]
--> 825 return self.func(*fargs, **kwargs)
[... skipping similar frames: batch_to_samples at line 670 (492 times), bind.__call__ at line 825 (491 times), L.map at line 156 (491 times), map_ex at line 840 (491 times)]
File /usr/local/lib/python3.9/dist-packages/fastcore/foundation.py:156, in L.map(self, f, gen, *args, **kwargs)
--> 156 def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))
File /usr/local/lib/python3.9/dist-packages/fastcore/basics.py:840, in map_ex(iterable, f, gen, *args, **kwargs)
838 res = map(g, iterable)
839 if gen: return res
--> 840 return list(res)
File /usr/local/lib/python3.9/dist-packages/fastcore/basics.py:825, in bind.__call__(self, *args, **kwargs)
823 if isinstance(v,_Arg): kwargs[k] = args.pop(v.i)
824 fargs = [args[x.i] if isinstance(x, _Arg) else x for x in self.pargs] + args[self.maxi+1:]
--> 825 return self.func(*fargs, **kwargs)
File /usr/local/lib/python3.9/dist-packages/fastai/torch_core.py:670, in batch_to_samples(b, max_n)
668 if isinstance(b, Tensor): return retain_types(list(b[:max_n]), [b])
669 else:
--> 670 res = L(b).map(partial(batch_to_samples,max_n=max_n))
671 return retain_types(res.zip(), [b])
File /usr/local/lib/python3.9/dist-packages/fastcore/foundation.py:98, in _L_Meta.__call__(cls, x, *args, **kwargs)
96 def __call__(cls, x=None, *args, **kwargs):
97 if not args and not kwargs and x is not None and isinstance(x,cls): return x
---> 98 return super().__call__(x, *args, **kwargs)
File /usr/local/lib/python3.9/dist-packages/fastcore/foundation.py:106, in L.__init__(self, items, use_list, match, *rest)
104 def __init__(self, items=None, *rest, use_list=False, match=None):
105 if (use_list is not None) or not is_array(items):
--> 106 items = listify(items, *rest, use_list=use_list, match=match)
107 super().__init__(items)
File /usr/local/lib/python3.9/dist-packages/fastcore/basics.py:64, in listify(o, use_list, match, *rest)
62 if use_list: res = list(o)
63 elif o is None: res = []
---> 64 elif isinstance(o, list): res = o
65 elif isinstance(o, str) or is_array(o): res = [o]
66 elif is_iter(o): res = list(o)
RecursionError: maximum recursion depth exceeded while calling a Python object
NOTE: the recursion error goes away if I only return the token_ids
in encodes vs both the token_ids and the label