I’m having a difficult time understanding how to plug in a custom text tokenizer via the tok_tfm parameter.
My goal is simply to replicate the functionality from fastai version 1.
fastai v1:
all_letters = list(string.printable + string.whitespace)
customtokenizer = Tokenizer(pre_rules= [], post_rules=[])
processors = [TokenizeProcessor(tokenizer=customtokenizer, mark_fields=False),
NumericalizeProcessor(vocab=Vocab.create(all_letters, max_vocab=1000, min_freq=0))]
data = (TextList.from_csv(path, "songs_8.csv", cols='text', processor=processors)
.split_by_rand_pct(0.2)
.label_for_lm()
.databunch(bs=96))
data.save('data_block_lm4.pkl')
fastai v2 (my attempt; calling dls.show_batch() afterwards raises the error below):
dls = TextDataLoaders.from_csv(path=path, bs=8, header="infer", csv_fname='songs_8.csv', text_col='text', label_col='label', is_lm=True, tok_tfm=noop)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
in
----> 1 dls.show_batch()
~\anaconda3\lib\site-packages\fastai\data\core.py in show_batch(self, b, max_n, ctxs, show, unique, **kwargs)
100 if b is None: b = self.one_batch()
101 if not show: return self._pre_show_batch(b, max_n=max_n)
--> 102 show_batch(*self._pre_show_batch(b, max_n=max_n), ctxs=ctxs, max_n=max_n, **kwargs)
103 if unique: self.get_idxs = old_get_idxs
104
~\anaconda3\lib\site-packages\fastcore\dispatch.py in __call__(self, *args, **kwargs)
108 if not f: return args[0]
109 if self.inst is not None: f = MethodType(f, self.inst)
--> 110 return f(*args, **kwargs)
111
112 def __get__(self, inst, owner):
~\anaconda3\lib\site-packages\fastai\text\data.py in show_batch(x, y, samples, ctxs, max_n, trunc_at, **kwargs)
118 @typedispatch
119 def show_batch(x: LMTensorText, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):
--> 120 samples = L((s[0].truncate(trunc_at), s[1].truncate(trunc_at)) for s in samples)
121 return show_batch[TensorText](x, None, samples, ctxs=ctxs, max_n=max_n, trunc_at=None, **kwargs)
122
~\anaconda3\lib\site-packages\fastcore\foundation.py in __call__(cls, x, *args, **kwargs)
49 return x
50
---> 51 res = super().__call__(*((x,) + args), **kwargs)
52 res._newchk = 0
53 return res
~\anaconda3\lib\site-packages\fastcore\foundation.py in __init__(self, items, use_list, match, *rest)
331 if items is None: items = []
332 if (use_list is not None) or not _is_array(items):
--> 333 items = list(items) if use_list else _listify(items)
334 if match is not None:
335 if is_coll(match): match = len(match)
~\anaconda3\lib\site-packages\fastcore\foundation.py in _listify(o)
244 if isinstance(o, list): return o
245 if isinstance(o, str) or _is_array(o): return [o]
--> 246 if is_iter(o): return list(o)
247 return [o]
248
~\anaconda3\lib\site-packages\fastai\text\data.py in <genexpr>(.0)
118 @typedispatch
119 def show_batch(x: LMTensorText, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):
--> 120 samples = L((s[0].truncate(trunc_at), s[1].truncate(trunc_at)) for s in samples)
121 return show_batch[TensorText](x, None, samples, ctxs=ctxs, max_n=max_n, trunc_at=None, **kwargs)
122
AttributeError: 'L' object has no attribute 'truncate'