Custom transforms for a language model

I’m working on a model to output synthetic code samples. As part of that, I’ll need to use some custom transforms to tokenize the code (eventually, I’d like to use a proper compiler/lexer-based tokenizer, but I’m just prototyping for now).

I’m running into issues in getting my transform pipeline to work. Does anyone know why this code might be giving me this error: AttributeError: 'PosixPath' object has no attribute 'split':

# Locate the Lisp source files and set training hyperparameters.
path = Path('data/lisp')
files = get_text_files(path)

bs = 70   # batch size
sl = 16   # sequence length

# Hold out the final 20% of files for validation.
cut = int(0.8 * len(files))
idxs = list(range(len(files)))
splits = [idxs[:cut], idxs[cut:]]

class TxtFromFile(Transform):
    "Read a file path and return its raw text for the rest of the pipeline."
    def encodes(self, x):
        # Pin the encoding explicitly: the default for open() is
        # locale-dependent, so the same data can raise UnicodeDecodeError
        # on another machine.
        with open(x, encoding='utf-8') as f:
            return f.read()

class CodeTokenizer(Transform):
    "Naive tokenizer: break the source text on single space characters."
    def encodes(self, x):
        tokens = x.split(' ')
        return tokens

# NOTE(review): Datasets expects tfms to be a *list of pipelines* (one per
# tuple element). Passed flat like this, the list comprehension in
# Datasets.__init__ (see traceback) builds one TfmdLists per transform, so
# CodeTokenizer receives the raw Path items — hence the PosixPath error
# below. Wrapping it as [tfms] makes it a single pipeline.
tfms = [TxtFromFile, CodeTokenizer, Numericalize]

dsets = Datasets(files, tfms=tfms, splits=splits, dl_type=LMDataLoader)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-19-0b2b1949621f> in <module>
----> 1 dsets = Datasets(files, tfms=tfms, splits=splits, dl_type=LMDataLoader)

~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastai/data/core.py in __init__(self, items, tfms, tls, n_inp, dl_type, **kwargs)
    308     def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
    309         super().__init__(dl_type=dl_type)
--> 310         self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
    311         self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
    312 

~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastai/data/core.py in <listcomp>(.0)
    308     def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
    309         super().__init__(dl_type=dl_type)
--> 310         self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
    311         self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
    312 

~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastcore/foundation.py in __call__(cls, x, *args, **kwargs)
     95     def __call__(cls, x=None, *args, **kwargs):
     96         if not args and not kwargs and x is not None and isinstance(x,cls): return x
---> 97         return super().__call__(x, *args, **kwargs)
     98 
     99 # Cell

~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastai/data/core.py in __init__(self, items, tfms, use_list, do_setup, split_idx, train_setup, splits, types, verbose, dl_type)
    234         if do_setup:
    235             pv(f"Setting up {self.tfms}", verbose)
--> 236             self.setup(train_setup=train_setup)
    237 
    238     def _new(self, items, split_idx=None, **kwargs):

~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastai/data/core.py in setup(self, train_setup)
    256             for f in self.tfms.fs:
    257                 self.types.append(getattr(f, 'input_types', type(x)))
--> 258                 x = f(x)
    259             self.types.append(type(x))
    260         types = L(t if is_listy(t) else [t] for t in self.types).concat().unique()

~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastcore/transform.py in __call__(self, x, **kwargs)
     71     @property
     72     def name(self): return getattr(self, '_name', _get_name(self))
---> 73     def __call__(self, x, **kwargs): return self._call('encodes', x, **kwargs)
     74     def decode  (self, x, **kwargs): return self._call('decodes', x, **kwargs)
     75     def __repr__(self): return f'{self.name}:\nencodes: {self.encodes}decodes: {self.decodes}'

~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastcore/transform.py in _call(self, fn, x, split_idx, **kwargs)
     81     def _call(self, fn, x, split_idx=None, **kwargs):
     82         if split_idx!=self.split_idx and self.split_idx is not None: return x
---> 83         return self._do_call(getattr(self, fn), x, **kwargs)
     84 
     85     def _do_call(self, f, x, **kwargs):

~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastcore/transform.py in _do_call(self, f, x, **kwargs)
     87             if f is None: return x
     88             ret = f.returns(x) if hasattr(f,'returns') else None
---> 89             return retain_type(f(x, **kwargs), x, ret)
     90         res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
     91         return retain_type(res, x)

~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastcore/dispatch.py in __call__(self, *args, **kwargs)
    116         elif self.inst is not None: f = MethodType(f, self.inst)
    117         elif self.owner is not None: f = MethodType(f, self.owner)
--> 118         return f(*args, **kwargs)
    119 
    120     def __get__(self, inst, owner):

<ipython-input-17-72ee3c98b27d> in encodes(self, x)
      1 class CodeTokenizer(Transform):
----> 2     def encodes(self, x): return x.split(' ')

AttributeError: 'PosixPath' object has no attribute 'split'

My data setup is that I have a folder full of Lisp code in data/lisp where each Lisp file has a .txt extension.

TxtFromFile()(files[0])

Calling this transform on a file directly gives me what I want. (The contents of the file as a string which looks like this: ‘’;;;-- Mode: LISP; Package: :SAPA; Syntax: COMMON-LISP --\n;-----------------------------------------------------------------------------\n; © 1993, Donald B. Percival dbp@apl.washington.edu\n;\n; This code is licensed under the terms of the modified BSD license\n; (“sans advertising clause”). See the file COPYING for details.\n;\n; Comments about this software can be addressed to dbp@apl.washington.edu\n;----------------------------------------’)

Replacing tfms=tfms with [tfms] fixed the PosixPath error — Datasets expects a list of transform pipelines, so passing the flat list made it treat each transform as its own independent pipeline over the raw paths.

dsets = Datasets(files, [tfms], splits=splits, dl_type=LMDataLoader)

However, I’m now getting AttributeError: 'list' object has no attribute 'truncate' when I call dls.show_batch(max_n=2).

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-11-085b1158bedd> in <module>
----> 1 dls.show_batch(max_n=2)

~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastai/data/core.py in show_batch(self, b, max_n, ctxs, show, unique, **kwargs)
    100         if b is None: b = self.one_batch()
    101         if not show: return self._pre_show_batch(b, max_n=max_n)
--> 102         show_batch(*self._pre_show_batch(b, max_n=max_n), ctxs=ctxs, max_n=max_n, **kwargs)
    103         if unique: self.get_idxs = old_get_idxs
    104 

~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastcore/dispatch.py in __call__(self, *args, **kwargs)
    116         elif self.inst is not None: f = MethodType(f, self.inst)
    117         elif self.owner is not None: f = MethodType(f, self.owner)
--> 118         return f(*args, **kwargs)
    119 
    120     def __get__(self, inst, owner):

~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastai/text/data.py in show_batch(x, y, samples, ctxs, max_n, trunc_at, **kwargs)
    117 @typedispatch
    118 def show_batch(x: LMTensorText, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):
--> 119     samples = L((s[0].truncate(trunc_at), s[1].truncate(trunc_at)) for s in samples)
    120     return show_batch[TensorText](x, None, samples, ctxs=ctxs, max_n=max_n, trunc_at=None, **kwargs)
    121 

~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastcore/foundation.py in __call__(cls, x, *args, **kwargs)
     95     def __call__(cls, x=None, *args, **kwargs):
     96         if not args and not kwargs and x is not None and isinstance(x,cls): return x
---> 97         return super().__call__(x, *args, **kwargs)
     98 
     99 # Cell

~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastcore/foundation.py in __init__(self, items, use_list, match, *rest)
    103     def __init__(self, items=None, *rest, use_list=False, match=None):
    104         if (use_list is not None) or not is_array(items):
--> 105             items = listify(items, *rest, use_list=use_list, match=match)
    106         super().__init__(items)
    107 

~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastcore/basics.py in listify(o, use_list, match, *rest)
     54     elif isinstance(o, list): res = o
     55     elif isinstance(o, str) or is_array(o): res = [o]
---> 56     elif is_iter(o): res = list(o)
     57     else: res = [o]
     58     if match is not None:

~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastai/text/data.py in <genexpr>(.0)
    117 @typedispatch
    118 def show_batch(x: LMTensorText, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):
--> 119     samples = L((s[0].truncate(trunc_at), s[1].truncate(trunc_at)) for s in samples)
    120     return show_batch[TensorText](x, None, samples, ctxs=ctxs, max_n=max_n, trunc_at=None, **kwargs)
    121 

~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastcore/basics.py in __getattr__(self, k)
    386         if self._component_attr_filter(k):
    387             attr = getattr(self,self._default,None)
--> 388             if attr is not None: return getattr(attr,k)
    389         raise AttributeError(k)
    390     def __dir__(self): return custom_dir(self,self._dir())

AttributeError: 'list' object has no attribute 'truncate'

Looks like I need to pass a Tokenizer instance, with the tokenization behaviour specified via the tok argument.

tfms = [TxtFromFile, Tokenizer(tok=BaseTokenizer()), Numericalize]

On to the next error!

I am getting a similar error when trying to create a multi-label DataBlock on the toxic comments dataset.

AttributeError: ‘list’ object has no attribute ‘truncate’

# Multi-label targets: select the six toxicity indicator columns of a row.
def get_y(r): return r[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]
# NOTE(review): is_lm=True builds language-model batches (LMTensorText),
# whose show_batch path calls .truncate on both x and y (see traceback) —
# presumably the multi-label y is a plain list there, hence the same
# truncate error. For a classifier over these labels, is_lm should
# probably be left False (the default) — TODO confirm.
block = DataBlock(
  blocks=(TextBlock.from_df("comment_text", is_lm=True), MultiCategoryBlock),
  get_x=ColReader("text"),
  get_y=get_y,
  splitter=RandomSplitter(.1)
)

dls = block.dataloaders(train, bs=64)
dls.show_batch()

I have also tried using ColReader for get_y

ColReader(["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])

This is how I import my data:

And then I do a train-test split:

from sklearn.model_selection import train_test_split

# NOTE(review): no random_state is set, so this split differs on every run;
# pass random_state=<int> for a reproducible train/val partition.
train, val = train_test_split(df, test_size=0.05)
train.shape, val.shape

Any idea how I can change my DataBlock to avoid this error?