I’m working on a model to output fake code samples. As part of that, I’ll need to use some custom transforms to tokenize the code (eventually, I’d like to use a compiler tokenizer but I’m just prototyping for now).
I’m running into issues in getting my transform pipeline to work. Does anyone know why this code might be giving me this error: AttributeError: 'PosixPath' object has no attribute 'split'
:
path = Path('data/lisp')
files = get_text_files(path)
bs = 70
sl = 16
cut = int(len(files)*0.8)
splits = [list(range(cut)), list(range(cut,len(files)))]
class TxtFromFile(Transform):
def encodes(self, x):
with open(x) as f: return f.read()
class CodeTokenizer(Transform):
def encodes(self, x): return x.split(' ')
tfms = [TxtFromFile, CodeTokenizer, Numericalize]
dsets = Datasets(files, tfms=tfms, splits=splits, dl_type=LMDataLoader)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-19-0b2b1949621f> in <module>
----> 1 dsets = Datasets(files, tfms=tfms, splits=splits, dl_type=LMDataLoader)
~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastai/data/core.py in __init__(self, items, tfms, tls, n_inp, dl_type, **kwargs)
308 def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
309 super().__init__(dl_type=dl_type)
--> 310 self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
311 self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
312
~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastai/data/core.py in <listcomp>(.0)
308 def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
309 super().__init__(dl_type=dl_type)
--> 310 self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
311 self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
312
~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastcore/foundation.py in __call__(cls, x, *args, **kwargs)
95 def __call__(cls, x=None, *args, **kwargs):
96 if not args and not kwargs and x is not None and isinstance(x,cls): return x
---> 97 return super().__call__(x, *args, **kwargs)
98
99 # Cell
~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastai/data/core.py in __init__(self, items, tfms, use_list, do_setup, split_idx, train_setup, splits, types, verbose, dl_type)
234 if do_setup:
235 pv(f"Setting up {self.tfms}", verbose)
--> 236 self.setup(train_setup=train_setup)
237
238 def _new(self, items, split_idx=None, **kwargs):
~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastai/data/core.py in setup(self, train_setup)
256 for f in self.tfms.fs:
257 self.types.append(getattr(f, 'input_types', type(x)))
--> 258 x = f(x)
259 self.types.append(type(x))
260 types = L(t if is_listy(t) else [t] for t in self.types).concat().unique()
~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastcore/transform.py in __call__(self, x, **kwargs)
71 @property
72 def name(self): return getattr(self, '_name', _get_name(self))
---> 73 def __call__(self, x, **kwargs): return self._call('encodes', x, **kwargs)
74 def decode (self, x, **kwargs): return self._call('decodes', x, **kwargs)
75 def __repr__(self): return f'{self.name}:\nencodes: {self.encodes}decodes: {self.decodes}'
~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastcore/transform.py in _call(self, fn, x, split_idx, **kwargs)
81 def _call(self, fn, x, split_idx=None, **kwargs):
82 if split_idx!=self.split_idx and self.split_idx is not None: return x
---> 83 return self._do_call(getattr(self, fn), x, **kwargs)
84
85 def _do_call(self, f, x, **kwargs):
~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastcore/transform.py in _do_call(self, f, x, **kwargs)
87 if f is None: return x
88 ret = f.returns(x) if hasattr(f,'returns') else None
---> 89 return retain_type(f(x, **kwargs), x, ret)
90 res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
91 return retain_type(res, x)
~/proj/typeperf-models/venv/lib/python3.8/site-packages/fastcore/dispatch.py in __call__(self, *args, **kwargs)
116 elif self.inst is not None: f = MethodType(f, self.inst)
117 elif self.owner is not None: f = MethodType(f, self.owner)
--> 118 return f(*args, **kwargs)
119
120 def __get__(self, inst, owner):
<ipython-input-17-72ee3c98b27d> in encodes(self, x)
1 class CodeTokenizer(Transform):
----> 2 def encodes(self, x): return x.split(' ')
AttributeError: 'PosixPath' object has no attribute 'split'
My data setup is that I have a folder full of list code in data/lisp where each lisp file has a .txt extension.