If I change `res_col_name`, does caching still happen? I'm guessing not, because `ColReader`
will still read from the non-tokenized column. Example on IMDB_SAMPLE:
splits = ColReader('is_valid')(df)
dset = Datasets(df, tfms=[[ColReader('text'), Tokenizer.from_df('text', res_col_name='tokens')]])
This currently throws an error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-5-444f1088f8d5> in <module>
----> 1 dset = Datasets(df, tfms=[[ColReader('text'), Tokenizer.from_df('text', res_col_name='tokens')]])
~/git/fastai2/fastai2/data/core.py in __init__(self, items, tfms, tls, n_inp, dl_type, **kwargs)
282 def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
283 super().__init__(dl_type=dl_type)
--> 284 self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
285 self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
286
~/git/fastai2/fastai2/data/core.py in <listcomp>(.0)
282 def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
283 super().__init__(dl_type=dl_type)
--> 284 self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
285 self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
286
~/git/fastcore/fastcore/foundation.py in __call__(cls, x, *args, **kwargs)
45 return x
46
---> 47 res = super().__call__(*((x,) + args), **kwargs)
48 res._newchk = 0
49 return res
~/git/fastai2/fastai2/data/core.py in __init__(self, items, tfms, use_list, do_setup, split_idx, train_setup, splits, types, verbose)
220 if do_setup:
221 pv(f"Setting up {self.tfms}", verbose)
--> 222 self.setup(train_setup=train_setup)
223
224 def _new(self, items, split_idx=None, **kwargs):
~/git/fastai2/fastai2/data/core.py in setup(self, train_setup)
242 for f in self.tfms.fs:
243 self.types.append(getattr(f, 'input_types', type(x)))
--> 244 x = f(x)
245 self.types.append(type(x))
246 types = L(t if is_listy(t) else [t] for t in self.types).concat().unique()
~/git/fastcore/fastcore/transform.py in __call__(self, x, **kwargs)
70 @property
71 def name(self): return getattr(self, '_name', _get_name(self))
---> 72 def __call__(self, x, **kwargs): return self._call('encodes', x, **kwargs)
73 def decode (self, x, **kwargs): return self._call('decodes', x, **kwargs)
74 def __repr__(self): return f'{self.name}: {self.encodes} {self.decodes}'
~/git/fastcore/fastcore/transform.py in _call(self, fn, x, split_idx, **kwargs)
80 def _call(self, fn, x, split_idx=None, **kwargs):
81 if split_idx!=self.split_idx and self.split_idx is not None: return x
---> 82 return self._do_call(getattr(self, fn), x, **kwargs)
83
84 def _do_call(self, f, x, **kwargs):
~/git/fastcore/fastcore/transform.py in _do_call(self, f, x, **kwargs)
84 def _do_call(self, f, x, **kwargs):
85 if not _is_tuple(x):
---> 86 return x if f is None else retain_type(f(x, **kwargs), x, f.returns_none(x))
87 res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
88 return retain_type(res, x)
~/git/fastcore/fastcore/dispatch.py in __call__(self, *args, **kwargs)
96 if not f: return args[0]
97 if self.inst is not None: f = MethodType(f, self.inst)
---> 98 return f(*args, **kwargs)
99
100 def __get__(self, inst, owner):
~/git/fastai2/fastai2/data/transforms.py in __call__(self, o, **kwargs)
197
198 def __call__(self, o, **kwargs):
--> 199 if len(self.cols) == 1: return self._do_one(o, self.cols[0])
200 return L(self._do_one(o, c) for c in self.cols)
201
~/git/fastai2/fastai2/data/transforms.py in _do_one(self, r, c)
191
192 def _do_one(self, r, c):
--> 193 o = r[c] if isinstance(c, int) else getattr(r, c)
194 if len(self.pref)==0 and len(self.suff)==0 and self.label_delim is None: return o
195 if self.label_delim is None: return f'{self.pref}{o}{self.suff}'
~/anaconda3/envs/dl/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name)
5177 if self._info_axis._can_hold_identifiers_and_holds_name(name):
5178 return self[name]
-> 5179 return object.__getattribute__(self, name)
5180
5181 def __setattr__(self, name, value):
AttributeError: 'Series' object has no attribute 'text'
The problem happens inside `tokenize_df`:
other_cols = df.columns[~df.columns.isin(text_cols)]
res = df[other_cols].copy()
res[res_col_name] = outputs
I think we don't need to remove `text_cols`; we can just override it if needed. We could try something like:
other_cols = text_cols.filter(lambda o: o != res_col_name)
other_cols = df.columns[~df.columns.isin(other_cols)]
res = df[other_cols].copy()
res[res_col_name] = outputs
This would fix the bug, but caching will still not happen when `res_col_name` is changed.