I have a dataset of text sequences. I would like to perform batch prediction on a test dataset.
- I am following along with this notebook to train my language model and text classifier.
- I am also following the answer from this post for batch prediction.
First, I define the tokenizer:
bpe_tokenizer = SentencePieceTokenizer(
lang=None,
sp_model=f"{MODELS_DIR}/spm.model",
vocab_sz=<REDACTED>,
model_type="bpe",
cache_dir=MODELS_DIR,
)
Next, I train the language model:
language_model_text_block = TextBlock.from_df(
text_cols=[SEQUENCE_COL],
is_lm=True,
tok=bpe_tokenizer,
n_workers=8,
)
language_model_data_block = DataBlock(
blocks=language_model_text_block,
get_x=ColReader("text"),
)
language_model_dataloaders = language_model_data_block.dataloaders(
language_model_data,
bs=128,
seq_len=80,
)
lstm_lm_configuration = awd_lstm_lm_config.copy()
lstm_lm_configuration["emb_sz"] = <REDACTED>
lstm_lm_configuration["n_hid"] = <REDACTED>
lstm_lm_configuration["n_layers"] = <REDACTED>
lstm_lm_configuration["bidir"] = True
language_model = language_model_learner(
dls=language_model_dataloaders,
arch=AWD_LSTM,
config=lstm_lm_configuration,
pretrained=False,
metrics=[accuracy, Perplexity()],
model_dir=MODELS_DIR,
).to_fp16()
This trains successfully. I saved the encoder.
Next, I load the encoder from the language model and train the text classifier:
classifier_text_block = TextBlock.from_df(
text_cols=[SEQUENCE_COL],
vocab=language_model_dataloaders.vocab,
tok=bpe_tokenizer,
n_workers=8,
)
classifier_data_block = DataBlock(
blocks=(classifier_text_block, CategoryBlock),
get_x=ColReader("text"),
get_y=ColReader(TARGET_COL),
splitter=TrainTestSplitter(test_size=0.2, stratify=training_data[TARGET_COL]),
)
classifier_dataloaders = classifier_data_block.dataloaders(
training_data,
bs=128,
seq_len=80
)
lstm_clas_configuration = awd_lstm_clas_config.copy()
lstm_clas_configuration["emb_sz"] = <REDACTED>
lstm_clas_configuration["n_hid"] = <REDACTED>
lstm_clas_configuration["n_layers"] = <REDACTED>
lstm_clas_configuration["bidir"] = True
classifier = text_classifier_learner(
dls=classifier_dataloaders,
arch=AWD_LSTM,
seq_len=80,
config=lstm_clas_configuration,
pretrained=False,
metrics=[accuracy],
model_dir=MODELS_DIR,
).to_fp16()
This also trains successfully. I saved the model.
Finally, I attempt to perform batch prediction:
test_dataloaders = classifier.dls.test_dl(test_dataframe)
probabilities, targets = classifier.get_preds(dl=test_dataloaders)
But this fails with the following stack trace:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-34-c9fec5620b4f> in <module>
----> 1 test_dataloaders = classifier.dls.test_dl(test_dataframe)
2 probabilities, targets = classifier.get_preds(dl=test_dataloaders)
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/fastai/data/core.py in test_dl(self, test_items, rm_type_tfms, with_labels, **kwargs)
381 test_ds = test_set(self.valid_ds, test_items, rm_tfms=rm_type_tfms, with_labels=with_labels
382 ) if isinstance(self.valid_ds, (Datasets, TfmdLists)) else test_items
--> 383 return self.valid.new(test_ds, **kwargs)
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/fastai/text/data.py in new(self, dataset, **kwargs)
186 if 'val_res' in kwargs and kwargs['val_res'] is not None: res = kwargs['val_res']
187 else: res = self.res if dataset is None else None
--> 188 return super().new(dataset=dataset, res=res, **kwargs)
189
190 # Cell
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/fastai/data/core.py in new(self, dataset, cls, **kwargs)
62 @delegates(DataLoader.new)
63 def new(self, dataset=None, cls=None, **kwargs):
---> 64 res = super().new(dataset, cls, do_setup=False, **kwargs)
65 if not hasattr(self, '_n_inp') or not hasattr(self, '_types'):
66 try:
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/fastai/data/load.py in new(self, dataset, cls, **kwargs)
118 bs=self.bs, shuffle=self.shuffle, drop_last=self.drop_last, indexed=self.indexed, device=self.device)
119 for n in self._methods: cur_kwargs[n] = getattr(self, n)
--> 120 return cls(**merge(cur_kwargs, kwargs))
121
122 @property
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/fastai/text/data.py in __init__(self, dataset, sort_func, res, **kwargs)
159 self.sort_func = _default_sort if sort_func is None else sort_func
160 if res is None and self.sort_func == _default_sort: res = _get_lengths(dataset)
--> 161 self.res = [self.sort_func(self.do_item(i)) for i in range_of(self.dataset)] if res is None else res
162 if len(self.res) > 0: self.idx_max = np.argmax(self.res)
163
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/fastai/text/data.py in <listcomp>(.0)
159 self.sort_func = _default_sort if sort_func is None else sort_func
160 if res is None and self.sort_func == _default_sort: res = _get_lengths(dataset)
--> 161 self.res = [self.sort_func(self.do_item(i)) for i in range_of(self.dataset)] if res is None else res
162 if len(self.res) > 0: self.idx_max = np.argmax(self.res)
163
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/fastai/data/load.py in do_item(self, s)
123 def prebatched(self): return self.bs is None
124 def do_item(self, s):
--> 125 try: return self.after_item(self.create_item(s))
126 except SkipItemException: return None
127 def chunkify(self, b): return b if self.prebatched else chunked(b, self.bs, self.drop_last)
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/fastai/data/load.py in create_item(self, s)
129 def randomize(self): self.rng = random.Random(self.rng.randint(0,2**32-1))
130 def retain(self, res, b): return retain_types(res, b[0] if is_listy(b) else b)
--> 131 def create_item(self, s): return next(self.it) if s is None else self.dataset[s]
132 def create_batch(self, b): return (fa_collate,fa_convert)[self.prebatched](b)
133 def do_batch(self, b): return self.retain(self.create_batch(self.before_batch(b)), b)
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/fastai/data/core.py in __getitem__(self, it)
312
313 def __getitem__(self, it):
--> 314 res = tuple([tl[it] for tl in self.tls])
315 return res if is_indexer(it) else list(zip(*res))
316
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/fastai/data/core.py in <listcomp>(.0)
312
313 def __getitem__(self, it):
--> 314 res = tuple([tl[it] for tl in self.tls])
315 return res if is_indexer(it) else list(zip(*res))
316
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/fastai/data/core.py in __getitem__(self, idx)
278 res = super().__getitem__(idx)
279 if self._after_item is None: return res
--> 280 return self._after_item(res) if is_indexer(idx) else res.map(self._after_item)
281
282 # Cell
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/fastai/data/core.py in _after_item(self, o)
240 return super()._new(items, tfms=self.tfms, do_setup=False, types=self.types, split_idx=split_idx, **kwargs)
241 def subset(self, i): return self._new(self._get(self.splits[i]), split_idx=i)
--> 242 def _after_item(self, o): return self.tfms(o)
243 def __repr__(self): return f"{self.__class__.__name__}: {self.items}\ntfms - {self.tfms.fs}"
244 def __iter__(self): return (self[i] for i in range(len(self)))
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/fastcore/transform.py in __call__(self, o)
195 self.fs.append(t)
196
--> 197 def __call__(self, o): return compose_tfms(o, tfms=self.fs, split_idx=self.split_idx)
198 def __repr__(self): return f"Pipeline: {' -> '.join([f.name for f in self.fs if f.name != 'noop'])}"
199 def __getitem__(self,i): return self.fs[i]
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/fastcore/transform.py in compose_tfms(x, tfms, is_enc, reverse, **kwargs)
147 for f in tfms:
148 if not is_enc: f = f.decode
--> 149 x = f(x, **kwargs)
150 return x
151
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/fastai/data/transforms.py in __call__(self, o, **kwargs)
197
198 def __call__(self, o, **kwargs):
--> 199 if len(self.cols) == 1: return self._do_one(o, self.cols[0])
200 return L(self._do_one(o, c) for c in self.cols)
201
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/fastai/data/transforms.py in _do_one(self, r, c)
191
192 def _do_one(self, r, c):
--> 193 o = r[c] if isinstance(c, int) else r[c] if c=='name' else getattr(r, c)
194 if len(self.pref)==0 and len(self.suff)==0 and self.label_delim is None: return o
195 if self.label_delim is None: return f'{self.pref}{o}{self.suff}'
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/pandas/core/generic.py in __getattr__(self, name)
5272 if self._info_axis._can_hold_identifiers_and_holds_name(name):
5273 return self[name]
-> 5274 return object.__getattribute__(self, name)
5275
5276 def __setattr__(self, name: str, value) -> None:
AttributeError: 'Series' object has no attribute 'text'
Based on the error, it seems like the test dataset is not being tokenized, so the text
column isn't being created. That's why it isn't found. (Is this assumption correct?)
Given the above assumption, I also tried this approach to tokenize the test data when creating test_dataloaders:
test_data_block = DataBlock(
blocks=classifier_text_block,
get_x=ColReader("text"),
)
test_dataloaders = test_data_block.dataloaders(
test_dataframe,
bs=128,
seq_len=80
)
probabilities, targets = classifier.get_preds(dl=test_dataloaders)
But this failed with the following stack trace:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-23-dcb0957505de> in <module>
----> 1 probabilities, targets = classifier.get_preds(dl=test_dataloaders)
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/fastai/learner.py in get_preds(self, ds_idx, dl, with_input, with_decoded, with_loss, act, inner, reorder, cbs, n_workers, **kwargs)
240 res[pred_i] = act(res[pred_i])
241 if with_decoded: res.insert(pred_i+2, getattr(self.loss_func, 'decodes', noop)(res[pred_i]))
--> 242 if reorder and hasattr(dl, 'get_idxs'): res = nested_reorder(res, tensor(idxs).argsort())
243 return tuple(res)
244 self._end_cleanup()
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/fastai/torch_core.py in tensor(x, *rest, **kwargs)
125 else _array2tensor(x) if isinstance(x, ndarray)
126 else as_tensor(x.values, **kwargs) if isinstance(x, (pd.Series, pd.DataFrame))
--> 127 else as_tensor(x, **kwargs) if hasattr(x, '__array__') or is_iter(x)
128 else _array2tensor(array(x), **kwargs))
129 if res.dtype is torch.float64: return res.float()
RuntimeError: Could not infer dtype of iterator
It's not clear to me what the correct approach is to perform batch prediction on my test dataset. Can anyone point me in the right direction? (@stefan-ai, any ideas?)