Version: fastai==2.3.0
I am new to fastai, and I am trying to get predictions on some text data with a previously trained model. When I tokenize and run inference on a pandas DataFrame containing multiple records with different texts, the inference runs fine. However, if I run it with only one sample, with multiple samples that all contain the exact same text, or with multiple different but short samples, I get an error from `get_preds()` that I believe arises when fastai tries to iterate over my data. I have tried changing the batch size in my dataloader to see if that solves the problem, to no avail. Here is the code I am using, with a batch size (`bs`) of 1 set in my dataloader:
from fastai.text.all import *
learner = load_learner(model_path, cpu=False)
# Tokenize DataFrame
tok_inf_df = tokenize_df(test, "response")
# Get tokenized part (not Counter of subwords)
inf_dl = learner.dls.test_dl(tok_inf_df[0], num_workers=0, bs=1)
all_predictions = learner.get_preds(dl=inf_dl, reorder=False)
all_predictions
When I run this code with the following `test` data, containing two different samples, it runs fine:
test = pd.DataFrame([[100, 'This will start the debugging session immediately when you run the cell. What that means is that you would want to use some of the commands that pdb supports to step into the code and examine the function or variables as needed.'],
[200, 'In this chapter we learned of several different methods that you can use to debug the code in your Jupyter Notebook. I personally prefer to use Pythons pdb module, but you can use the IPython.core.debugger to get the same functionality and it could be better if you prefer to have syntax highlighting.']], columns=['response_index', 'response'])
# Tokenize DataFrame
tok_inf_df = tokenize_df(test, "response")
# Get tokenized part (not Counter of subwords)
inf_dl = learner.dls.test_dl(tok_inf_df[0], num_workers=0, bs=1)
all_predictions = learner.get_preds(dl=inf_dl, reorder=False)
all_predictions
Out[24]: (tensor([[6.5851e-02, 2.1294e-03, 5.7879e-04, 2.3800e-01, 6.3660e-01],
[8.2862e-02, 3.3055e-03, 1.6065e-03, 1.7200e-01, 7.2442e-01]]),
None)
However, in all of the following cases, I get an error. Here are the `test` data cases I am trying:
One text:
test = pd.DataFrame([[100, 'This will start the debugging session immediately when you run the cell. What that means is that you would want to use some of the commands that pdb supports to step into the code and examine the function or variables as needed.']], columns=['response_index', 'response'])
Two similar texts:
test = pd.DataFrame([[100, 'This will start the debugging session immediately when you run the cell. What that means is that you would want to use some of the commands that pdb supports to step into the code and examine the function or variables as needed.'],
[200, 'This will start the debugging session immediately when you run the cell. What that means is that you would want to use some of the commands that pdb supports to step into the code and examine the function or variables as needed.']], columns=['response_index', 'response'])
Three similar texts:
test = pd.DataFrame([[100, 'This will start the debugging session immediately when you run the cell. What that means is that you would want to use some of the commands that pdb supports to step into the code and examine the function or variables as needed.'],
[200, 'This will start the debugging session immediately when you run the cell. What that means is that you would want to use some of the commands that pdb supports to step into the code and examine the function or variables as needed.'],
[300, 'This will start the debugging session immediately when you run the cell. What that means is that you would want to use some of the commands that pdb supports to step into the code and examine the function or variables as needed.']], columns=['response_index', 'response'])
Two different, but short texts:
test = pd.DataFrame([[100, 'go away'],
[200, 'come here']], columns=['response_index', 'response'])
And here is the full stack trace of the error:
------------------------------------| 0.00% [0/2 00:00<?]
ValueError: max() arg is an empty sequence
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<command-4247368361270238> in <module>
----> 1 all_predictions = learner.get_preds(dl=inf_dl, reorder=False)
2 all_predictions
/local_disk0/.ephemeral_nfs/envs/pythonEnv-fb8777ce-4e3a-4b30-a336-9afb5ab86d92/lib/python3.8/site-packages/fastai/learner.py in get_preds(self, ds_idx, dl, with_input, with_decoded, with_loss, act, inner, reorder, cbs, **kwargs)
242 if with_loss: ctx_mgrs.append(self.loss_not_reduced())
243 with ContextManagers(ctx_mgrs):
--> 244 self._do_epoch_validate(dl=dl)
245 if act is None: act = getattr(self.loss_func, 'activation', noop)
246 res = cb.all_tensors()
/local_disk0/.ephemeral_nfs/envs/pythonEnv-fb8777ce-4e3a-4b30-a336-9afb5ab86d92/lib/python3.8/site-packages/fastai/learner.py in _do_epoch_validate(self, ds_idx, dl)
192 if dl is None: dl = self.dls[ds_idx]
193 self.dl = dl
--> 194 with torch.no_grad(): self._with_events(self.all_batches, 'validate', CancelValidException)
195
196 def _do_epoch(self):
/local_disk0/.ephemeral_nfs/envs/pythonEnv-fb8777ce-4e3a-4b30-a336-9afb5ab86d92/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
158
159 def _with_events(self, f, event_type, ex, final=noop):
--> 160 try: self(f'before_{event_type}'); f()
161 except ex: self(f'after_cancel_{event_type}')
162 self(f'after_{event_type}'); final()
/local_disk0/.ephemeral_nfs/envs/pythonEnv-fb8777ce-4e3a-4b30-a336-9afb5ab86d92/lib/python3.8/site-packages/fastai/learner.py in all_batches(self)
164 def all_batches(self):
165 self.n_iter = len(self.dl)
--> 166 for o in enumerate(self.dl): self.one_batch(*o)
167
168 def _do_one_batch(self):
/local_disk0/.ephemeral_nfs/envs/pythonEnv-fb8777ce-4e3a-4b30-a336-9afb5ab86d92/lib/python3.8/site-packages/fastai/data/load.py in __iter__(self)
107 self.before_iter()
108 self.__idxs=self.get_idxs() # called in context of main process (not workers/subprocesses)
--> 109 for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
110 # fix issue 2899. If the process start method isn't fork, the data will be copied to cuda in learner one_batch.
111 if self.device is not None and multiprocessing.get_start_method().lower() == "fork":
/local_disk0/.ephemeral_nfs/envs/pythonEnv-fb8777ce-4e3a-4b30-a336-9afb5ab86d92/lib/python3.8/site-packages/torch/utils/data/dataloader.py in __next__(self)
433 if self._sampler_iter is None:
434 self._reset()
--> 435 data = self._next_data()
436 self._num_yielded += 1
437 if self._dataset_kind == _DatasetKind.Iterable and \
/local_disk0/.ephemeral_nfs/envs/pythonEnv-fb8777ce-4e3a-4b30-a336-9afb5ab86d92/lib/python3.8/site-packages/torch/utils/data/dataloader.py in _next_data(self)
473 def _next_data(self):
474 index = self._next_index() # may raise StopIteration
--> 475 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
476 if self._pin_memory:
477 data = _utils.pin_memory.pin_memory(data)
/local_disk0/.ephemeral_nfs/envs/pythonEnv-fb8777ce-4e3a-4b30-a336-9afb5ab86d92/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
32 raise StopIteration
33 else:
---> 34 data = next(self.dataset_iter)
35 return self.collate_fn(data)
36
/local_disk0/.ephemeral_nfs/envs/pythonEnv-fb8777ce-4e3a-4b30-a336-9afb5ab86d92/lib/python3.8/site-packages/fastai/data/load.py in create_batches(self, samps)
118 if self.dataset is not None: self.it = iter(self.dataset)
119 res = filter(lambda o:o is not None, map(self.do_item, samps))
--> 120 yield from map(self.do_batch, self.chunkify(res))
121
122 def new(self, dataset=None, cls=None, **kwargs):
/local_disk0/.ephemeral_nfs/envs/pythonEnv-fb8777ce-4e3a-4b30-a336-9afb5ab86d92/lib/python3.8/site-packages/fastai/data/load.py in do_batch(self, b)
144 else: raise IndexError("Cannot index an iterable dataset numerically - must use `None`.")
145 def create_batch(self, b): return (fa_collate,fa_convert)[self.prebatched](b)
--> 146 def do_batch(self, b): return self.retain(self.create_batch(self.before_batch(b)), b)
147 def to(self, device): self.device = device
148 def one_batch(self):
/local_disk0/.ephemeral_nfs/envs/pythonEnv-fb8777ce-4e3a-4b30-a336-9afb5ab86d92/lib/python3.8/site-packages/fastcore/transform.py in __call__(self, o)
198 self.fs = self.fs.sorted(key='order')
199
--> 200 def __call__(self, o): return compose_tfms(o, tfms=self.fs, split_idx=self.split_idx)
201 def __repr__(self): return f"Pipeline: {' -> '.join([f.name for f in self.fs if f.name != 'noop'])}"
202 def __getitem__(self,i): return self.fs[i]
/local_disk0/.ephemeral_nfs/envs/pythonEnv-fb8777ce-4e3a-4b30-a336-9afb5ab86d92/lib/python3.8/site-packages/fastcore/transform.py in compose_tfms(x, tfms, is_enc, reverse, **kwargs)
148 for f in tfms:
149 if not is_enc: f = f.decode
--> 150 x = f(x, **kwargs)
151 return x
152
/local_disk0/.ephemeral_nfs/envs/pythonEnv-fb8777ce-4e3a-4b30-a336-9afb5ab86d92/lib/python3.8/site-packages/fastai/text/data.py in __call__(self, b, **kwargs)
170 self.max_len = max([x.shape[0] for xs in b for x in xs if isinstance(x,TensorText)])
171 def __call__(self, b, **kwargs):
--> 172 self.before_call(b)
173 return super().__call__(tuple(b), **kwargs)
174 def encodes(self, x:TensorText):
/local_disk0/.ephemeral_nfs/envs/pythonEnv-fb8777ce-4e3a-4b30-a336-9afb5ab86d92/lib/python3.8/site-packages/fastai/text/data.py in before_call(self, b)
168 def before_call(self, b):
169 "Set `self.max_len` before encodes"
--> 170 self.max_len = max([x.shape[0] for xs in b for x in xs if isinstance(x,TensorText)])
171 def __call__(self, b, **kwargs):
172 self.before_call(b)
ValueError: max() arg is an empty sequence
I'm not sure where to go from here because I am unable to debug further in my environment. Is this a problem with my model, or something else?