I’m trying to create a byte-pair encoding as follows:
tokenizer = SentencePieceTokenizer(
lang=None,
vocab_sz=1000,
model_type="bpe",
)
tokenized = tokenize_df(
df=training_dataframe,
text_cols=[SEQUENCE_COL],
tok=tokenizer,
)
But I’m getting the following error:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-7-8be0e8d810a7> in <module>
2 df=training_dataframe,
3 text_cols=[SEQUENCE_COL],
----> 4 tok=tokenizer,
5 )
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/text/core.py in tokenize_df(df, text_cols, n_workers, rules, mark_fields, tok, res_col_name)
224 res[res_col_name] = outputs
225 res[f'{res_col_name}_length'] = [len(o) for o in outputs]
--> 226 return res,Counter(outputs.concat())
227
228 # Cell
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/foundation.py in concat(self)
422 def map_zip(self, f, *args, cycled=False, **kwargs): return self.zip(cycled=cycled).starmap(f, *args, **kwargs)
423 def map_zipwith(self, f, *rest, cycled=False, **kwargs): return self.zipwith(*rest, cycled=cycled).starmap(f, **kwargs)
--> 424 def concat(self): return self._new(itertools.chain.from_iterable(self.map(L)))
425 def shuffle(self):
426 it = copy(self.items)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/foundation.py in _new(self, items, *args, **kwargs)
340 @property
341 def _xtra(self): return None
--> 342 def _new(self, items, *args, **kwargs): return type(self)(items, *args, use_list=None, **kwargs)
343 def __getitem__(self, idx): return self._get(idx) if is_indexer(idx) else L(self._get(idx), use_list=None)
344 def copy(self): return self._new(self.items.copy())
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/foundation.py in __call__(cls, x, *args, **kwargs)
49 return x
50
---> 51 res = super().__call__(*((x,) + args), **kwargs)
52 res._newchk = 0
53 return res
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/foundation.py in __init__(self, items, use_list, match, *rest)
331 if items is None: items = []
332 if (use_list is not None) or not _is_array(items):
--> 333 items = list(items) if use_list else _listify(items)
334 if match is not None:
335 if is_coll(match): match = len(match)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastcore/foundation.py in _listify(o)
244 if isinstance(o, list): return o
245 if isinstance(o, str) or _is_array(o): return [o]
--> 246 if is_iter(o): return list(o)
247 return [o]
248
MemoryError:
There is no additional message or information with the `MemoryError`. Is this simply an out-of-memory error? Do I need to run this on a machine that has more RAM, or is there some flag I need to set to avoid this error? Or could this be a bug in `fastcore`? I’d appreciate any guidance here!