Hi Jay,
I’m following the lesson 10 steps with a big corpus too and ran into the same problem you did. What I did to solve it was to copy/paste fastai’s SpacyTokenizer into my notebook, change it to set nlp.max_length = 25_000_000 (by default spaCy refuses to process texts longer than 1,000,000 characters, which is what triggers the error on huge files), wrap it in a Tokenizer, and then manually create a TextBlock passing that Tokenizer object.
Here’s my code:
from fastai.text.all import *
import spacy
from spacy.symbols import ORTH
# copied from https://github.com/fastai/fastai/blob/ab0c2fe0d54895ddca27b91eb128b8599ba140d3/fastai/text/core.py#L113
class SpacyTokenizer25Mil():
    "Spacy tokenizer for `lang`"
    def __init__(self, lang='en', special_toks=None, buf_sz=5000):
        self.special_toks = ifnone(special_toks, defaults.text_spec_tok)
        nlp = spacy.blank(lang, disable=["parser", "tagger", "ner"])
        nlp.max_length = 25_000_000  # only change vs. fastai's original: raise spaCy's default 1,000,000-char limit
        for w in self.special_toks: nlp.tokenizer.add_special_case(w, [{ORTH: w}])
        self.pipe,self.buf_sz = nlp.pipe,buf_sz

    def __call__(self, items):
        return (L(doc).attrgot('text') for doc in self.pipe(map(str,items), batch_size=self.buf_sz))
tkn = Tokenizer(SpacyTokenizer25Mil())
get_files_func = partial(get_files, extensions=['.log'])
dls_lm = DataBlock(
    blocks=TextBlock(tok_tfm=tkn, is_lm=True, min_freq=3, max_vocab=60000),
    get_items=get_files_func, splitter=RandomSplitter(0.1)
).dataloaders(path, path=path, bs=128, seq_len=80)
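From there the usual lesson 10 steps apply. As a rough sketch (assuming the standard AWD_LSTM setup from the lesson; drop_mult and the learning rate are just the values from the book, not tuned for log data):

# Sketch only: standard fastai language-model fine-tuning on the dls_lm built above.
learn = language_model_learner(
    dls_lm, AWD_LSTM, drop_mult=0.3,
    metrics=[accuracy, Perplexity()]).to_fp16()
learn.fit_one_cycle(1, 2e-2)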
Hope it helps!