Hi Jay,
I’m following the lesson 10 steps with a big corpus too and ran into the same problem you did. What I did to solve it was to copy/paste fastai’s SpacyTokenizer into my notebook, change it to set nlp.max_length = 25_000_000 (by default spaCy refuses to process texts longer than 1,000,000 characters, which is what triggers the error on huge files), wrap it in a Tokenizer, and then manually create a TextBlock passing that Tokenizer object.
Here’s my code:
from fastai.text.all import *
import spacy
from spacy.symbols import ORTH
# copied from https://github.com/fastai/fastai/blob/ab0c2fe0d54895ddca27b91eb128b8599ba140d3/fastai/text/core.py#L113
class SpacyTokenizer25Mil():
    "Spacy tokenizer for `lang`"
    def __init__(self, lang='en', special_toks=None, buf_sz=5000):
        self.special_toks = ifnone(special_toks, defaults.text_spec_tok)
        nlp = spacy.blank(lang, disable=["parser", "tagger", "ner"])
        nlp.max_length = 25_000_000  # only change vs. fastai's original: raise spaCy's default 1,000,000-char limit
        for w in self.special_toks: nlp.tokenizer.add_special_case(w, [{ORTH: w}])
        self.pipe,self.buf_sz = nlp.pipe,buf_sz

    def __call__(self, items):
        return (L(doc).attrgot('text') for doc in self.pipe(map(str,items), batch_size=self.buf_sz))
tkn = Tokenizer(SpacyTokenizer25Mil())
get_files_func = partial(get_files, extensions=['.log'])
dls_lm = DataBlock(
    blocks=TextBlock(tok_tfm=tkn, is_lm=True, min_freq=3, max_vocab=60000),
    get_items=get_files_func, splitter=RandomSplitter(0.1)
).dataloaders(path, path=path, bs=128, seq_len=80)
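From there the usual lesson 10 steps apply. As a rough sketch (assuming the standard AWD_LSTM setup from the lesson; drop_mult and the learning rate are just the values from the book, not tuned for log data):

# Sketch only: standard fastai language-model fine-tuning on the dls_lm built above.
learn = language_model_learner(
    dls_lm, AWD_LSTM, drop_mult=0.3,
    metrics=[accuracy, Perplexity()]).to_fp16()
learn.fit_one_cycle(1, 2e-2)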
Hope it helps!