SentencePieceTokenizer + fastai = Runtime error: [!sentences_.empty()]

Hi guys,

I’m trying to write a simple text generator (like Jeremy did in lesson 4), but using SentencePieceTokenizer. The problem is that while creating the DataLoaders I always get this error:

RuntimeError: Internal: src/trainer_interface.cc(429) [!sentences_.empty()]

(The full stack trace can be found at the end of this post.)

My code is:

from fastai.text.all import *

path = '/kaggle/input/nlp-test-5'    # contains a single folder called 'unsup'

files = partial(get_text_files, folders=['unsup'])

tokenizer = SentencePieceTokenizer(lang='en', special_toks=[])

dls_lm = DataBlock(
    blocks=TextBlock.from_folder('/kaggle/working', tok=tokenizer, is_lm=True),
    get_items=files, 
    splitter=RandomSplitter(0.1)
).dataloaders(path, path=path, bs=64, seq_len=80)

Unfortunately it prints this error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[8], line 11
      6 files = partial(get_text_files, folders=['unsup'])
      8 tokenizer = SentencePieceTokenizer(lang='en', special_toks=[])
     10 dls_lm = DataBlock(
---> 11     blocks=TextBlock.from_folder('/kaggle/working', tok=tokenizer, is_lm=True),
     12     get_items=files, 
     13     splitter=RandomSplitter(0.1)
     14 ).dataloaders(path, path=path, bs=64, seq_len=80)
     16 learn = language_model_learner(dls_lm, 
     17                                AWD_LSTM, 
     18                                pretrained=False, 
     19                                drop_mult=0.2, 
     20                                metrics=[accuracy],
     21                                pretrained_fnames=['model']).to_fp16()

File /opt/conda/lib/python3.10/site-packages/fastai/text/data.py:242, in TextBlock.from_folder(cls, path, vocab, is_lm, seq_len, backwards, min_freq, max_vocab, **kwargs)
    238 @classmethod
    239 @delegates(Tokenizer.from_folder, keep=True)
    240 def from_folder(cls, path, vocab=None, is_lm=False, seq_len=72, backwards=False, min_freq=3, max_vocab=60000, **kwargs):
    241     "Build a `TextBlock` from a `path`"
--> 242     return cls(Tokenizer.from_folder(path, **kwargs), vocab=vocab, is_lm=is_lm, seq_len=seq_len,
    243                backwards=backwards, min_freq=min_freq, max_vocab=max_vocab)

File /opt/conda/lib/python3.10/site-packages/fastai/text/core.py:281, in Tokenizer.from_folder(cls, path, tok, rules, **kwargs)
    279 path = Path(path)
    280 if tok is None: tok = WordTokenizer()
--> 281 output_dir = tokenize_folder(path, tok=tok, rules=rules, **kwargs)
    282 res = cls(tok, counter=load_pickle(output_dir/fn_counter_pkl),
    283           lengths=load_pickle(output_dir/fn_lengths_pkl), rules=rules, mode='folder')
    284 res.path,res.output_dir = path,output_dir

File /opt/conda/lib/python3.10/site-packages/fastai/text/core.py:186, in tokenize_folder(path, extensions, folders, output_dir, skip_if_exists, **kwargs)
    184 files = get_files(path, extensions=extensions, recurse=True, folders=folders)
    185 def _f(i,output_dir): return output_dir/files[i].relative_to(path)
--> 186 return _tokenize_files(_f, files, path, skip_if_exists=skip_if_exists, **kwargs)

File /opt/conda/lib/python3.10/site-packages/fastai/text/core.py:169, in _tokenize_files(func, files, path, output_dir, output_names, n_workers, rules, tok, encoding, skip_if_exists)
    166 rules = partial(Path.read_text, encoding=encoding) + L(ifnone(rules, defaults.text_proc_rules.copy()))
    168 lengths,counter = {},Counter()
--> 169 for i,tok in parallel_tokenize(files, tok, rules, n_workers=n_workers):
    170     out = func(i,output_dir)
    171     out.mk_write(' '.join(tok), encoding=encoding)

File /opt/conda/lib/python3.10/site-packages/fastai/text/core.py:150, in parallel_tokenize(items, tok, rules, n_workers, **kwargs)
    148 "Calls optional `setup` on `tok` before launching `TokenizeWithRules` using `parallel_gen"
    149 if tok is None: tok = WordTokenizer()
--> 150 if hasattr(tok, 'setup'): tok.setup(items, rules)
    151 return parallel_gen(TokenizeWithRules, items, tok=tok, rules=rules, n_workers=n_workers, **kwargs)

File /opt/conda/lib/python3.10/site-packages/fastai/text/core.py:369, in SentencePieceTokenizer.setup(self, items, rules)
    367     for t in progress_bar(maps(*rules, items), total=len(items), leave=False):
    368         f.write(f'{t}\n')
--> 369 sp_model = self.train(raw_text_path)
    370 self.tok = SentencePieceProcessor()
    371 self.tok.Load(str(sp_model))

File /opt/conda/lib/python3.10/site-packages/fastai/text/core.py:353, in SentencePieceTokenizer.train(self, raw_text_path)
    351 vocab_sz = self._get_vocab_sz(raw_text_path) if self.vocab_sz is None else self.vocab_sz
    352 spec_tokens = ['\u2581'+s for s in self.special_toks]
--> 353 SentencePieceTrainer.Train(" ".join([
    354     f"--input={raw_text_path} --vocab_size={vocab_sz} --model_prefix={self.cache_dir/'spm'}",
    355     f"--character_coverage={self.char_coverage} --model_type={self.model_type}",
    356     f"--unk_id={len(spec_tokens)} --pad_id=-1 --bos_id=-1 --eos_id=-1 --minloglevel=2",
    357     f"--user_defined_symbols={','.join(spec_tokens)} --hard_vocab_limit=false"]))
    358 raw_text_path.unlink()
    359 return self.cache_dir/'spm.model'

File /opt/conda/lib/python3.10/site-packages/sentencepiece/__init__.py:989, in SentencePieceTrainer.Train(arg, logstream, **kwargs)
    986 @staticmethod
    987 def Train(arg=None, logstream=None, **kwargs):
    988   with _LogStream(ostream=logstream):
--> 989     SentencePieceTrainer._Train(arg=arg, **kwargs)

File /opt/conda/lib/python3.10/site-packages/sentencepiece/__init__.py:945, in SentencePieceTrainer._Train(arg, **kwargs)
    943 """Train Sentencepiece model. Accept both kwargs and legacy string arg."""
    944 if arg is not None and type(arg) is str:
--> 945   return SentencePieceTrainer._TrainFromString(arg)
    947 def _encode(value):
    948   """Encode value to CSV.."""

File /opt/conda/lib/python3.10/site-packages/sentencepiece/__init__.py:923, in SentencePieceTrainer._TrainFromString(arg)
    921 @staticmethod
    922 def _TrainFromString(arg):
--> 923     return _sentencepiece.SentencePieceTrainer__TrainFromString(arg)

RuntimeError: Internal: src/trainer_interface.cc(429) [!sentences_.empty()] 

If I throw out SentencePieceTokenizer, the DataLoaders are created properly and show_batch() prints the tokens.
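
To be concrete, this is the variant without SentencePieceTokenizer (so TextBlock falls back to the default word tokenizer) that works for me. It is the same code as above with only the tok= argument dropped:

from fastai.text.all import *

path = '/kaggle/input/nlp-test-5'
files = partial(get_text_files, folders=['unsup'])

# Same DataBlock as before, just without tok=tokenizer
dls_lm = DataBlock(
    blocks=TextBlock.from_folder('/kaggle/working', is_lm=True),
    get_items=files,
    splitter=RandomSplitter(0.1)
).dataloaders(path, path=path, bs=64, seq_len=80)

dls_lm.show_batch(max_n=2)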

My input is several .txt files, each containing several sentences. Each sentence ends with a period (.) and is on its own line, and the line endings are UNIX LF.
None of the files are larger than 300 B.
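
For reference, the files can be listed with the same get_text_files call that the DataBlock uses (just a sanity-check snippet, not part of the training code):

from fastai.text.all import *

path = '/kaggle/input/nlp-test-5'

# List the .txt files that the DataBlock's get_items call would return
files = get_text_files(path, folders=['unsup'])
print(len(files), files[:3])

# Peek at the raw contents of the first file
print(files[0].read_text())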

Structure of input data:

[screenshot of the input folder structure]

Can anyone help me with this issue? I tried googling it, but there are only a few results about sentences_.empty() and none of them helped me. :confused: