Error with sentencepiece

Hi. I’m getting a weird error while using sentencepiece in a notebook that I ran successfully a few weeks ago. Here is everything I’ve done, followed by the stack trace:

from fastai2.text.all import *   # DataBlock, TextBlock, ColReader, RandomSplitter, SentencePieceTokenizer
import pandas as pd

tok = SentencePieceTokenizer   # use SentencePiece instead of the default tokenizer

tweet_lm = DataBlock(
            blocks=TextBlock.from_df(text_cols='safe_text', is_lm=True, tok_func=tok),
            get_x=ColReader('text'),
            splitter=RandomSplitter(valid_pct=0.15, seed=42)
            )

df = pd.concat([train_df, test_df])   # combine train and test tweets for language-model pretraining

dls = tweet_lm.dataloaders(source=df, bs=int(bs), seq_len=int(seq_len))
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
<ipython-input-18-124939045eb2> in <module>()
----> 1 dls = tweet_lm.dataloaders(source=df, bs=int(bs), seq_len=int(seq_len))

17 frames
/usr/local/lib/python3.6/dist-packages/fastai2/data/block.py in dataloaders(self, source, path, verbose, **kwargs)
    105 
    106     def dataloaders(self, source, path='.', verbose=False, **kwargs):
--> 107         dsets = self.datasets(source)
    108         kwargs = {**self.dls_kwargs, **kwargs, 'verbose': verbose}
    109         return dsets.dataloaders(path=path, after_item=self.item_tfms, after_batch=self.batch_tfms, **kwargs)

/usr/local/lib/python3.6/dist-packages/fastai2/data/block.py in datasets(self, source, verbose)
    102         splits = (self.splitter or RandomSplitter())(items)
    103         pv(f"{len(splits)} datasets of sizes {','.join([str(len(s)) for s in splits])}", verbose)
--> 104         return Datasets(items, tfms=self._combine_type_tfms(), splits=splits, dl_type=self.dl_type, n_inp=self.n_inp, verbose=verbose)
    105 
    106     def dataloaders(self, source, path='.', verbose=False, **kwargs):

/usr/local/lib/python3.6/dist-packages/fastai2/data/core.py in __init__(self, items, tfms, tls, n_inp, dl_type, **kwargs)
    283     def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
    284         super().__init__(dl_type=dl_type)
--> 285         self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
    286         self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
    287 

/usr/local/lib/python3.6/dist-packages/fastai2/data/core.py in <listcomp>(.0)
    283     def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
    284         super().__init__(dl_type=dl_type)
--> 285         self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
    286         self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
    287 

/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py in __call__(cls, x, *args, **kwargs)
     45             return x
     46 
---> 47         res = super().__call__(*((x,) + args), **kwargs)
     48         res._newchk = 0
     49         return res

/usr/local/lib/python3.6/dist-packages/fastai2/data/core.py in __init__(self, items, tfms, use_list, do_setup, split_idx, train_setup, splits, types, verbose, dl_type)
    221         if do_setup:
    222             pv(f"Setting up {self.tfms}", verbose)
--> 223             self.setup(train_setup=train_setup)
    224 
    225     def _new(self, items, split_idx=None, **kwargs):

/usr/local/lib/python3.6/dist-packages/fastai2/data/core.py in setup(self, train_setup)
    237 
    238     def setup(self, train_setup=True):
--> 239         self.tfms.setup(self, train_setup)
    240         if len(self) != 0:
    241             x = super().__getitem__(0) if self.splits is None else super().__getitem__(self.splits[0])[0]

/usr/local/lib/python3.6/dist-packages/fastcore/transform.py in setup(self, items, train_setup)
    179         tfms = self.fs[:]
    180         self.fs.clear()
--> 181         for t in tfms: self.add(t,items, train_setup)
    182 
    183     def add(self,t, items=None, train_setup=False):

/usr/local/lib/python3.6/dist-packages/fastcore/transform.py in add(self, t, items, train_setup)
    182 
    183     def add(self,t, items=None, train_setup=False):
--> 184         t.setup(items, train_setup)
    185         self.fs.append(t)
    186 

/usr/local/lib/python3.6/dist-packages/fastcore/transform.py in setup(self, items, train_setup)
     76     def setup(self, items=None, train_setup=False):
     77         train_setup = train_setup if self.train_setup is None else self.train_setup
---> 78         return self.setups(getattr(items, 'train', items) if train_setup else items)
     79 
     80     def _call(self, fn, x, split_idx=None, **kwargs):

/usr/local/lib/python3.6/dist-packages/fastcore/dispatch.py in __call__(self, *args, **kwargs)
     96         if not f: return args[0]
     97         if self.inst is not None: f = MethodType(f, self.inst)
---> 98         return f(*args, **kwargs)
     99 
    100     def __get__(self, inst, owner):

/usr/local/lib/python3.6/dist-packages/fastai2/text/core.py in setups(self, dsets)
    284     def setups(self, dsets):
    285         if not self.mode == 'df' or not isinstance(dsets.items, pd.DataFrame): return
--> 286         dsets.items,count = tokenize_df(dsets.items, self.text_cols, rules=self.rules, **self.kwargs)
    287         if self.counter is None: self.counter = count
    288         return dsets

/usr/local/lib/python3.6/dist-packages/fastai2/text/core.py in tokenize_df(df, text_cols, n_workers, rules, mark_fields, tok_func, res_col_name, **tok_kwargs)
    214     rules = L(ifnone(rules, defaults.text_proc_rules.copy()))
    215     texts = _join_texts(df[text_cols], mark_fields=mark_fields)
--> 216     outputs = L(parallel_tokenize(texts, tok_func, rules, n_workers=n_workers, **tok_kwargs)
    217                ).sorted().itemgot(1)
    218 

/usr/local/lib/python3.6/dist-packages/fastai2/text/core.py in parallel_tokenize(items, tok_func, rules, as_gen, n_workers, **tok_kwargs)
    140 def parallel_tokenize(items, tok_func, rules, as_gen=False, n_workers=defaults.cpus, **tok_kwargs):
    141     "Calls a potential setup on `tok_func` before launching `TokenizeBatch` in parallel"
--> 142     if hasattr(tok_func, 'setup'): tok_kwargs = tok_func(**tok_kwargs).setup(items, rules)
    143     return parallel_gen(TokenizeBatch, items, as_gen=as_gen, tok_func=tok_func,
    144                         rules=rules, n_workers=n_workers, **tok_kwargs)

/usr/local/lib/python3.6/dist-packages/fastai2/text/core.py in setup(self, items, rules)
    365             for t in progress_bar(maps(*rules, items), total=len(items), leave=False):
    366                 f.write(f'{t}\n')
--> 367         sp_model = self.train(raw_text_path)
    368         self.tok = SentencePieceProcessor()
    369         self.tok.Load(str(sp_model))

/usr/local/lib/python3.6/dist-packages/fastai2/text/core.py in train(self, raw_text_path)
    353             f"--character_coverage={self.char_coverage} --model_type={self.model_type}",
    354             f"--unk_id={len(spec_tokens)} --pad_id=-1 --bos_id=-1 --eos_id=-1 --minloglevel=2",
--> 355             f"--user_defined_symbols={','.join(spec_tokens)}"]))
    356         raw_text_path.unlink()
    357         return self.cache_dir/'spm.model'

/usr/local/lib/python3.6/dist-packages/sentencepiece.py in Train(arg, **kwargs)
    402       """Train Sentencepiece model. Accept both kwargs and legacy string arg."""
    403       if arg is not None and type(arg) is str:
--> 404         return SentencePieceTrainer._TrainFromString(arg)
    405 
    406       def _encode(value):

/usr/local/lib/python3.6/dist-packages/sentencepiece.py in _TrainFromString(arg)
    380     @staticmethod
    381     def _TrainFromString(arg):
--> 382         return _sentencepiece.SentencePieceTrainer__TrainFromString(arg)
    383 
    384     @staticmethod

OSError: Not found: unknown field name "minloglevel" in TrainerSpec.

Has anyone else experienced this, and if so, what fix did you use?

What version of SentencePiece are you using? It looks like the issue is on their side and has already been addressed: https://github.com/google/sentencepiece/issues/492
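
If you’re not sure which version you have, a quick way to check from the notebook (just a generic snippet, nothing fastai-specific) is:

import sentencepiece
print(sentencepiece.__version__)   # e.g. '0.1.91'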

It turns out it is a version problem: 0.1.91 throws that error, so I downgraded to 0.1.86.
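
In case it helps anyone else, the downgrade in my (Colab) notebook was just:

!pip install sentencepiece==0.1.86   # then restart the runtime so the older version is picked up
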
Thanks