Hi. I’m hitting a weird error while using sentencepiece in a notebook that I previously ran successfully a few weeks ago. Here is everything I’ve done, followed by the stack trace:
# Build a language-model DataBlock over the tweet text using SentencePiece.
# NOTE(review): the tokenizer CLASS (not an instance) is passed as tok_func —
# fastai2 instantiates it internally during setup.
tok = SentencePieceTokenizer

text_block = TextBlock.from_df(text_cols='safe_text', is_lm=True, tok_func=tok)
tweet_lm = DataBlock(
    blocks=text_block,
    get_x=ColReader('text'),
    splitter=RandomSplitter(valid_pct=0.15, seed=42),
)

# Pool train and test texts — fine for LM pretraining since no labels are used.
df = pd.concat([train_df, test_df])
dls = tweet_lm.dataloaders(source=df, bs=int(bs), seq_len=int(seq_len))
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-18-124939045eb2> in <module>()
----> 1 dls = tweet_lm.dataloaders(source=df, bs=int(bs), seq_len=int(seq_len))
17 frames
/usr/local/lib/python3.6/dist-packages/fastai2/data/block.py in dataloaders(self, source, path, verbose, **kwargs)
105
106 def dataloaders(self, source, path='.', verbose=False, **kwargs):
--> 107 dsets = self.datasets(source)
108 kwargs = {**self.dls_kwargs, **kwargs, 'verbose': verbose}
109 return dsets.dataloaders(path=path, after_item=self.item_tfms, after_batch=self.batch_tfms, **kwargs)
/usr/local/lib/python3.6/dist-packages/fastai2/data/block.py in datasets(self, source, verbose)
102 splits = (self.splitter or RandomSplitter())(items)
103 pv(f"{len(splits)} datasets of sizes {','.join([str(len(s)) for s in splits])}", verbose)
--> 104 return Datasets(items, tfms=self._combine_type_tfms(), splits=splits, dl_type=self.dl_type, n_inp=self.n_inp, verbose=verbose)
105
106 def dataloaders(self, source, path='.', verbose=False, **kwargs):
/usr/local/lib/python3.6/dist-packages/fastai2/data/core.py in __init__(self, items, tfms, tls, n_inp, dl_type, **kwargs)
283 def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
284 super().__init__(dl_type=dl_type)
--> 285 self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
286 self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
287
/usr/local/lib/python3.6/dist-packages/fastai2/data/core.py in <listcomp>(.0)
283 def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
284 super().__init__(dl_type=dl_type)
--> 285 self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
286 self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
287
/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py in __call__(cls, x, *args, **kwargs)
45 return x
46
---> 47 res = super().__call__(*((x,) + args), **kwargs)
48 res._newchk = 0
49 return res
/usr/local/lib/python3.6/dist-packages/fastai2/data/core.py in __init__(self, items, tfms, use_list, do_setup, split_idx, train_setup, splits, types, verbose, dl_type)
221 if do_setup:
222 pv(f"Setting up {self.tfms}", verbose)
--> 223 self.setup(train_setup=train_setup)
224
225 def _new(self, items, split_idx=None, **kwargs):
/usr/local/lib/python3.6/dist-packages/fastai2/data/core.py in setup(self, train_setup)
237
238 def setup(self, train_setup=True):
--> 239 self.tfms.setup(self, train_setup)
240 if len(self) != 0:
241 x = super().__getitem__(0) if self.splits is None else super().__getitem__(self.splits[0])[0]
/usr/local/lib/python3.6/dist-packages/fastcore/transform.py in setup(self, items, train_setup)
179 tfms = self.fs[:]
180 self.fs.clear()
--> 181 for t in tfms: self.add(t,items, train_setup)
182
183 def add(self,t, items=None, train_setup=False):
/usr/local/lib/python3.6/dist-packages/fastcore/transform.py in add(self, t, items, train_setup)
182
183 def add(self,t, items=None, train_setup=False):
--> 184 t.setup(items, train_setup)
185 self.fs.append(t)
186
/usr/local/lib/python3.6/dist-packages/fastcore/transform.py in setup(self, items, train_setup)
76 def setup(self, items=None, train_setup=False):
77 train_setup = train_setup if self.train_setup is None else self.train_setup
---> 78 return self.setups(getattr(items, 'train', items) if train_setup else items)
79
80 def _call(self, fn, x, split_idx=None, **kwargs):
/usr/local/lib/python3.6/dist-packages/fastcore/dispatch.py in __call__(self, *args, **kwargs)
96 if not f: return args[0]
97 if self.inst is not None: f = MethodType(f, self.inst)
---> 98 return f(*args, **kwargs)
99
100 def __get__(self, inst, owner):
/usr/local/lib/python3.6/dist-packages/fastai2/text/core.py in setups(self, dsets)
284 def setups(self, dsets):
285 if not self.mode == 'df' or not isinstance(dsets.items, pd.DataFrame): return
--> 286 dsets.items,count = tokenize_df(dsets.items, self.text_cols, rules=self.rules, **self.kwargs)
287 if self.counter is None: self.counter = count
288 return dsets
/usr/local/lib/python3.6/dist-packages/fastai2/text/core.py in tokenize_df(df, text_cols, n_workers, rules, mark_fields, tok_func, res_col_name, **tok_kwargs)
214 rules = L(ifnone(rules, defaults.text_proc_rules.copy()))
215 texts = _join_texts(df[text_cols], mark_fields=mark_fields)
--> 216 outputs = L(parallel_tokenize(texts, tok_func, rules, n_workers=n_workers, **tok_kwargs)
217 ).sorted().itemgot(1)
218
/usr/local/lib/python3.6/dist-packages/fastai2/text/core.py in parallel_tokenize(items, tok_func, rules, as_gen, n_workers, **tok_kwargs)
140 def parallel_tokenize(items, tok_func, rules, as_gen=False, n_workers=defaults.cpus, **tok_kwargs):
141 "Calls a potential setup on `tok_func` before launching `TokenizeBatch` in parallel"
--> 142 if hasattr(tok_func, 'setup'): tok_kwargs = tok_func(**tok_kwargs).setup(items, rules)
143 return parallel_gen(TokenizeBatch, items, as_gen=as_gen, tok_func=tok_func,
144 rules=rules, n_workers=n_workers, **tok_kwargs)
/usr/local/lib/python3.6/dist-packages/fastai2/text/core.py in setup(self, items, rules)
365 for t in progress_bar(maps(*rules, items), total=len(items), leave=False):
366 f.write(f'{t}\n')
--> 367 sp_model = self.train(raw_text_path)
368 self.tok = SentencePieceProcessor()
369 self.tok.Load(str(sp_model))
/usr/local/lib/python3.6/dist-packages/fastai2/text/core.py in train(self, raw_text_path)
353 f"--character_coverage={self.char_coverage} --model_type={self.model_type}",
354 f"--unk_id={len(spec_tokens)} --pad_id=-1 --bos_id=-1 --eos_id=-1 --minloglevel=2",
--> 355 f"--user_defined_symbols={','.join(spec_tokens)}"]))
356 raw_text_path.unlink()
357 return self.cache_dir/'spm.model'
/usr/local/lib/python3.6/dist-packages/sentencepiece.py in Train(arg, **kwargs)
402 """Train Sentencepiece model. Accept both kwargs and legacy string arg."""
403 if arg is not None and type(arg) is str:
--> 404 return SentencePieceTrainer._TrainFromString(arg)
405
406 def _encode(value):
/usr/local/lib/python3.6/dist-packages/sentencepiece.py in _TrainFromString(arg)
380 @staticmethod
381 def _TrainFromString(arg):
--> 382 return _sentencepiece.SentencePieceTrainer__TrainFromString(arg)
383
384 @staticmethod
OSError: Not found: unknown field name "minloglevel" in TrainerSpec.
Has anyone else experienced this, and if so, what fix did you use?