The issue may be that numericalize’s decodes changed. See here:

# Original

# Look up every id in `o` in `self.vocab` and collect the entries in an `L`,
# skipping any entry equal to the module-level `PAD` constant.
def decodes(self, o): return L(self.vocab[o_] for o_ in o if self.vocab[o_] != PAD)

# Now
# Look up every id in `o` in `self.vocab` and collect the entries in an `L`,
# skipping any entry equal to the instance attribute `self.pad_tok`.
def decodes(self, o): return L(self.vocab[o_] for o_ in o if self.vocab[o_] != self.pad_tok)

But that’s the only major difference between the versions as far as the related text files are concerned. (pad_tok defaults to None now)

@muellerzr and @morgan. I found the issue.

The output of Numericalize’s decodes has changed to an “L”, while show_batch stayed the same and still expects a TitledStr.

In fastai2 0.0.8, each s in samples in the show_batch function is a tuple with these characteristics:

print(type(s), len(s), type(s[0]), type(s[1]))
<class 'tuple'> 2 <class 'fastai2.torch_core.TitledStr'> <class ''> 

In fastai2 0.0.17 (and also 18):

print(type(s), len(s), type(s[0]), type(s[1]))
<class 'tuple'> 2 <class ''> <class ''>

So truncate does not work, because show_batch expects a “TitledStr”, not an “L”. I changed show_batch as follows, and now it works:

From (not working):

def show_batch(x: TensorText, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):
    "Truncate the first element of each sample to `trunc_at`, then delegate display to `show_batch[object]`"
    # When the caller supplies no contexts, create them with `get_empty_df`,
    # sized to the smaller of the number of samples and `max_n`.
    if ctxs is None: ctxs = get_empty_df(min(len(samples), max_n))
    # `.truncate` is called directly on `s[0]`, so `s[0]` must be an object that
    # provides it; the remaining elements of each sample are passed through unchanged.
    samples = L((s[0].truncate(trunc_at),*s[1:]) for s in samples)
    ctxs = show_batch[object](x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)
    return ctxs

To (working):

def show_batch(x: TensorText, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):
    "Join the first element of each sample into a `TitledStr`, truncate it to `trunc_at`, then delegate to `show_batch[object]`"
    # When the caller supplies no contexts, create them with `get_empty_df`,
    # sized to the smaller of the number of samples and `max_n`.
    if ctxs is None: ctxs = get_empty_df(min(len(samples), max_n))     
    # The items of `s[0]` are joined with single spaces and wrapped in `TitledStr`
    # before `.truncate` is called; the remaining elements pass through unchanged.
    # NOTE(review): presumably `s[0]` is a sequence of token strings here — confirm against the decode pipeline.
    samples = L((TitledStr(" ".join(s[0])).truncate(trunc_at),*s[1:]) for s in samples)
    ctxs = show_batch[object](x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)
    return ctxs

Numericalize in 0.0.8:

class Numericalize(Transform):
    "Reversible transform of tokenized texts to numericalized ids"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, sep=' '):
        # `sep` is the string `decodes` uses to join tokens back together.
        self.vocab,self.min_freq,self.max_vocab,self.sep = vocab,min_freq,max_vocab,sep
        # Token -> index lookup, built only when a vocab is supplied up front.
        # It is a `defaultdict(int)`, so a token missing from the vocab maps to index 0.
        self.o2i = None if vocab is None else defaultdict(int, {v:k for k,v in enumerate(vocab)})

    def setups(self, dsets):
        # Nothing to set up without data; a vocab passed to `__init__` is never rebuilt.
        if dsets is None: return
        if self.vocab is None:
            # Use `dsets.counter` when that attribute exists; otherwise count every item of every element in `dsets`.
            count = dsets.counter if hasattr(dsets, 'counter') else Counter(p for o in dsets for p in o)
            self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab)
            # Entries equal to 'xxfake' are left out of the lookup table.
            self.o2i = defaultdict(int, {v:k for k,v in enumerate(self.vocab) if v != 'xxfake'})

    # Map each item of `o` to its index through `self.o2i` and wrap the result as a `TensorText`.
    def encodes(self, o): return TensorText(tensor([self.o2i  [o_] for o_ in o]))
    # Map each id back to its vocab entry, drop entries equal to `PAD`, and return
    # the remainder joined with `self.sep` as a single `TitledStr`.
    def decodes(self, o): return TitledStr(self.sep.join([self.vocab[o_] for o_ in o if self.vocab[o_] != PAD]))

Numericalize in 0.0.18:

class Numericalize(Transform):
    "Reversible transform of tokenized texts to numericalized ids"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, special_toks=None, pad_tok=None):
        # NOTE(review): presumably `store_attr` sets the listed constructor arguments as
        # attributes on `self` (the methods below read them from there) — confirm.
        store_attr(self, 'vocab,min_freq,max_vocab,special_toks,pad_tok')
        # Token -> index lookup, built only when a vocab is supplied up front.
        # It is a `defaultdict(int)`, so a token missing from the vocab maps to index 0.
        self.o2i = None if vocab is None else defaultdict(int, {v:k for k,v in enumerate(vocab)})

    def setups(self, dsets):
        # Nothing to set up without data; a vocab passed to `__init__` is never rebuilt.
        if dsets is None: return
        if self.vocab is None:
            # Use `dsets.counter` only when it exists and is not None; otherwise count every item of every element in `dsets`.
            count = dsets.counter if getattr(dsets, 'counter', None) is not None else Counter(p for o in dsets for p in o)
            # Fall back to the special tokens carried by `dsets` when none were given explicitly.
            if self.special_toks is None and hasattr(dsets, 'special_toks'):
                self.special_toks = dsets.special_toks
            self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab, special_toks=self.special_toks)
            # Entries equal to 'xxfake' are left out of the lookup table.
            self.o2i = defaultdict(int, {v:k for k,v in enumerate(self.vocab) if v != 'xxfake'})

    # Map each item of `o` to its index through `self.o2i` and wrap the result as a `TensorText`.
    def encodes(self, o): return TensorText(tensor([self.o2i  [o_] for o_ in o]))
    # Map each id back to its vocab entry and return the entries as an `L` (not a joined string),
    # dropping entries equal to `self.pad_tok`. With the default `pad_tok=None`, only entries
    # that compare equal to None are dropped.
    def decodes(self, o): return L(self.vocab[o_] for o_ in o if self.vocab[o_] != self.pad_tok)

Yeah I’m still shaky at it, this is my bible every time : Submitting PR to fastai V2


The rules don’t seem to be what’s causing the issue. I’m struggling to create a faketokenizer that consistently deals with text that has been tokenized via tokenize_df and via external libraries etc.

I have a notebook which shows my attempts (I’m probably missing something obvious here). I’m not sure how common this use case will be, I only started trying as I saw the noop suggestion mentioned in the docs.

Currently stuck with KeyError: new_zeros when trying to convert my multi-category text classifier. Any ideas?

# Remember the batch size of the first DataLoader.
orig_bs = x.dls[0].bs
# Pull a single batch to use as the example input for tracing.
dummy_inp = next(iter(x.dls[0]))
# Write the remembered batch size back.
# NOTE(review): nothing in this excerpt changes `bs` between saving and restoring it; the
# traceback below numbers these lines 7-9, so earlier lines appear to have been omitted — confirm.
x.dls[0].bs = orig_bs
# Export the model, passing every element of the batch except the last as model inputs.
torch.onnx.export(x.model, dummy_inp[:-1], 'text_classifier_v3.onnx')
KeyError                                  Traceback (most recent call last)
<ipython-input-39-60ac0a786093> in <module>
      7 dummy_inp = next(iter(x.dls[0]))
      8 x.dls[0].bs = orig_bs
----> 9 torch.onnx.export(x.model, dummy_inp[:-1], 'text_classifier_v3.onnx')

~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/ in export(model, args, f, export_params, verbose, training, input_names, output_names, aten, export_raw_ir, operator_export_type, opset_version, _retain_param_name, do_constant_folding, example_outputs, strip_doc_string, dynamic_axes, keep_initializers_as_inputs)
    146                         operator_export_type, opset_version, _retain_param_name,
    147                         do_constant_folding, example_outputs,
--> 148                         strip_doc_string, dynamic_axes, keep_initializers_as_inputs)

~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/ in export(model, args, f, export_params, verbose, training, input_names, output_names, aten, export_raw_ir, operator_export_type, opset_version, _retain_param_name, do_constant_folding, example_outputs, strip_doc_string, dynamic_axes, keep_initializers_as_inputs)
     64             _retain_param_name=_retain_param_name, do_constant_folding=do_constant_folding,
     65             example_outputs=example_outputs, strip_doc_string=strip_doc_string,
---> 66             dynamic_axes=dynamic_axes, keep_initializers_as_inputs=keep_initializers_as_inputs)

~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/ in _export(model, args, f, export_params, verbose, training, input_names, output_names, operator_export_type, export_type, example_outputs, propagate, opset_version, _retain_param_name, do_constant_folding, strip_doc_string, dynamic_axes, keep_initializers_as_inputs, fixed_batch_size)
    414                                                         example_outputs, propagate,
    415                                                         _retain_param_name, do_constant_folding,
--> 416                                                         fixed_batch_size=fixed_batch_size)
    418         # TODO: Don't allocate a in-memory string for the protobuf

~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/ in _model_to_graph(model, args, verbose, training, input_names, output_names, operator_export_type, example_outputs, propagate, _retain_param_name, do_constant_folding, _disable_torch_constant_prop, fixed_batch_size)
    294     graph = _optimize_graph(graph, operator_export_type,
    295                             _disable_torch_constant_prop=_disable_torch_constant_prop,
--> 296                             fixed_batch_size=fixed_batch_size, params_dict=params_dict)
    298     if isinstance(model, torch.jit.ScriptModule) or isinstance(model, torch.jit.ScriptFunction):

~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/ in _optimize_graph(graph, operator_export_type, _disable_torch_constant_prop, fixed_batch_size, params_dict)
    133         torch._C._jit_pass_erase_number_types(graph)
--> 135         graph = torch._C._jit_pass_onnx(graph, operator_export_type)
    136         torch._C._jit_pass_lint(graph)

~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/ in _run_symbolic_function(*args, **kwargs)
    177 def _run_symbolic_function(*args, **kwargs):
    178     from torch.onnx import utils
--> 179     return utils._run_symbolic_function(*args, **kwargs)

~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/ in _run_symbolic_function(g, n, inputs, env, operator_export_type)
    654                                   "torch.onnx.symbolic_opset{}.{} does not exist"
    655                                   .format(op_name, opset_version, op_name))
--> 656                 op_fn = sym_registry.get_registered_op(op_name, '', opset_version)
    657                 return op_fn(g, *inputs, **attrs)

~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/ in get_registered_op(opname, domain, version)
     89         warnings.warn("ONNX export failed. The ONNX domain and/or version are None.")
     90     global _registry
---> 91     return _registry[(domain, version)][opname]

KeyError: 'new_zeros'


Does anyone know what LM_Dataset and LM_Sampler are in the notebook?

IIRC you cannot ONNX ULMFiT nor the unet OOTB because of their forward functions. I saw this issue with fastinference. (There is no solution currently)

Bummer. Any ideas on model serving ULMFiT for large batches? TorchServe or something else.

Not really, besides stepping away from fastai per se (like I did with fastinference; it helped some, but not much).

Good to know. Going to give Torchserve another go using their custom handler. I’ll let you know if it works


I got a KeyError for “val_res”, and I guess it comes from the new commit. As indicated in the update, the following code needs to be implemented in my own notebook:

Pass the training dataset text lengths to SortedDL

srtd_dl=partial(SortedDL, res = train_text_lens)

Pass the validation dataset text lengths

dl_kwargs = [{},{'val_res': val_text_lens}]

init our Datasets

dsets = Datasets(…)

init our Dataloaders

dls = dsets.dataloaders(…,dl_type = srtd_dl, dl_kwargs = dl_kwargs)

My question is: how do I get train_text_lens and val_text_lens? Should I use _get_lengths? And why not integrate this into the SortedDL implementation?

I’m also getting that KeyError: 'val_res' error message running the 01_intro notebook in the fastbook repo after purchasing the new O’reilly book.

Here’s the line that is causing the error:

dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test')

Here’s the stack trace:

KeyError                                  Traceback (most recent call last)
<ipython-input-15-abc40112d96e> in <module>
----> 1 dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test')

~/fastai2/fastai2/text/ in from_folder(cls, path, train, valid, valid_pct, seed, vocab, text_vocab, is_lm, tok_tfm, seq_len, backwards, **kwargs)
    229                            splitter=splitter,
    230                            get_y=None if is_lm else parent_label)
--> 231         return cls.from_dblock(dblock, path, path=path, seq_len=seq_len, **kwargs)
    233     @classmethod

~/fastai2/fastai2/data/ in from_dblock(cls, dblock, source, path, bs, val_bs, shuffle_train, device, **kwargs)
    165     @classmethod
    166     def from_dblock(cls, dblock, source, path='.',  bs=64, val_bs=None, shuffle_train=True, device=None, **kwargs):
--> 167         return dblock.dataloaders(source, path=path, bs=bs, val_bs=val_bs, shuffle_train=shuffle_train, device=device, **kwargs)
    169     _docs=dict(__getitem__="Retrieve `DataLoader` at `i` (`0` is training, `1` is validation)",

~/fastai2/fastai2/data/ in dataloaders(self, source, path, verbose, **kwargs)
    107         dsets = self.datasets(source)
    108         kwargs = {**self.dls_kwargs, **kwargs, 'verbose': verbose}
--> 109         return dsets.dataloaders(path=path, after_item=self.item_tfms, after_batch=self.batch_tfms, **kwargs)
    111     _docs = dict(new="Create a new `DataBlock` with other `item_tfms` and `batch_tfms`",

~/fastai2/fastai2/data/ in dataloaders(self, bs, val_bs, shuffle_train, n, path, dl_type, dl_kwargs, device, **kwargs)
    201                      **merge(kwargs, dl_kwargs[0]))
    202         dls = [dl] + [, bs=(bs if val_bs is None else val_bs), shuffle=False, drop_last=False,
--> 203                              n=None, **dl_kwargs[i]) for i in range(1, self.n_subsets)]
    204         return self._dbunch_type(*dls, path=path, device=device)

~/fastai2/fastai2/data/ in <listcomp>(.0)
    201                      **merge(kwargs, dl_kwargs[0]))
    202         dls = [dl] + [, bs=(bs if val_bs is None else val_bs), shuffle=False, drop_last=False,
--> 203                              n=None, **dl_kwargs[i]) for i in range(1, self.n_subsets)]
    204         return self._dbunch_type(*dls, path=path, device=device)

~/fastai2/fastai2/text/ in new(self, dataset, **kwargs)
    184     @delegates(
    185     def new(self, dataset=None, **kwargs):
--> 186         if kwargs['val_res'] is not None: res = kwargs['val_res']
    187         else: res = self.res if dataset is None else None
    188         return super().new(dataset=dataset, res=res, **kwargs)

KeyError: 'val_res'

I’m using the latest fastai2 version from the git repository. Does anyone have any work arounds?

You can just remove the following part for the time being:
--> 186 if kwargs['val_res'] is not None: res = kwargs['val_res']
187 else:

Sorry I introduced that bug here

Waiting on this issue to be addressed before I can push a fix

val_res fix submitted here now:

Should be pushed now