Fastai v2 text

The issue may be that Numericalize's decodes changed. See here:

# Original
def decodes(self, o): return L(self.vocab[o_] for o_ in o if self.vocab[o_] != PAD)

# Now
def decodes(self, o): return L(self.vocab[o_] for o_ in o if self.vocab[o_] != self.pad_tok)

But that's it as far as major differences between the versions with regard to the related text files (pad_tok defaults to None now).
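
If you want the old behaviour of stripping padding when decoding, a minimal sketch would be to pass the pad token explicitly (this assumes fastai's default special token 'xxpad' and a vocab my_vocab you already have):

# sketch: pass the pad token explicitly so decodes filters it out again
# (assumes the default fastai special token 'xxpad'; adjust to your vocab)
num = Numericalize(vocab=my_vocab, pad_tok='xxpad')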

Thanks, looking forward to it!

Hopefully self.pad_tok does the trick!

@muellerzr and @morgan, I found the issue.

Numericalize's decodes output has changed to an L, while show_batch stayed the same, expecting a TitledStr.

In fastai2 0.0.8, each s in samples in the show_batch function is a tuple with these characteristics:

print(type(s), len(s), type(s[0]), type(s[1]))
<class 'tuple'> 2 <class 'fastai2.torch_core.TitledStr'> <class 'fastai2.data.transforms.Category'> 

In fastai2 0.0.17 (and also 0.0.18):

print(type(s), len(s), type(s[0]), type(s[1]))
<class 'tuple'> 2 <class 'fastcore.foundation.L'> <class 'fastai2.data.transforms.Category'>

So truncate does not work because it expects a TitledStr, not an L. I changed show_batch and now it works:

From (not working):

@typedispatch
def show_batch(x: TensorText, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):
    if ctxs is None: ctxs = get_empty_df(min(len(samples), max_n))
    samples = L((s[0].truncate(trunc_at),*s[1:]) for s in samples)
    ctxs = show_batch[object](x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)
    display_df(pd.DataFrame(ctxs))
    return ctxs

To (working):

@typedispatch
def show_batch(x: TensorText, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):
    if ctxs is None: ctxs = get_empty_df(min(len(samples), max_n))     
    samples = L((TitledStr(" ".join(s[0])).truncate(trunc_at),*s[1:]) for s in samples)
    ctxs = show_batch[object](x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)
    display_df(pd.DataFrame(ctxs))
    return ctxs
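
For reference, here is how I checked the patch (assuming dls is the DataLoaders built from a text DataBlock):

# with the patched show_batch in scope, decoded text is truncated and displayed again
dls.show_batch(max_n=2, trunc_at=100)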

Numericalize in 0.0.8:

class Numericalize(Transform):
    "Reversible transform of tokenized texts to numericalized ids"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, sep=' '):
        self.vocab,self.min_freq,self.max_vocab,self.sep = vocab,min_freq,max_vocab,sep
        self.o2i = None if vocab is None else defaultdict(int, {v:k for k,v in enumerate(vocab)})

    def setups(self, dsets):
        if dsets is None: return
        if self.vocab is None:
            count = dsets.counter if hasattr(dsets, 'counter') else Counter(p for o in dsets for p in o)
            self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab)
            self.o2i = defaultdict(int, {v:k for k,v in enumerate(self.vocab) if v != 'xxfake'})

    def encodes(self, o): return TensorText(tensor([self.o2i[o_] for o_ in o]))
    def decodes(self, o): return TitledStr(self.sep.join([self.vocab[o_] for o_ in o if self.vocab[o_] != PAD]))

Numericalize in 0.0.18:

class Numericalize(Transform):
    "Reversible transform of tokenized texts to numericalized ids"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, special_toks=None, pad_tok=None):
        store_attr(self, 'vocab,min_freq,max_vocab,special_toks,pad_tok')
        self.o2i = None if vocab is None else defaultdict(int, {v:k for k,v in enumerate(vocab)})

    def setups(self, dsets):
        if dsets is None: return
        if self.vocab is None:
            count = dsets.counter if getattr(dsets, 'counter', None) is not None else Counter(p for o in dsets for p in o)
            if self.special_toks is None and hasattr(dsets, 'special_toks'):
                self.special_toks = dsets.special_toks
            self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab, special_toks=self.special_toks)
            self.o2i = defaultdict(int, {v:k for k,v in enumerate(self.vocab) if v != 'xxfake'})

    def encodes(self, o): return TensorText(tensor([self.o2i[o_] for o_ in o]))
    def decodes(self, o): return L(self.vocab[o_] for o_ in o if self.vocab[o_] != self.pad_tok)

Nice one! Worth a PR for show_batch?

I think so. But I am not used to GitHub and the PR process (my bad)!

Yeah, I'm still shaky at it; this is my bible every time: Submitting PR to fastai V2

The rules don't seem to be what's causing the issue. I'm struggling to create a fake tokenizer that consistently deals with text that has been tokenized via tokenize_df as well as via external libraries, etc.

I have a notebook which shows my attempts (I'm probably missing something obvious here). I'm not sure how common this use case will be; I only started trying because I saw the noop suggestion mentioned in the docs.
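
For context, the kind of no-op tokenizer I have been experimenting with looks roughly like this (a minimal sketch of my own, not a fastai API; it assumes the inputs are already token lists or space-joined strings):

# hypothetical no-op "tokenizer" for pre-tokenized text: fastai tok callables
# take a batch of items and yield one list of tokens per item
class FakeTokenizer:
    def __call__(self, items):
        for t in items:
            # pass token lists through untouched, split space-joined strings
            yield t.split(' ') if isinstance(t, str) else list(t)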

Currently stuck with KeyError: 'new_zeros' when trying to convert my multi-category text classifier to ONNX. Any ideas?

x = learn_ent                      # the trained text classifier Learner
orig_bs = x.dls[0].bs
x.dls[0].bs = 1                    # grab a single-sample batch to use as the dummy input
dummy_inp = next(iter(x.dls[0]))
x.dls[0].bs = orig_bs
torch.onnx.export(x.model, dummy_inp[:-1], 'text_classifier_v3.onnx')
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-39-60ac0a786093> in <module>
      7 dummy_inp = next(iter(x.dls[0]))
      8 x.dls[0].bs = orig_bs
----> 9 torch.onnx.export(x.model, dummy_inp[:-1], 'text_classifier_v3.onnx')

~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/__init__.py in export(model, args, f, export_params, verbose, training, input_names, output_names, aten, export_raw_ir, operator_export_type, opset_version, _retain_param_name, do_constant_folding, example_outputs, strip_doc_string, dynamic_axes, keep_initializers_as_inputs)
    146                         operator_export_type, opset_version, _retain_param_name,
    147                         do_constant_folding, example_outputs,
--> 148                         strip_doc_string, dynamic_axes, keep_initializers_as_inputs)
    149 
    150 

~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/utils.py in export(model, args, f, export_params, verbose, training, input_names, output_names, aten, export_raw_ir, operator_export_type, opset_version, _retain_param_name, do_constant_folding, example_outputs, strip_doc_string, dynamic_axes, keep_initializers_as_inputs)
     64             _retain_param_name=_retain_param_name, do_constant_folding=do_constant_folding,
     65             example_outputs=example_outputs, strip_doc_string=strip_doc_string,
---> 66             dynamic_axes=dynamic_axes, keep_initializers_as_inputs=keep_initializers_as_inputs)
     67 
     68 

~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/utils.py in _export(model, args, f, export_params, verbose, training, input_names, output_names, operator_export_type, export_type, example_outputs, propagate, opset_version, _retain_param_name, do_constant_folding, strip_doc_string, dynamic_axes, keep_initializers_as_inputs, fixed_batch_size)
    414                                                         example_outputs, propagate,
    415                                                         _retain_param_name, do_constant_folding,
--> 416                                                         fixed_batch_size=fixed_batch_size)
    417 
    418         # TODO: Don't allocate a in-memory string for the protobuf

~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/utils.py in _model_to_graph(model, args, verbose, training, input_names, output_names, operator_export_type, example_outputs, propagate, _retain_param_name, do_constant_folding, _disable_torch_constant_prop, fixed_batch_size)
    294     graph = _optimize_graph(graph, operator_export_type,
    295                             _disable_torch_constant_prop=_disable_torch_constant_prop,
--> 296                             fixed_batch_size=fixed_batch_size, params_dict=params_dict)
    297 
    298     if isinstance(model, torch.jit.ScriptModule) or isinstance(model, torch.jit.ScriptFunction):

~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/utils.py in _optimize_graph(graph, operator_export_type, _disable_torch_constant_prop, fixed_batch_size, params_dict)
    133         torch._C._jit_pass_erase_number_types(graph)
    134 
--> 135         graph = torch._C._jit_pass_onnx(graph, operator_export_type)
    136         torch._C._jit_pass_lint(graph)
    137 

~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/__init__.py in _run_symbolic_function(*args, **kwargs)
    177 def _run_symbolic_function(*args, **kwargs):
    178     from torch.onnx import utils
--> 179     return utils._run_symbolic_function(*args, **kwargs)
    180 
    181 

~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/utils.py in _run_symbolic_function(g, n, inputs, env, operator_export_type)
    654                                   "torch.onnx.symbolic_opset{}.{} does not exist"
    655                                   .format(op_name, opset_version, op_name))
--> 656                 op_fn = sym_registry.get_registered_op(op_name, '', opset_version)
    657                 return op_fn(g, *inputs, **attrs)
    658 

~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/symbolic_registry.py in get_registered_op(opname, domain, version)
     89         warnings.warn("ONNX export failed. The ONNX domain and/or version are None.")
     90     global _registry
---> 91     return _registry[(domain, version)][opname]

KeyError: 'new_zeros'

Hello,

Does anyone know what LM_Dataset and LM_Sampler are in the notebook train_wt2.py?

IIRC you cannot export ULMFiT or the unet to ONNX out of the box because of their forward functions. I saw this issue with fastinference. (There is no solution currently.)

Bummer. Any ideas on serving ULMFiT models for large batches? TorchServe or something else?

Not really, besides stepping away from fastai per se (like I did with fastinference; it helped some, but not much).

Good to know. Going to give TorchServe another go using their custom handler. I'll let you know if it works.
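
For anyone curious, the rough shape I am planning is a module-level custom handler (a sketch only, assuming TorchServe's handle(data, context) entry point; the preprocessing is a placeholder and the model.pt name is my own choice):

# sketch of a TorchServe custom handler entry point for a text classifier
# (assumes the model archive bundles a scripted/traced model as model.pt)
import json, torch

model = None

def handle(data, context):
    global model
    if model is None:
        # lazy-load the serialized model shipped in the model archive
        model_dir = context.system_properties.get("model_dir")
        model = torch.jit.load(f"{model_dir}/model.pt", map_location="cpu")
        model.eval()
    if data is None:
        return None
    # placeholder preprocessing: expects already-numericalized token ids per request
    batch = [json.loads(row.get("body") or row.get("data")) for row in data]
    xb = torch.tensor(batch)
    with torch.no_grad():
        preds = model(xb)
    return preds.tolist()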

I got a KeyError for 'val_res', and I guess it comes from the new commit. As indicated in the updated text.data, the following code needs to be implemented in my own notebook:

Pass the training dataset text lengths to SortedDL

srtd_dl = partial(SortedDL, res=train_text_lens)

Pass the validation dataset text lengths

dl_kwargs = [{}, {'val_res': val_text_lens}]

Init our Datasets

dsets = Datasets(…)

Init our DataLoaders

dls = dsets.dataloaders(…, dl_type=srtd_dl, dl_kwargs=dl_kwargs)

My question is: how do I get train_text_lens and val_text_lens? Use _get_lengths? Why not integrate this into the SortedDL implementation?
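
For what it is worth, this is the kind of thing I tried as a guess (a sketch only; it assumes the Datasets have been created first and that each item is a (tokenized text, label) tuple, which may not match every pipeline):

# guess: count tokens per item in each split to feed SortedDL's res / val_res
train_text_lens = [len(x[0]) for x in dsets.train]
val_text_lens   = [len(x[0]) for x in dsets.valid]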

I'm also getting that KeyError: 'val_res' error message running the 01_intro notebook in the fastbook repo after purchasing the new O'Reilly book.

Here’s the line that is causing the error:

dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test')

Here’s the stack trace:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-15-abc40112d96e> in <module>
----> 1 dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test')

~/fastai2/fastai2/text/data.py in from_folder(cls, path, train, valid, valid_pct, seed, vocab, text_vocab, is_lm, tok_tfm, seq_len, backwards, **kwargs)
    229                            splitter=splitter,
    230                            get_y=None if is_lm else parent_label)
--> 231         return cls.from_dblock(dblock, path, path=path, seq_len=seq_len, **kwargs)
    232 
    233     @classmethod

~/fastai2/fastai2/data/core.py in from_dblock(cls, dblock, source, path, bs, val_bs, shuffle_train, device, **kwargs)
    165     @classmethod
    166     def from_dblock(cls, dblock, source, path='.',  bs=64, val_bs=None, shuffle_train=True, device=None, **kwargs):
--> 167         return dblock.dataloaders(source, path=path, bs=bs, val_bs=val_bs, shuffle_train=shuffle_train, device=device, **kwargs)
    168 
    169     _docs=dict(__getitem__="Retrieve `DataLoader` at `i` (`0` is training, `1` is validation)",

~/fastai2/fastai2/data/block.py in dataloaders(self, source, path, verbose, **kwargs)
    107         dsets = self.datasets(source)
    108         kwargs = {**self.dls_kwargs, **kwargs, 'verbose': verbose}
--> 109         return dsets.dataloaders(path=path, after_item=self.item_tfms, after_batch=self.batch_tfms, **kwargs)
    110 
    111     _docs = dict(new="Create a new `DataBlock` with other `item_tfms` and `batch_tfms`",

~/fastai2/fastai2/data/core.py in dataloaders(self, bs, val_bs, shuffle_train, n, path, dl_type, dl_kwargs, device, **kwargs)
    201                      **merge(kwargs, dl_kwargs[0]))
    202         dls = [dl] + [dl.new(self.subset(i), bs=(bs if val_bs is None else val_bs), shuffle=False, drop_last=False,
--> 203                              n=None, **dl_kwargs[i]) for i in range(1, self.n_subsets)]
    204         return self._dbunch_type(*dls, path=path, device=device)
    205 

~/fastai2/fastai2/data/core.py in <listcomp>(.0)
    201                      **merge(kwargs, dl_kwargs[0]))
    202         dls = [dl] + [dl.new(self.subset(i), bs=(bs if val_bs is None else val_bs), shuffle=False, drop_last=False,
--> 203                              n=None, **dl_kwargs[i]) for i in range(1, self.n_subsets)]
    204         return self._dbunch_type(*dls, path=path, device=device)
    205 

~/fastai2/fastai2/text/data.py in new(self, dataset, **kwargs)
    184     @delegates(TfmdDL.new)
    185     def new(self, dataset=None, **kwargs):
--> 186         if kwargs['val_res'] is not None: res = kwargs['val_res']
    187         else: res = self.res if dataset is None else None
    188         return super().new(dataset=dataset, res=res, **kwargs)

KeyError: 'val_res'

I'm using the latest fastai2 version from the git repository. Does anyone have any workarounds?

You can just remove the following part for the time being (it is line 186 in fastai2/text/data.py):

--> 186 if kwargs['val_res'] is not None: res = kwargs['val_res']
    187 else:
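
Concretely, a temporary local patch to SortedDL.new in fastai2/text/data.py could look like this (a sketch only, not the upstream fix; kwargs.pop means a missing 'val_res' key no longer raises):

    # temporary local workaround (sketch, not the official fix): tolerate a
    # missing 'val_res' key and fall back to the previous behaviour
    @delegates(TfmdDL.new)
    def new(self, dataset=None, **kwargs):
        res = kwargs.pop('val_res', None)
        if res is None: res = self.res if dataset is None else None
        return super().new(dataset=dataset, res=res, **kwargs)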

Sorry I introduced that bug here

Waiting on this issue to be addressed before I can push a fix

val_res fix submitted here now: https://github.com/fastai/fastai2/pull/435

Well done, thanks.

Should be pushed now