I see 0.18 was released in April, and there have been a bunch of commits since then; maybe pulling the latest version might help? (Or maybe there’s a reason you can’t upgrade…?)
Hmmm. I am using the pip package. Will try to upgrade directly from the repo.
Thanks.
By the way, nice work with fasthugs. Congrats. I am now trying to explore RoBERT (BERT fine-tuning + an LSTM over long-document segments) and later convert it to fastai. I’ll keep you posted if I make progress.
Just for information: I upgraded and am still receiving the message “AttributeError: 'L' object has no attribute 'truncate'”.
The issue may be that Numericalize’s decodes changed. See here:
# Original
def decodes(self, o): return L(self.vocab[o_] for o_ in o if self.vocab[o_] != PAD)
# Now
def decodes(self, o): return L(self.vocab[o_] for o_ in o if self.vocab[o_] != self.pad_tok)
But that’s it as far as major differences between the versions in the related text files go (pad_tok now defaults to None).
Thanks, looking forward!
Hopefully self.pad_tok does the trick!
@muellerzr and @morgan, I found the issue.
Numericalize’s decoded output has changed to an L, while show_batch stayed the same, still expecting a TitledStr.
In fastai2 0.0.8, each s in samples in the show_batch function is a tuple with these characteristics:
print(type(s), len(s), type(s[0]), type(s[1]))
<class 'tuple'> 2 <class 'fastai2.torch_core.TitledStr'> <class 'fastai2.data.transforms.Category'>
In fastai2 0.0.17 (and also 0.0.18):
print(type(s), len(s), type(s[0]), type(s[1]))
<class 'tuple'> 2 <class 'fastcore.foundation.L'> <class 'fastai2.data.transforms.Category'>
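In other words, the decoded sample is now a plain L of tokens. Here is a minimal sketch of the mismatch (my assumption being that TitledStr.truncate keeps the first n space-separated words, which matches the fastai2 source as far as I can tell):

from fastcore.foundation import L
from fastai2.torch_core import TitledStr

toks = L(['xxbos', 'this', 'movie', 'was', 'great'])
# toks.truncate(3)                            # AttributeError: 'L' object has no attribute 'truncate'
print(TitledStr(' '.join(toks)).truncate(3))  # -> 'xxbos this movie'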
So truncate does not work, because it is now called on an L rather than a TitledStr. I changed show_batch and now it works:
From (not working):
@typedispatch
def show_batch(x: TensorText, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):
    if ctxs is None: ctxs = get_empty_df(min(len(samples), max_n))
    samples = L((s[0].truncate(trunc_at),*s[1:]) for s in samples)
    ctxs = show_batch[object](x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)
    display_df(pd.DataFrame(ctxs))
    return ctxs
To (working):
@typedispatch
def show_batch(x: TensorText, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):
    if ctxs is None: ctxs = get_empty_df(min(len(samples), max_n))
    samples = L((TitledStr(" ".join(s[0])).truncate(trunc_at),*s[1:]) for s in samples)
    ctxs = show_batch[object](x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)
    display_df(pd.DataFrame(ctxs))
    return ctxs
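Note: since show_batch is registered via @typedispatch, re-running this fixed definition in your own notebook (with the fastai2 text imports in scope) should be enough to override the registered TensorText version at runtime, so no library edit is needed; at least that’s my understanding of how typedispatch resolution works.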
Numericalize in 0.0.8:
class Numericalize(Transform):
    "Reversible transform of tokenized texts to numericalized ids"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, sep=' '):
        self.vocab,self.min_freq,self.max_vocab,self.sep = vocab,min_freq,max_vocab,sep
        self.o2i = None if vocab is None else defaultdict(int, {v:k for k,v in enumerate(vocab)})
    def setups(self, dsets):
        if dsets is None: return
        if self.vocab is None:
            count = dsets.counter if hasattr(dsets, 'counter') else Counter(p for o in dsets for p in o)
            self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab)
            self.o2i = defaultdict(int, {v:k for k,v in enumerate(self.vocab) if v != 'xxfake'})
    def encodes(self, o): return TensorText(tensor([self.o2i[o_] for o_ in o]))
    def decodes(self, o): return TitledStr(self.sep.join([self.vocab[o_] for o_ in o if self.vocab[o_] != PAD]))
Numericalize in 0.0.18:
class Numericalize(Transform):
    "Reversible transform of tokenized texts to numericalized ids"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, special_toks=None, pad_tok=None):
        store_attr(self, 'vocab,min_freq,max_vocab,special_toks,pad_tok')
        self.o2i = None if vocab is None else defaultdict(int, {v:k for k,v in enumerate(vocab)})
    def setups(self, dsets):
        if dsets is None: return
        if self.vocab is None:
            count = dsets.counter if getattr(dsets, 'counter', None) is not None else Counter(p for o in dsets for p in o)
            if self.special_toks is None and hasattr(dsets, 'special_toks'):
                self.special_toks = dsets.special_toks
            self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab, special_toks=self.special_toks)
            self.o2i = defaultdict(int, {v:k for k,v in enumerate(self.vocab) if v != 'xxfake'})
    def encodes(self, o): return TensorText(tensor([self.o2i[o_] for o_ in o]))
    def decodes(self, o): return L(self.vocab[o_] for o_ in o if self.vocab[o_] != self.pad_tok)
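A toy illustration of the behavioral change (hedged; these are made-up values, not fastai code):

vocab = ['xxpad', 'hello', 'world']
ids   = [1, 2, 0]
# 0.0.8:  decodes(ids) -> TitledStr('hello world')        # PAD filtered out, tokens joined with sep
# 0.0.18: decodes(ids) -> L(['hello', 'world', 'xxpad'])  # a token list; with the default
#                                                         # pad_tok=None, nothing is filtered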
Nice one! Worth a PR for show_batch?
I think so, but I’m not used to GitHub and the PR process (my bad)!
Yeah, I’m still shaky at it too; this is my bible every time: Submitting PR to fastai V2
The rules don’t seem to be what’s causing the issue. I’m struggling to create a fake tokenizer that consistently deals both with text tokenized via tokenize_df and with text tokenized via external libraries, etc. I have a notebook which shows my attempts (I’m probably missing something obvious here); roughly what I’m after is sketched below. I’m not sure how common this use case will be; I only started trying because I saw the noop suggestion mentioned in the docs.
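Something along these lines (a minimal sketch under my assumptions: fastai2 tokenizers are callables that take a batch of items and return a generator of token lists, and my items are already token lists; the class name is made up):

class FakeTokenizer:
    "Hypothetical pass-through tokenizer for pre-tokenized text"
    def __call__(self, items):
        # each item is assumed to already be a list of tokens; just pass it through
        return (list(o) for o in items)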
Currently stuck with KeyError: 'new_zeros' when trying to export my multi-category text classifier to ONNX. Any ideas?
x = learn_ent
orig_bs = x.dls[0].bs              # remember the original batch size
x.dls[0].bs = 1                    # temporarily switch to batch size 1
dummy_inp = next(iter(x.dls[0]))   # grab one batch to use as the dummy input
x.dls[0].bs = orig_bs              # restore the original batch size
torch.onnx.export(x.model, dummy_inp[:-1], 'text_classifier_v3.onnx')  # drop the target, export
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-39-60ac0a786093> in <module>
7 dummy_inp = next(iter(x.dls[0]))
8 x.dls[0].bs = orig_bs
----> 9 torch.onnx.export(x.model, dummy_inp[:-1], 'text_classifier_v3.onnx')
~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/__init__.py in export(model, args, f, export_params, verbose, training, input_names, output_names, aten, export_raw_ir, operator_export_type, opset_version, _retain_param_name, do_constant_folding, example_outputs, strip_doc_string, dynamic_axes, keep_initializers_as_inputs)
146 operator_export_type, opset_version, _retain_param_name,
147 do_constant_folding, example_outputs,
--> 148 strip_doc_string, dynamic_axes, keep_initializers_as_inputs)
149
150
~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/utils.py in export(model, args, f, export_params, verbose, training, input_names, output_names, aten, export_raw_ir, operator_export_type, opset_version, _retain_param_name, do_constant_folding, example_outputs, strip_doc_string, dynamic_axes, keep_initializers_as_inputs)
64 _retain_param_name=_retain_param_name, do_constant_folding=do_constant_folding,
65 example_outputs=example_outputs, strip_doc_string=strip_doc_string,
---> 66 dynamic_axes=dynamic_axes, keep_initializers_as_inputs=keep_initializers_as_inputs)
67
68
~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/utils.py in _export(model, args, f, export_params, verbose, training, input_names, output_names, operator_export_type, export_type, example_outputs, propagate, opset_version, _retain_param_name, do_constant_folding, strip_doc_string, dynamic_axes, keep_initializers_as_inputs, fixed_batch_size)
414 example_outputs, propagate,
415 _retain_param_name, do_constant_folding,
--> 416 fixed_batch_size=fixed_batch_size)
417
418 # TODO: Don't allocate a in-memory string for the protobuf
~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/utils.py in _model_to_graph(model, args, verbose, training, input_names, output_names, operator_export_type, example_outputs, propagate, _retain_param_name, do_constant_folding, _disable_torch_constant_prop, fixed_batch_size)
294 graph = _optimize_graph(graph, operator_export_type,
295 _disable_torch_constant_prop=_disable_torch_constant_prop,
--> 296 fixed_batch_size=fixed_batch_size, params_dict=params_dict)
297
298 if isinstance(model, torch.jit.ScriptModule) or isinstance(model, torch.jit.ScriptFunction):
~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/utils.py in _optimize_graph(graph, operator_export_type, _disable_torch_constant_prop, fixed_batch_size, params_dict)
133 torch._C._jit_pass_erase_number_types(graph)
134
--> 135 graph = torch._C._jit_pass_onnx(graph, operator_export_type)
136 torch._C._jit_pass_lint(graph)
137
~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/__init__.py in _run_symbolic_function(*args, **kwargs)
177 def _run_symbolic_function(*args, **kwargs):
178 from torch.onnx import utils
--> 179 return utils._run_symbolic_function(*args, **kwargs)
180
181
~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/utils.py in _run_symbolic_function(g, n, inputs, env, operator_export_type)
654 "torch.onnx.symbolic_opset{}.{} does not exist"
655 .format(op_name, opset_version, op_name))
--> 656 op_fn = sym_registry.get_registered_op(op_name, '', opset_version)
657 return op_fn(g, *inputs, **attrs)
658
~/anaconda3/envs/fastai2_lm/lib/python3.7/site-packages/torch/onnx/symbolic_registry.py in get_registered_op(opname, domain, version)
89 warnings.warn("ONNX export failed. The ONNX domain and/or version are None.")
90 global _registry
---> 91 return _registry[(domain, version)][opname]
KeyError: 'new_zeros'
Hello, does anyone know what LM_Dataset and LM_Sampler are in the notebook train_wt2.py?
IIRC you cannot export ULMFiT or the unet to ONNX out of the box because of their forward functions. I saw this issue with fastinference. (There is no solution currently.)
Bummer. Any ideas on serving ULMFiT for large batches? TorchServe or something else?
Not really, besides stepping away from fastai per se (like I did with fastinference; it helped some, but not much).
Good to know. Going to give TorchServe another go using their custom handler. I’ll let you know if it works.
I got a KeyError for 'val_res', and I guess it comes from the new commit. As indicated in the updated text.data, the following code needs to be implemented in my own notebook:
# Pass the training dataset text lengths to SortedDL
srtd_dl = partial(SortedDL, res=train_text_lens)
# Pass the validation dataset text lengths
dl_kwargs = [{}, {'val_res': val_text_lens}]
# Init our Datasets
dsets = Datasets(…)
# Init our DataLoaders
dls = dsets.dataloaders(…, dl_type=srtd_dl, dl_kwargs=dl_kwargs)
My question is: how do I get train_text_lens and val_text_lens? Use _get_lengths? And why not integrate this into the SortedDL implementation?
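(My hedged guess at computing them manually, assuming tok_df is the DataFrame returned by tokenize_df with a 'text' column holding token lists, and train_idxs/val_idxs are your own split indices; all three names are illustrative:

train_text_lens = [len(toks) for toks in tok_df.loc[train_idxs, 'text']]
val_text_lens   = [len(toks) for toks in tok_df.loc[val_idxs,   'text']]

But I’d still like to know whether _get_lengths is the intended route.)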
I’m also getting that KeyError: 'val_res' error message running the 01_intro notebook in the fastbook repo, after purchasing the new O’Reilly book.
Here’s the line that is causing the error:
dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test')
Here’s the stack trace:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-15-abc40112d96e> in <module>
----> 1 dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test')
~/fastai2/fastai2/text/data.py in from_folder(cls, path, train, valid, valid_pct, seed, vocab, text_vocab, is_lm, tok_tfm, seq_len, backwards, **kwargs)
229 splitter=splitter,
230 get_y=None if is_lm else parent_label)
--> 231 return cls.from_dblock(dblock, path, path=path, seq_len=seq_len, **kwargs)
232
233 @classmethod
~/fastai2/fastai2/data/core.py in from_dblock(cls, dblock, source, path, bs, val_bs, shuffle_train, device, **kwargs)
165 @classmethod
166 def from_dblock(cls, dblock, source, path='.', bs=64, val_bs=None, shuffle_train=True, device=None, **kwargs):
--> 167 return dblock.dataloaders(source, path=path, bs=bs, val_bs=val_bs, shuffle_train=shuffle_train, device=device, **kwargs)
168
169 _docs=dict(__getitem__="Retrieve `DataLoader` at `i` (`0` is training, `1` is validation)",
~/fastai2/fastai2/data/block.py in dataloaders(self, source, path, verbose, **kwargs)
107 dsets = self.datasets(source)
108 kwargs = {**self.dls_kwargs, **kwargs, 'verbose': verbose}
--> 109 return dsets.dataloaders(path=path, after_item=self.item_tfms, after_batch=self.batch_tfms, **kwargs)
110
111 _docs = dict(new="Create a new `DataBlock` with other `item_tfms` and `batch_tfms`",
~/fastai2/fastai2/data/core.py in dataloaders(self, bs, val_bs, shuffle_train, n, path, dl_type, dl_kwargs, device, **kwargs)
201 **merge(kwargs, dl_kwargs[0]))
202 dls = [dl] + [dl.new(self.subset(i), bs=(bs if val_bs is None else val_bs), shuffle=False, drop_last=False,
--> 203 n=None, **dl_kwargs[i]) for i in range(1, self.n_subsets)]
204 return self._dbunch_type(*dls, path=path, device=device)
205
~/fastai2/fastai2/data/core.py in <listcomp>(.0)
201 **merge(kwargs, dl_kwargs[0]))
202 dls = [dl] + [dl.new(self.subset(i), bs=(bs if val_bs is None else val_bs), shuffle=False, drop_last=False,
--> 203 n=None, **dl_kwargs[i]) for i in range(1, self.n_subsets)]
204 return self._dbunch_type(*dls, path=path, device=device)
205
~/fastai2/fastai2/text/data.py in new(self, dataset, **kwargs)
184 @delegates(TfmdDL.new)
185 def new(self, dataset=None, **kwargs):
--> 186 if kwargs['val_res'] is not None: res = kwargs['val_res']
187 else: res = self.res if dataset is None else None
188 return super().new(dataset=dataset, res=res, **kwargs)
KeyError: 'val_res'
I’m using the latest fastai2 version from the git repository. Does anyone have any workarounds?
You can just remove the val_res lookup for the time being, i.e. change

--> 186 if kwargs['val_res'] is not None: res = kwargs['val_res']
    187 else: res = self.res if dataset is None else None

so that only the line-187 assignment (res = self.res if dataset is None else None) remains.
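If you’d rather not delete code, a hedged alternative is to make the key optional with kwargs.pop (a sketch patterned on the SortedDL.new shown in the trace above):

@delegates(TfmdDL.new)
def new(self, dataset=None, **kwargs):
    val_res = kwargs.pop('val_res', None)  # missing key -> None instead of KeyError
    res = val_res if val_res is not None else (self.res if dataset is None else None)
    return super().new(dataset=dataset, res=res, **kwargs)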