Custom Tokenizer not behaving as expected

I built a custom tokenizer that works as expected when called directly …

tok = Tokenizer(tokenizer=doc_tok, rules=[])
print(tok('hi there, how is everyoen. cool!'))
# [101, 7632, 2045, 1010, 2129, 2003, 2296, 8913, 2078, 1012, 102, 101, 4658, 999, 102]

Trying to use it as follows:

a_tok_tfblock = TransformBlock(type_tfms=Tokenizer(tokenizer=a_tok, rules=[]), 
                         dls_kwargs={ 'before_batch': pad_input_chunk })

b_tok_tfblock = TransformBlock(type_tfms=Tokenizer(tokenizer=b_tok, rules=[]), 
                         dls_kwargs={ 'before_batch': pad_input_chunk })

dblocks = DataBlock(blocks=(a_tok_tfblock, b_tok_tfblock),
                    get_items=get_files, get_x=get_doc_a, get_y=get_doc_b, 
                    splitter = RandomSplitter())

dls = dblocks.dataloaders(DATA_PATH, bs=4, path=PATH)

All seems to work well except that I don’t see 4 items per batch. Instead, I see a list of tensors each with a shape of (4,) …

b = dls.one_batch()
len(b), len(b[0]), len(b[1])
# (2, 423, 35) ... BUT I was expecting 2, 4, 4
b
# ([tensor([101, 101, 101, 101], device='cuda:1'),
#  tensor([2624, 2009, 1006, 2414], device='cuda:1'),
#  tensor([ 5277,  2003, 13229,  1010], device='cuda:1'),
# ...

Can anyone tell me what the heck is going on here and where I went wrong?

A tokenizer is supposed to return a list of tokens for fastai, not a list of ints. I’d say that’s where your problem comes from.
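
(That would also explain the transposed batches: fa_collate treats a plain Python list as a Sequence and zips it across the samples, giving one (bs,)-shaped tensor per token position instead of a stacked (bs, seq_len) tensor. A minimal illustration, assuming fastai2's fa_collate and sample values like yours:)

import torch
from fastai2.data.load import fa_collate

samples = [[101, 7632, 102], [101, 4658, 102]]   # two samples as plain lists of ids

fa_collate(samples)
# [tensor([101, 101]), tensor([7632, 4658]), tensor([102, 102])]   <- zipped per position

fa_collate([torch.tensor(s) for s in samples])
# tensor([[ 101, 7632,  102],
#         [ 101, 4658,  102]])                                     <- stacked as expected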

What if my “tokenizer” does both?

I’ve tried putting my code in a custom Transform with an “encodes” method, and I’m getting the same kind of results.

I’ll try moving things around and doing the tokenization separately from the numericalization, if that is the best and/or only option.
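
Something along these lines is what I have in mind. MyStrTok and the vocab below are just placeholders for illustration, not my real tokenizer; the idea is that the tokenizer yields lists of token strings and Numericalize handles the conversion to ids:

from fastai2.text.all import Tokenizer, Numericalize, TransformBlock, pad_input_chunk

class MyStrTok:
    "Placeholder: yields lists of token *strings*, which is what Tokenizer expects"
    def __call__(self, items):
        for txt in items: yield txt.split()

vocab = None  # or pass an explicit vocab so Numericalize doesn't build one during setup
a_tok_tfblock = TransformBlock(
    type_tfms=[Tokenizer(tokenizer=MyStrTok(), rules=[]), Numericalize(vocab)],
    dls_kwargs={'before_batch': pad_input_chunk})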

SOLVED:

But I’m confused as to why.

Anyhow, I tried creating a custom pad_input function with this signature:

def pad_input_bertabs(samples, block_size=512, pad_token_id=0, sep_token_id=102, is_summary:bool=False)

… and the method wouldn’t even get called (no errors, no nada). Changed it to …

def pad_input_bertabs(samples, block_size=100, pad_token_id=999, sep_token_id=999, is_summary=False)

… and all worked perfectly. The difference that mattered was the last argument: dropping the :bool annotation. Really weird: fastai sees the type annotation and, rather than throwing an exception, silently skips the function as if it weren’t even there.
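
For completeness, here is roughly the shape of the padding function I mean. The body below is only a sketch of the idea (right-pad everything to block_size so collation can stack it), not my exact code, and note that the signature carries no type annotations:

import torch

def pad_input_bertabs(samples, block_size=512, pad_token_id=0, sep_token_id=102, is_summary=False):
    "Sketch: right-pad every tensor in each sample to block_size so fa_collate can stack them"
    # sep_token_id and is_summary are unused in this simplified sketch
    def _pad(t):
        if t.shape[0] >= block_size: return t[:block_size]
        return torch.cat([t, t.new_full((block_size - t.shape[0],), pad_token_id)])
    return [tuple(_pad(t) for t in s) for s in samples]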


So I’m a little bit closer …

I thought having dls_kwargs={ 'before_batch': pad_input_chunk } would ensure that my tensors were all the same size, but apparently it doesn’t even get called before default_collate.

How does the .text package get around this?

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-51-8ce4435141fc> in <module>
----> 1 b = dls.one_batch()

~/development/_training/ml/nlp-playground/_libs/fastai2/fastai2/data/load.py in one_batch(self)
    129     def one_batch(self):
    130         if self.n is not None and len(self)==0: raise ValueError(f'This DataLoader does not contain any batches')
--> 131         with self.fake_l.no_multiproc(): res = first(self)
    132         if hasattr(self, 'it'): delattr(self, 'it')
    133         return res

~/development/_training/ml/nlp-playground/_libs/fastcore/fastcore/utils.py in first(x)
    174 def first(x):
    175     "First element of `x`, or None if missing"
--> 176     try: return next(iter(x))
    177     except StopIteration: return None
    178 

~/development/_training/ml/nlp-playground/_libs/fastai2/fastai2/data/load.py in __iter__(self)
     95         self.randomize()
     96         self.before_iter()
---> 97         for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
     98             if self.device is not None: b = to_device(b, self.device)
     99             yield self.after_batch(b)

~/anaconda3/envs/playground-nlp/lib/python3.7/site-packages/torch/utils/data/dataloader.py in __next__(self)
    343 
    344     def __next__(self):
--> 345         data = self._next_data()
    346         self._num_yielded += 1
    347         if self._dataset_kind == _DatasetKind.Iterable and \

~/anaconda3/envs/playground-nlp/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _next_data(self)
    383     def _next_data(self):
    384         index = self._next_index()  # may raise StopIteration
--> 385         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    386         if self._pin_memory:
    387             data = _utils.pin_memory.pin_memory(data)

~/anaconda3/envs/playground-nlp/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
     32                 raise StopIteration
     33         else:
---> 34             data = next(self.dataset_iter)
     35         return self.collate_fn(data)
     36 

~/development/_training/ml/nlp-playground/_libs/fastai2/fastai2/data/load.py in create_batches(self, samps)
    104         self.it = iter(self.dataset) if self.dataset is not None else None
    105         res = filter(lambda o:o is not None, map(self.do_item, samps))
--> 106         yield from map(self.do_batch, self.chunkify(res))
    107 
    108     def new(self, dataset=None, cls=None, **kwargs):

~/development/_training/ml/nlp-playground/_libs/fastai2/fastai2/data/load.py in do_batch(self, b)
    125     def create_item(self, s):  return next(self.it) if s is None else self.dataset[s]
    126     def create_batch(self, b): return (fa_collate,fa_convert)[self.prebatched](b)
--> 127     def do_batch(self, b): return self.retain(self.create_batch(self.before_batch(b)), b)
    128     def to(self, device): self.device = device
    129     def one_batch(self):

~/development/_training/ml/nlp-playground/_libs/fastai2/fastai2/data/load.py in create_batch(self, b)
    124     def retain(self, res, b):  return retain_types(res, b[0] if is_listy(b) else b)
    125     def create_item(self, s):  return next(self.it) if s is None else self.dataset[s]
--> 126     def create_batch(self, b): return (fa_collate,fa_convert)[self.prebatched](b)
    127     def do_batch(self, b): return self.retain(self.create_batch(self.before_batch(b)), b)
    128     def to(self, device): self.device = device

~/development/_training/ml/nlp-playground/_libs/fastai2/fastai2/data/load.py in fa_collate(t)
     44     b = t[0]
     45     return (default_collate(t) if isinstance(b, _collate_types)
---> 46             else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
     47             else default_collate(t))
     48 

~/development/_training/ml/nlp-playground/_libs/fastai2/fastai2/data/load.py in <listcomp>(.0)
     44     b = t[0]
     45     return (default_collate(t) if isinstance(b, _collate_types)
---> 46             else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
     47             else default_collate(t))
     48 

~/development/_training/ml/nlp-playground/_libs/fastai2/fastai2/data/load.py in fa_collate(t)
     43 def fa_collate(t):
     44     b = t[0]
---> 45     return (default_collate(t) if isinstance(b, _collate_types)
     46             else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
     47             else default_collate(t))

~/anaconda3/envs/playground-nlp/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py in default_collate(batch)
     53             storage = elem.storage()._new_shared(numel)
     54             out = elem.new(storage)
---> 55         return torch.stack(batch, 0, out=out)
     56     elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
     57             and elem_type.__name__ != 'string_':

RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 1338 and 1702 in dimension 1 at /opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/TH/generic/THTensor.cpp:612

Anything you pass to before_batch, after_item or after_batch is fed to Pipeline, which means it becomes a Transform, which means the type annotations are used, perhaps in an unintended way.
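
A minimal illustration of that mechanism, using an annotation on the first argument for clarity (how annotations on later arguments are treated may depend on the fastcore version):

from fastcore.transform import Transform

def shout(x:str): return x.upper()   # the str annotation restricts what this applies to

tfm = Transform(shout)               # a plain function becomes a Transform
tfm('hello')   # 'HELLO' -> applied, because the input is a str
tfm(42)        # 42      -> silently passed through: no encodes registered for int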
