SOLVED:
But I’m confused as to why.
Anyhow, I tried creating a custom pad_input
function with this signature:
def pad_input_bertabs(samples, block_size=512, pad_token_id=0, sep_token_id=102, is_summary:bool=False)
… and the function wouldn’t even get called (no errors, nothing at all). Changed it to …
def pad_input_bertabs(samples, block_size=100, pad_token_id=999, sep_token_id=999, is_summary=False)
… and everything worked perfectly. The defaults changed too, but the difference that matters is the last argument: the type annotation (is_summary:bool=False vs. is_summary=False). Really weird, but nothing complains about the annotation and nothing throws an exception; the function just gets passed over, as if it weren’t there.
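If I’m reading fastcore right, this is type dispatch at work: TfmdDL wraps before_batch in a Pipeline, which turns plain functions into Transforms, and a Transform dispatches on the first annotated parameter it finds. The only annotation on my function was bool, so the wrapped transform only fires on bool inputs and silently passes everything else through untouched. A minimal sketch of that behavior (toy functions of mine, not the real padder):

from fastcore.transform import Transform

def anno(samples, is_summary: bool = False):
    # The only type hint here is `bool`, so the wrapped Transform
    # dispatches on bool and ignores every other input type.
    return [s * 2 for s in samples]

def no_anno(samples, is_summary=False):
    # No hints at all, so the Transform applies to anything.
    return [s * 2 for s in samples]

print(Transform(anno)([1, 2, 3]))     # [1, 2, 3]  <- silently skipped
print(Transform(no_anno)([1, 2, 3]))  # [2, 4, 6]  <- actually runs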
So I’m a little bit closer …
I thought having dls_kwargs={ 'before_batch': pad_input_chunk } would ensure that my tensors were all the same size going into collation, but apparently it never actually runs before default_collate. How does the .text package get around this?
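As far as I can tell, the answer is that the .text DataLoaders do their padding inside before_batch (that’s what pad_input / pad_input_chunk are there for), so by the time fa_collate hands the batch to default_collate, every tensor already has the same length. Roughly like this (my own stripped-down padder, not fastai’s implementation):

import torch

def pad_before_batch(samples, pad_token_id=0):
    # Grow every sequence to the longest one in the batch so that
    # torch.stack (inside default_collate) sees matching sizes.
    max_len = max(x.shape[0] for x, _ in samples)
    def pad(x):
        out = x.new_full((max_len,), pad_token_id)
        out[:x.shape[0]] = x
        return out
    return [(pad(x), y) for x, y in samples]

batch = [(torch.arange(1338), 0), (torch.arange(1702), 1)]
xs = torch.stack([x for x, _ in pad_before_batch(batch)])
print(xs.shape)  # torch.Size([2, 1702])

For reference, here’s the full traceback I get when the padding never runs: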
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-51-8ce4435141fc> in <module>
----> 1 b = dls.one_batch()
~/development/_training/ml/nlp-playground/_libs/fastai2/fastai2/data/load.py in one_batch(self)
129 def one_batch(self):
130 if self.n is not None and len(self)==0: raise ValueError(f'This DataLoader does not contain any batches')
--> 131 with self.fake_l.no_multiproc(): res = first(self)
132 if hasattr(self, 'it'): delattr(self, 'it')
133 return res
~/development/_training/ml/nlp-playground/_libs/fastcore/fastcore/utils.py in first(x)
174 def first(x):
175 "First element of `x`, or None if missing"
--> 176 try: return next(iter(x))
177 except StopIteration: return None
178
~/development/_training/ml/nlp-playground/_libs/fastai2/fastai2/data/load.py in __iter__(self)
95 self.randomize()
96 self.before_iter()
---> 97 for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
98 if self.device is not None: b = to_device(b, self.device)
99 yield self.after_batch(b)
~/anaconda3/envs/playground-nlp/lib/python3.7/site-packages/torch/utils/data/dataloader.py in __next__(self)
343
344 def __next__(self):
--> 345 data = self._next_data()
346 self._num_yielded += 1
347 if self._dataset_kind == _DatasetKind.Iterable and \
~/anaconda3/envs/playground-nlp/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _next_data(self)
383 def _next_data(self):
384 index = self._next_index() # may raise StopIteration
--> 385 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
386 if self._pin_memory:
387 data = _utils.pin_memory.pin_memory(data)
~/anaconda3/envs/playground-nlp/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
32 raise StopIteration
33 else:
---> 34 data = next(self.dataset_iter)
35 return self.collate_fn(data)
36
~/development/_training/ml/nlp-playground/_libs/fastai2/fastai2/data/load.py in create_batches(self, samps)
104 self.it = iter(self.dataset) if self.dataset is not None else None
105 res = filter(lambda o:o is not None, map(self.do_item, samps))
--> 106 yield from map(self.do_batch, self.chunkify(res))
107
108 def new(self, dataset=None, cls=None, **kwargs):
~/development/_training/ml/nlp-playground/_libs/fastai2/fastai2/data/load.py in do_batch(self, b)
125 def create_item(self, s): return next(self.it) if s is None else self.dataset[s]
126 def create_batch(self, b): return (fa_collate,fa_convert)[self.prebatched](b)
--> 127 def do_batch(self, b): return self.retain(self.create_batch(self.before_batch(b)), b)
128 def to(self, device): self.device = device
129 def one_batch(self):
~/development/_training/ml/nlp-playground/_libs/fastai2/fastai2/data/load.py in create_batch(self, b)
124 def retain(self, res, b): return retain_types(res, b[0] if is_listy(b) else b)
125 def create_item(self, s): return next(self.it) if s is None else self.dataset[s]
--> 126 def create_batch(self, b): return (fa_collate,fa_convert)[self.prebatched](b)
127 def do_batch(self, b): return self.retain(self.create_batch(self.before_batch(b)), b)
128 def to(self, device): self.device = device
~/development/_training/ml/nlp-playground/_libs/fastai2/fastai2/data/load.py in fa_collate(t)
44 b = t[0]
45 return (default_collate(t) if isinstance(b, _collate_types)
---> 46 else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
47 else default_collate(t))
48
~/development/_training/ml/nlp-playground/_libs/fastai2/fastai2/data/load.py in <listcomp>(.0)
44 b = t[0]
45 return (default_collate(t) if isinstance(b, _collate_types)
---> 46 else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
47 else default_collate(t))
48
~/development/_training/ml/nlp-playground/_libs/fastai2/fastai2/data/load.py in fa_collate(t)
43 def fa_collate(t):
44 b = t[0]
---> 45 return (default_collate(t) if isinstance(b, _collate_types)
46 else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
47 else default_collate(t))
~/anaconda3/envs/playground-nlp/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py in default_collate(batch)
53 storage = elem.storage()._new_shared(numel)
54 out = elem.new(storage)
---> 55 return torch.stack(batch, 0, out=out)
56 elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
57 and elem_type.__name__ != 'string_':
RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 1338 and 1702 in dimension 1 at /opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/TH/generic/THTensor.cpp:612
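For what it’s worth, the failure at the bottom of that stack reduces to torch.stack refusing ragged tensors, which is exactly what un-padded batches produce:

import torch

# Two sequences with the lengths from the error above; torch.stack
# needs every size to match outside dimension 0, so this raises the
# same RuntimeError.
torch.stack([torch.zeros(1338), torch.zeros(1702)], 0)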