How to correctly pad text in dual-text DataBlocks?

Hi everyone,

I’m having some issues defining a DataBlock with two text inputs. The problem seems to be caused by having two text blocks, because the same code on a “normal” DataBlock with a single text input works fine. The code looks as follows:

from fastai.text.all import *

#define a character-level tokenizer
class CharTokenizer():
    def __init__(self, lang='en', special_tokens=[PAD]):
        self.lang = lang
        self.special_tokens = special_tokens

    def __call__(self, seq):
        #prepend the BOS token and split each string into single characters
        return (['xxbos'] + list(s) for s in seq)

tok = Tokenizer(CharTokenizer, rules=[])
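
As a quick sanity check of the tokenizer itself (illustrative only, not part of the pipeline above), calling the raw CharTokenizer on a couple of strings yields one token per character, with 'xxbos' prepended:

#illustrative: inspect the raw character tokenizer output
char_tok = CharTokenizer()
print([toks for toks in char_tok(['ADFTGA', 'AAAAAG'])])
#[['xxbos', 'A', 'D', 'F', 'T', 'G', 'A'], ['xxbos', 'A', 'A', 'A', 'A', 'A', 'G']]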

#Define some sample data
df = pd.DataFrame({‘Textcol1’:[‘ADFTGA’]*10+[‘ADFRRTRRRRGA’]*10+[‘ADFRRTRRRRRRRGA’]*10,
‘Textcol2’:[‘AAAAAG’]*10+[‘ADFCCCCFTFFFFGA’]*10+[‘ADFRRTGADFFFF’]*10,
‘Categorycol’:[1]*10+[0]*20,
})

#shuffle the rows (df.sample returns a shuffled copy, which we assign back)
df = df.sample(len(df))

#define datablock
dblock = DataBlock(blocks    = (TextBlock(tok_tfm=tok),
                                TextBlock(tok_tfm=tok),
                                CategoryBlock),
                   getters   = [ColReader('Textcol1'),
                                ColReader('Textcol2'),
                                ColReader('Categorycol')],
                   n_inp     = 2,
                   splitter  = RandomSplitter())

dls = dblock.dataloaders(df, bs=16)

dls.show_batch()

This code results in the following stack trace:


RuntimeError Traceback (most recent call last)
<ipython-input-...> in <module>
----> 1 dls.show_batch()

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/core.py in show_batch(self, b, max_n, ctxs, show, unique, **kwargs)
98 old_get_idxs = self.get_idxs
99 self.get_idxs = lambda: Inf.zeros
--> 100 if b is None: b = self.one_batch()
101 if not show: return self._pre_show_batch(b, max_n=max_n)
102 show_batch(*self._pre_show_batch(b, max_n=max_n), ctxs=ctxs, max_n=max_n, **kwargs)

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/load.py in one_batch(self)
134 def one_batch(self):
135 if self.n is not None and len(self)==0: raise ValueError(f'This DataLoader does not contain any batches')
--> 136 with self.fake_l.no_multiproc(): res = first(self)
137 if hasattr(self, 'it'): delattr(self, 'it')
138 return res

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastcore/utils.py in first(x)
221 def first(x):
222 "First element of x, or None if missing"
--> 223 try: return next(iter(x))
224 except StopIteration: return None
225

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/load.py in __iter__(self)
100 self.before_iter()
101 self.__idxs=self.get_idxs() # called in context of main process (not workers/subprocesses)
--> 102 for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
103 if self.device is not None: b = to_device(b, self.device)
104 yield self.after_batch(b)

/opt/conda/envs/fastai/lib/python3.8/site-packages/torch/utils/data/dataloader.py in __next__(self)
361
362 def __next__(self):
--> 363 data = self._next_data()
364 self._num_yielded += 1
365 if self._dataset_kind == _DatasetKind.Iterable and \

/opt/conda/envs/fastai/lib/python3.8/site-packages/torch/utils/data/dataloader.py in _next_data(self)
401 def _next_data(self):
402 index = self._next_index() # may raise StopIteration
--> 403 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
404 if self._pin_memory:
405 data = _utils.pin_memory.pin_memory(data)

/opt/conda/envs/fastai/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
32 raise StopIteration
33 else:
---> 34 data = next(self.dataset_iter)
35 return self.collate_fn(data)
36

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/load.py in create_batches(self, samps)
109 self.it = iter(self.dataset) if self.dataset is not None else None
110 res = filter(lambda o:o is not None, map(self.do_item, samps))
--> 111 yield from map(self.do_batch, self.chunkify(res))
112
113 def new(self, dataset=None, cls=None, **kwargs):

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/load.py in do_batch(self, b)
130 def create_item(self, s): return next(self.it) if s is None else self.dataset[s]
131 def create_batch(self, b): return (fa_collate,fa_convert)[self.prebatched](b)
--> 132 def do_batch(self, b): return self.retain(self.create_batch(self.before_batch(b)), b)
133 def to(self, device): self.device = device
134 def one_batch(self):

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/load.py in create_batch(self, b)
129 def retain(self, res, b): return retain_types(res, b[0] if is_listy(b) else b)
130 def create_item(self, s): return next(self.it) if s is None else self.dataset[s]
--> 131 def create_batch(self, b): return (fa_collate,fa_convert)[self.prebatched](b)
132 def do_batch(self, b): return self.retain(self.create_batch(self.before_batch(b)), b)
133 def to(self, device): self.device = device

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/load.py in fa_collate(t)
46 b = t[0]
47 return (default_collate(t) if isinstance(b, _collate_types)
---> 48 else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
49 else default_collate(t))
50

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/load.py in <listcomp>(.0)
46 b = t[0]
47 return (default_collate(t) if isinstance(b, _collate_types)
---> 48 else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
49 else default_collate(t))
50

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/load.py in fa_collate(t)
45 "A replacement for PyTorch default_collate which maintains types and handles Sequences"
46 b = t[0]
---> 47 return (default_collate(t) if isinstance(b, _collate_types)
48 else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
49 else default_collate(t))

/opt/conda/envs/fastai/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py in default_collate(batch)
53 storage = elem.storage().new_shared(numel)
54 out = elem.new(storage)
---> 55 return torch.stack(batch, 0, out=out)
56 elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
57 and elem_type.__name__ != 'string_':

RuntimeError: stack expects each tensor to be equal size, but got [14] at entry 0 and [16] at entry 7

It looks like the padding is not being applied correctly, but I can’t quite see where that would need to be set up. Any help would be greatly appreciated!

Edit: the following is the output of dblock.summary(df):
Setting-up type transforms pipelines
Collecting items from Textcol1 Textcol2 Categorycol
8 ADFTGA AAAAAG 1
1 ADFTGA AAAAAG 1
25 ADFRRTRRRRRRRGA ADFRRTGADFFFF 0
19 ADFRRTRRRRGA ADFCCCCFTFFFFGA 0
10 ADFRRTRRRRGA ADFCCCCFTFFFFGA 0
5 ADFTGA AAAAAG 1
26 ADFRRTRRRRRRRGA ADFRRTGADFFFF 0
17 ADFRRTRRRRGA ADFCCCCFTFFFFGA 0
16 ADFRRTRRRRGA ADFCCCCFTFFFFGA 0
0 ADFTGA AAAAAG 1
22 ADFRRTRRRRRRRGA ADFRRTGADFFFF 0
7 ADFTGA AAAAAG 1
18 ADFRRTRRRRGA ADFCCCCFTFFFFGA 0
9 ADFTGA AAAAAG 1
12 ADFRRTRRRRGA ADFCCCCFTFFFFGA 0
6 ADFTGA AAAAAG 1
4 ADFTGA AAAAAG 1
28 ADFRRTRRRRRRRGA ADFRRTGADFFFF 0
29 ADFRRTRRRRRRRGA ADFRRTGADFFFF 0
2 ADFTGA AAAAAG 1
23 ADFRRTRRRRRRRGA ADFRRTGADFFFF 0
21 ADFRRTRRRRRRRGA ADFRRTGADFFFF 0
13 ADFRRTRRRRGA ADFCCCCFTFFFFGA 0
15 ADFRRTRRRRGA ADFCCCCFTFFFFGA 0
24 ADFRRTRRRRRRRGA ADFRRTGADFFFF 0
27 ADFRRTRRRRRRRGA ADFRRTGADFFFF 0
20 ADFRRTRRRRRRRGA ADFRRTGADFFFF 0
14 ADFRRTRRRRGA ADFCCCCFTFFFFGA 0
3 ADFTGA AAAAAG 1
11 ADFRRTRRRRGA ADFCCCCFTFFFFGA 0
Found 30 items
2 datasets of sizes 24,6
Setting up Pipeline: ColReader -- {'cols': 'Textcol1', 'pref': '', 'suff': '', 'label_delim': None} -> Tokenizer -> Numericalize
Setting up Pipeline: ColReader -- {'cols': 'Textcol2', 'pref': '', 'suff': '', 'label_delim': None} -> Tokenizer -> Numericalize
Setting up Pipeline: ColReader -- {'cols': 'Categorycol', 'pref': '', 'suff': '', 'label_delim': None} -> Categorize -- {'vocab': None, 'sort': True, 'add_na': False}

Building one sample
Pipeline: ColReader -- {'cols': 'Textcol1', 'pref': '', 'suff': '', 'label_delim': None} -> Tokenizer -> Numericalize
starting from
Textcol1 ADFRRTRRRRRRRGA
Textcol2 ADFRRTGADFFFF
Categorycol 0
Name: 23, dtype: object
applying ColReader -- {'cols': 'Textcol1', 'pref': '', 'suff': '', 'label_delim': None} gives
ADFRRTRRRRRRRGA
applying Tokenizer gives
[xxbos, A, D, F, R, R, T, R, R, R, R, R, R, R, G, A]
applying Numericalize gives
TensorText([ 2, 10, 11, 12, 9, 9, 13, 9, 9, 9, 9, 9, 9, 9, 14, 10])
Pipeline: ColReader -- {'cols': 'Textcol2', 'pref': '', 'suff': '', 'label_delim': None} -> Tokenizer -> Numericalize
starting from
Textcol1 ADFRRTRRRRRRRGA
Textcol2 ADFRRTGADFFFF
Categorycol 0
Name: 23, dtype: object
applying ColReader -- {'cols': 'Textcol2', 'pref': '', 'suff': '', 'label_delim': None} gives
ADFRRTGADFFFF
applying Tokenizer gives
[xxbos, A, D, F, R, R, T, G, A, D, F, F, F, F]
applying Numericalize gives
TensorText([ 2, 10, 13, 9, 15, 15, 14, 12, 10, 13, 9, 9, 9, 9])
Pipeline: ColReader -- {'cols': 'Categorycol', 'pref': '', 'suff': '', 'label_delim': None} -> Categorize -- {'vocab': None, 'sort': True, 'add_na': False}
starting from
Textcol1 ADFRRTRRRRRRRGA
Textcol2 ADFRRTGADFFFF
Categorycol 0
Name: 23, dtype: object
applying ColReader -- {'cols': 'Categorycol', 'pref': '', 'suff': '', 'label_delim': None} gives
0
applying Categorize -- {'vocab': None, 'sort': True, 'add_na': False} gives
TensorCategory(0)

Final sample: (TensorText([ 2, 10, 11, 12, 9, 9, 13, 9, 9, 9, 9, 9, 9, 9, 14, 10]), TensorText([ 2, 10, 13, 9, 15, 15, 14, 12, 10, 13, 9, 9, 9, 9]), TensorCategory(0))

Setting up after_item: Pipeline: ToTensor
Setting up before_batch: Pipeline: partial
Setting up after_batch: Pipeline:

Building one batch
Applying item_tfms to the first sample:
Pipeline: ToTensor
starting from
(TensorText([ 2, 10, 11, 12, 9, 9, 13, 9, 9, 9, 9, 9, 9, 9, 14, 10]), TensorText([ 2, 10, 13, 9, 15, 15, 14, 12, 10, 13, 9, 9, 9, 9]), TensorCategory(0))
applying ToTensor gives
(TensorText([ 2, 10, 11, 12, 9, 9, 13, 9, 9, 9, 9, 9, 9, 9, 14, 10]), TensorText([ 2, 10, 13, 9, 15, 15, 14, 12, 10, 13, 9, 9, 9, 9]), TensorCategory(0))

Adding the next 3 samples

Applying before_batch to the list of samples
Pipeline: partial
starting from
[(TensorText([ 2, 10, 11, 12, 9, 9, 13, 9, 9, 9, 9, 9, 9, 9, 14, 10]), TensorText([ 2, 10, 13, 9, 15, 15, 14, 12, 10, 13, 9, 9, 9, 9]), TensorCategory(0)), (TensorText([ 2, 10, 11, 12, 13, 14, 10]), TensorText([ 2, 10, 10, 10, 10, 10, 12]), TensorCategory(1)), (TensorText([ 2, 10, 11, 12, 9, 9, 13, 9, 9, 9, 9, 9, 9, 9, 14, 10]), TensorText([ 2, 10, 13, 9, 15, 15, 14, 12, 10, 13, 9, 9, 9, 9]), TensorCategory(0)), (TensorText([ 2, 10, 11, 12, 13, 14, 10]), TensorText([ 2, 10, 10, 10, 10, 10, 12]), TensorCategory(1))]
applying partial gives
[(TensorText([ 2, 10, 11, 12, 9, 9, 13, 9, 9, 9, 9, 9, 9, 9, 14, 10]), TensorText([ 2, 10, 13, 9, 15, 15, 14, 12, 10, 13, 9, 9, 9, 9]), TensorCategory(0)), (TensorText([ 2, 10, 11, 12, 13, 14, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1]), TensorText([ 2, 10, 10, 10, 10, 10, 12]), TensorCategory(1)), (TensorText([ 2, 10, 11, 12, 9, 9, 13, 9, 9, 9, 9, 9, 9, 9, 14, 10]), TensorText([ 2, 10, 13, 9, 15, 15, 14, 12, 10, 13, 9, 9, 9, 9]), TensorCategory(0)), (TensorText([ 2, 10, 11, 12, 13, 14, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1]), TensorText([ 2, 10, 10, 10, 10, 10, 12]), TensorCategory(1))]

Collating items in a batch
Error! It's not possible to collate your items in a batch
Could not collate the 0-th members of your tuples because got the following shapes
torch.Size([16]),torch.Size([16]),torch.Size([16]),torch.Size([16])

Just for completeness: the corresponding single-text DataBlock works fine:

tok1 = Tokenizer(CharTokenizer, rules=[])

dblock = DataBlock(blocks    = (TextBlock(tok_tfm=tok1),
                                CategoryBlock),
                   getters   = [ColReader('Textcol1'),
                                ColReader('Categorycol')],
                   splitter  = RandomSplitter())

dls = dblock.dataloaders(df, bs=16)
dls.show_batch(max_n=15)
#returns the expected result

This commit fixes the issue: https://github.com/fastai/fastai/commit/1f4d74bc9488a13da2ca309a76091023c1931663

Upgrading to fastai==2.1.5 resolves the problem.
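
If upgrading is not an option: as far as I can tell, before that commit the default before_batch padding added by TextBlock only padded the first text field of each sample, which matches the summary output above (only the first TensorText gets 1s appended). A possible workaround, sketched below and untested, is to pass a custom before_batch that pads every TensorText field. The helper name pad_all_texts is mine, and I'm assuming a before_batch passed to dataloaders takes precedence over the TextBlock default:

from fastai.text.all import *

def pad_all_texts(samples, pad_idx=1):
    #sketch: right-pad every TensorText field in the batch to that field's max length
    #(fastai's default pads classification text at the front, I believe; this sketch simply pads at the end)
    max_lens = {}
    for s in samples:
        for i, x in enumerate(s):
            if isinstance(x, TensorText): max_lens[i] = max(max_lens.get(i, 0), len(x))
    def _pad(x, i):
        if not isinstance(x, TensorText) or len(x) == max_lens[i]: return x
        return torch.cat([x, x.new_full((max_lens[i] - len(x),), pad_idx)])
    return [tuple(_pad(x, i) for i, x in enumerate(s)) for s in samples]

dls = dblock.dataloaders(df, bs=16, before_batch=pad_all_texts)

With that in place every field of the batch has a uniform length, so the collate step should no longer raise the size-mismatch error above.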