Custom Tokenizer in DataLoader

I’ve been trying to create a simple custom tokenizer that can parse strings of genes so I can then feed that into a language model. It seems to work but then when I feed that into a dataloader it fails…

class GenomicTokenizer(BaseTokenizer):
    def __init__(self, split_char='', lang='en', ngram=5, stride=2):
        self.split_char=split_char
        self.lang = lang
        self.ngram = ngram
        self.stride = stride
        
    def tokenizer(self, t):
        t = t.upper()
        if self.ngram == 1:
            toks = list(t)
        else:
            toks = [t[i:i+self.ngram] for i in range(0, len(t), self.stride) if len(t[i:i+self.ngram]) == self.ngram]
        if len(toks[-1]) < self.ngram:
            toks = toks[:-1]
        
        return toks
    
    def __call__(self, items):
        print("items:"+str(items))
        for o in items: yield o  
    
    def add_special_cases(self, toks):
        pass

print(GenomicTokenizer().tokenizer("catgcattagttattaatagtgatgcntg"))
#returns ['CATGC', 'TGCAT', 'CATTA', 'TTAGT', 'AGTTA', 'TTATT', 'ATTAA', 'TAATA', 'ATAGT', 'AGTGA', 'TGATG', 'ATGCN', 'GCNTG']

dummy_df=pd.DataFrame([['catgcattagttattaatagtgatgcntg'], 
                    ['gctggatggtttgggacatgatggtttgggacatgatggtttgggacatg'], 
                    ['nnccgggctgtagctacacatacataca'], 
                    ['gcggagatgaagagccctac']], 
                   columns=['sequence'])
dummy_df['text'] = dummy_df['sequence']

lm_db = DataBlock(
#     blocks=TextBlock.from_df('text', is_lm=True, tok=SubwordTokenizer(vocab_sz=20)),
    blocks=TextBlock.from_df('text', is_lm=True, tok=GenomicTokenizer()),
    get_x=ColReader('text'),
    splitter=RandomSplitter(0.1)
)

lm_dl = lm_db.dataloaders(dummy_df, bs=2)
lm_dl.show_batch(max_n=2)

The dataloader call gives this output and error. I couldn’t find anywhere an explanation of what should be going into the call function so I’m a bit lost what I should do to resolve it.

items:<map object at 0x7f0a1b0241f0>
items:<map object at 0x7f0a1b024190>
items:['xxbos xxbos▁nncc▁xxrep▁3▁g▁ctgtagctacacatacataca']
items:['xxbos xxbos▁nncc▁xxrep▁3▁g▁ctgtagctacacatacataca']
items:['xxbos xxbos▁gctggatgg▁xxrep▁3▁t▁xxrep▁3▁g▁acatgatgg▁xxrep▁3▁t▁xxrep▁3▁g▁acatgatgg▁xxrep▁3▁t▁xxrep▁3▁g▁acatg']
items:['xxbos xxbos▁catgcattagttattaatagtgatgcntg']
items:['xxbos xxbos▁gcggagatgaagag▁xxrep▁3▁c▁tac']
items:['xxbos xxbos▁nncc▁xxrep▁3▁g▁ctgtagctacacatacataca']
items:['xxbos xxbos▁gctggatgg▁xxrep▁3▁t▁xxrep▁3▁g▁acatgatgg▁xxrep▁3▁t▁xxrep▁3▁g▁acatgatgg▁xxrep▁3▁t▁xxrep▁3▁g▁acatg']
items:['xxbos xxbos▁catgcattagttattaatagtgatgcntg']
items:['xxbos xxbos▁nncc▁xxrep▁3▁g▁ctgtagctacacatacataca']
Could not do one pass in your dataloader, there is something wrong in it
items:['xxbos xxbos▁gcggagatgaagag▁xxrep▁3▁c▁tac']
items:['xxbos xxbos▁catgcattagttattaatagtgatgcntg']
items:['xxbos xxbos▁gctggatgg▁xxrep▁3▁t▁xxrep▁3▁g▁acatgatgg▁xxrep▁3▁t▁xxrep▁3▁g▁acatgatgg▁xxrep▁3▁t▁xxrep▁3▁g▁acatg']
items:['xxbos xxbos▁gcggagatgaagag▁xxrep▁3▁c▁tac']
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-4-610183fa1154> in <module>
     40 
     41 lm_dl = lm_db.dataloaders(dummy_df, bs=2)
---> 42 lm_dl.show_batch(max_n=2)

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/core.py in show_batch(self, b, max_n, ctxs, show, unique, **kwargs)
     98             old_get_idxs = self.get_idxs
     99             self.get_idxs = lambda: Inf.zeros
--> 100         if b is None: b = self.one_batch()
    101         if not show: return self._pre_show_batch(b, max_n=max_n)
    102         show_batch(*self._pre_show_batch(b, max_n=max_n), ctxs=ctxs, max_n=max_n, **kwargs)

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/load.py in one_batch(self)
    135     def one_batch(self):
    136         if self.n is not None and len(self)==0: raise ValueError(f'This DataLoader does not contain any batches')
--> 137         with self.fake_l.no_multiproc(): res = first(self)
    138         if hasattr(self, 'it'): delattr(self, 'it')
    139         return res

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastcore/utils.py in first(x)
    194 def first(x):
    195     "First element of `x`, or None if missing"
--> 196     try: return next(iter(x))
    197     except StopIteration: return None
    198 

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/load.py in __iter__(self)
    101         self.randomize()
    102         self.before_iter()
--> 103         for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
    104             if self.device is not None: b = to_device(b, self.device)
    105             yield self.after_batch(b)

/opt/conda/envs/fastai/lib/python3.8/site-packages/torch/utils/data/dataloader.py in __next__(self)
    361 
    362     def __next__(self):
--> 363         data = self._next_data()
    364         self._num_yielded += 1
    365         if self._dataset_kind == _DatasetKind.Iterable and \

/opt/conda/envs/fastai/lib/python3.8/site-packages/torch/utils/data/dataloader.py in _next_data(self)
    401     def _next_data(self):
    402         index = self._next_index()  # may raise StopIteration
--> 403         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    404         if self._pin_memory:
    405             data = _utils.pin_memory.pin_memory(data)

/opt/conda/envs/fastai/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
     32                 raise StopIteration
     33         else:
---> 34             data = next(self.dataset_iter)
     35         return self.collate_fn(data)
     36 

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/load.py in create_batches(self, samps)
    110         self.it = iter(self.dataset) if self.dataset is not None else None
    111         res = filter(lambda o:o is not None, map(self.do_item, samps))
--> 112         yield from map(self.do_batch, self.chunkify(res))
    113 
    114     def new(self, dataset=None, cls=None, **kwargs):

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastcore/utils.py in chunked(it, chunk_sz, drop_last, n_chunks)
    351     if not isinstance(it, Iterator): it = iter(it)
    352     while True:
--> 353         res = list(itertools.islice(it, chunk_sz))
    354         if res and (len(res)==chunk_sz or not drop_last): yield res
    355         if len(res)<chunk_sz: return

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/load.py in do_item(self, s)
    123     def prebatched(self): return self.bs is None
    124     def do_item(self, s):
--> 125         try: return self.after_item(self.create_item(s))
    126         except SkipItemException: return None
    127     def chunkify(self, b): return b if self.prebatched else chunked(b, self.bs, self.drop_last)

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/text/data.py in create_item(self, seq)
     98         st = (seq%self.bs)*self.bl + (seq//self.bs)*self.seq_len
     99         txt = self.chunks[st : st+sl+1]
--> 100         return LMTensorText(txt[:-1]),txt[1:]
    101 
    102     @delegates(TfmdDL.new)

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/torch_core.py in __new__(cls, x, **kwargs)
    269 class TensorBase(Tensor):
    270     def __new__(cls, x, **kwargs):
--> 271         res = cast(tensor(x), cls)
    272         if kwargs: res._meta = kwargs
    273         return res

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/torch_core.py in tensor(x, *rest, **kwargs)
    122     # if isinstance(x, (tuple,list)) and len(x)==0: return tensor(0)
    123     res = (x if isinstance(x, Tensor)
--> 124            else torch.tensor(x, **kwargs) if isinstance(x, (tuple,list))
    125            else _array2tensor(x) if isinstance(x, ndarray)
    126            else as_tensor(x.values, **kwargs) if isinstance(x, (pd.Series, pd.DataFrame))

/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/torch_core.py in _f(self, *args, **kwargs)
    298         def _f(self, *args, **kwargs):
    299             cls = self.__class__
--> 300             res = getattr(super(TensorBase, self), fn)(*args, **kwargs)
    301             return retain_type(res, self, copy_meta=True)
    302         return _f

TypeError: only integer tensors of a single element can be converted to an index

I’m hoping this would be a simple problem for someone who knows what they are doing :smile:

1 Like