I’ve been trying to create a simple custom tokenizer that can parse strings of genes so I can then feed that into a language model. It seems to work but then when I feed that into a dataloader it fails…
class GenomicTokenizer(BaseTokenizer):
def __init__(self, split_char='', lang='en', ngram=5, stride=2):
self.split_char=split_char
self.lang = lang
self.ngram = ngram
self.stride = stride
def tokenizer(self, t):
t = t.upper()
if self.ngram == 1:
toks = list(t)
else:
toks = [t[i:i+self.ngram] for i in range(0, len(t), self.stride) if len(t[i:i+self.ngram]) == self.ngram]
if len(toks[-1]) < self.ngram:
toks = toks[:-1]
return toks
def __call__(self, items):
print("items:"+str(items))
for o in items: yield o
def add_special_cases(self, toks):
pass
print(GenomicTokenizer().tokenizer("catgcattagttattaatagtgatgcntg"))
#returns ['CATGC', 'TGCAT', 'CATTA', 'TTAGT', 'AGTTA', 'TTATT', 'ATTAA', 'TAATA', 'ATAGT', 'AGTGA', 'TGATG', 'ATGCN', 'GCNTG']
dummy_df=pd.DataFrame([['catgcattagttattaatagtgatgcntg'],
['gctggatggtttgggacatgatggtttgggacatgatggtttgggacatg'],
['nnccgggctgtagctacacatacataca'],
['gcggagatgaagagccctac']],
columns=['sequence'])
dummy_df['text'] = dummy_df['sequence']
lm_db = DataBlock(
# blocks=TextBlock.from_df('text', is_lm=True, tok=SubwordTokenizer(vocab_sz=20)),
blocks=TextBlock.from_df('text', is_lm=True, tok=GenomicTokenizer()),
get_x=ColReader('text'),
splitter=RandomSplitter(0.1)
)
lm_dl = lm_db.dataloaders(dummy_df, bs=2)
lm_dl.show_batch(max_n=2)
The dataloader call gives this output and error. I couldn’t find anywhere an explanation of what should be going into the call function so I’m a bit lost what I should do to resolve it.
items:<map object at 0x7f0a1b0241f0>
items:<map object at 0x7f0a1b024190>
items:['xxbos xxbos▁nncc▁xxrep▁3▁g▁ctgtagctacacatacataca']
items:['xxbos xxbos▁nncc▁xxrep▁3▁g▁ctgtagctacacatacataca']
items:['xxbos xxbos▁gctggatgg▁xxrep▁3▁t▁xxrep▁3▁g▁acatgatgg▁xxrep▁3▁t▁xxrep▁3▁g▁acatgatgg▁xxrep▁3▁t▁xxrep▁3▁g▁acatg']
items:['xxbos xxbos▁catgcattagttattaatagtgatgcntg']
items:['xxbos xxbos▁gcggagatgaagag▁xxrep▁3▁c▁tac']
items:['xxbos xxbos▁nncc▁xxrep▁3▁g▁ctgtagctacacatacataca']
items:['xxbos xxbos▁gctggatgg▁xxrep▁3▁t▁xxrep▁3▁g▁acatgatgg▁xxrep▁3▁t▁xxrep▁3▁g▁acatgatgg▁xxrep▁3▁t▁xxrep▁3▁g▁acatg']
items:['xxbos xxbos▁catgcattagttattaatagtgatgcntg']
items:['xxbos xxbos▁nncc▁xxrep▁3▁g▁ctgtagctacacatacataca']
Could not do one pass in your dataloader, there is something wrong in it
items:['xxbos xxbos▁gcggagatgaagag▁xxrep▁3▁c▁tac']
items:['xxbos xxbos▁catgcattagttattaatagtgatgcntg']
items:['xxbos xxbos▁gctggatgg▁xxrep▁3▁t▁xxrep▁3▁g▁acatgatgg▁xxrep▁3▁t▁xxrep▁3▁g▁acatgatgg▁xxrep▁3▁t▁xxrep▁3▁g▁acatg']
items:['xxbos xxbos▁gcggagatgaagag▁xxrep▁3▁c▁tac']
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-4-610183fa1154> in <module>
40
41 lm_dl = lm_db.dataloaders(dummy_df, bs=2)
---> 42 lm_dl.show_batch(max_n=2)
/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/core.py in show_batch(self, b, max_n, ctxs, show, unique, **kwargs)
98 old_get_idxs = self.get_idxs
99 self.get_idxs = lambda: Inf.zeros
--> 100 if b is None: b = self.one_batch()
101 if not show: return self._pre_show_batch(b, max_n=max_n)
102 show_batch(*self._pre_show_batch(b, max_n=max_n), ctxs=ctxs, max_n=max_n, **kwargs)
/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/load.py in one_batch(self)
135 def one_batch(self):
136 if self.n is not None and len(self)==0: raise ValueError(f'This DataLoader does not contain any batches')
--> 137 with self.fake_l.no_multiproc(): res = first(self)
138 if hasattr(self, 'it'): delattr(self, 'it')
139 return res
/opt/conda/envs/fastai/lib/python3.8/site-packages/fastcore/utils.py in first(x)
194 def first(x):
195 "First element of `x`, or None if missing"
--> 196 try: return next(iter(x))
197 except StopIteration: return None
198
/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/load.py in __iter__(self)
101 self.randomize()
102 self.before_iter()
--> 103 for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
104 if self.device is not None: b = to_device(b, self.device)
105 yield self.after_batch(b)
/opt/conda/envs/fastai/lib/python3.8/site-packages/torch/utils/data/dataloader.py in __next__(self)
361
362 def __next__(self):
--> 363 data = self._next_data()
364 self._num_yielded += 1
365 if self._dataset_kind == _DatasetKind.Iterable and \
/opt/conda/envs/fastai/lib/python3.8/site-packages/torch/utils/data/dataloader.py in _next_data(self)
401 def _next_data(self):
402 index = self._next_index() # may raise StopIteration
--> 403 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
404 if self._pin_memory:
405 data = _utils.pin_memory.pin_memory(data)
/opt/conda/envs/fastai/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
32 raise StopIteration
33 else:
---> 34 data = next(self.dataset_iter)
35 return self.collate_fn(data)
36
/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/load.py in create_batches(self, samps)
110 self.it = iter(self.dataset) if self.dataset is not None else None
111 res = filter(lambda o:o is not None, map(self.do_item, samps))
--> 112 yield from map(self.do_batch, self.chunkify(res))
113
114 def new(self, dataset=None, cls=None, **kwargs):
/opt/conda/envs/fastai/lib/python3.8/site-packages/fastcore/utils.py in chunked(it, chunk_sz, drop_last, n_chunks)
351 if not isinstance(it, Iterator): it = iter(it)
352 while True:
--> 353 res = list(itertools.islice(it, chunk_sz))
354 if res and (len(res)==chunk_sz or not drop_last): yield res
355 if len(res)<chunk_sz: return
/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/data/load.py in do_item(self, s)
123 def prebatched(self): return self.bs is None
124 def do_item(self, s):
--> 125 try: return self.after_item(self.create_item(s))
126 except SkipItemException: return None
127 def chunkify(self, b): return b if self.prebatched else chunked(b, self.bs, self.drop_last)
/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/text/data.py in create_item(self, seq)
98 st = (seq%self.bs)*self.bl + (seq//self.bs)*self.seq_len
99 txt = self.chunks[st : st+sl+1]
--> 100 return LMTensorText(txt[:-1]),txt[1:]
101
102 @delegates(TfmdDL.new)
/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/torch_core.py in __new__(cls, x, **kwargs)
269 class TensorBase(Tensor):
270 def __new__(cls, x, **kwargs):
--> 271 res = cast(tensor(x), cls)
272 if kwargs: res._meta = kwargs
273 return res
/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/torch_core.py in tensor(x, *rest, **kwargs)
122 # if isinstance(x, (tuple,list)) and len(x)==0: return tensor(0)
123 res = (x if isinstance(x, Tensor)
--> 124 else torch.tensor(x, **kwargs) if isinstance(x, (tuple,list))
125 else _array2tensor(x) if isinstance(x, ndarray)
126 else as_tensor(x.values, **kwargs) if isinstance(x, (pd.Series, pd.DataFrame))
/opt/conda/envs/fastai/lib/python3.8/site-packages/fastai/torch_core.py in _f(self, *args, **kwargs)
298 def _f(self, *args, **kwargs):
299 cls = self.__class__
--> 300 res = getattr(super(TensorBase, self), fn)(*args, **kwargs)
301 return retain_type(res, self, copy_meta=True)
302 return _f
TypeError: only integer tensors of a single element can be converted to an index
I’m hoping this would be a simple problem for someone who knows what they are doing