I am having issues getting a text model up and running in fastai v2. I am working with protein sequences, so I need to define a custom tokenizer.
from fastai2.basics import *
from fastai2.text.all import *
BOS,EOS,FLD,UNK,PAD = 'xxbos','xxeos','xxfld','xxunk','xxpad'
TK_MAJ,TK_UP,TK_REP,TK_WREP = 'xxmaj','xxup','xxrep','xxwrep'
defaults.text_spec_tok = [PAD]
class MolTokenizer(BaseTokenizer):
    def __init__(self, split_char=' '):
        self.split_char = split_char
    def __call__(self, items):
        # character-level tokens wrapped in GO/END markers
        return (['GO'] + list(t.upper()) + ['END'] for t in items)
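As a quick sanity check, calling the tokenizer directly on a made-up fragment ('manyt' is just an illustrative string, not real data) gives what I expect:

tok = MolTokenizer()
list(tok(['manyt']))
# [['GO', 'M', 'A', 'N', 'Y', 'T', 'END']]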
I then prepare the data as follows:
bs = 64
corpus_train = pd.read_csv('./processed/train.csv.gzip',index_col=None, compression='gzip')
corpus_valid = pd.read_csv('./processed/valid.csv.gzip',index_col=None, compression='gzip')
corpus_train['is_valid'] = False
corpus_valid['is_valid'] = True
corpus = corpus_train.append(corpus_valid, ignore_index=True)
path = './test/'
df_tok, count = tokenize_df(corpus, 'sequence', rules=[], tok_func=MolTokenizer)
dls_lm = TextDataLoaders.from_df(df_tok, path=path, text_vocab=make_vocab(count,min_freq=1), text_col='text', is_lm=True, valid_col='is_valid')
Everything checks out for my vocab:
dls_lm.train_ds.vocab
['xxpad', 'L', 'A', 'G', 'V', 'E', 'S', 'I', 'K', 'R', 'D', 'T', 'P', 'N', 'Q', 'F', 'Y', 'M', 'H', 'C', 'W', 'GO', 'END', 'xxfake']
and df_tok looks good:
df_tok.text.head(2)
0 [GO, M, A, N, Y, T, A, A, D, I, K, A, L, R, E, R, T, G, A, G, M, M, D, V, K, K, A, L, D, E, A, N, G, D, A, E, K, A, I, E, I, I, R, I, K, G, L, K, G, A, T, K, R, E, G, R, S, T, A, E, G, L, V, A, A, K, V, N, G, G, V, G, V, M, I, E, V, N, C, E, T, D, F, V, A, K, A, D, K, F, I, Q, L, A, D, K, V, L, N, V, ...]
1 [GO, M, P, K, S, R, R, A, V, S, L, S, V, L, I, G, A, V, I, A, A, L, A, G, A, L, I, A, V, T, V, P, A, R, P, N, R, P, E, A, D, R, E, A, L, W, K, I, V, H, D, R, C, E, F, G, Y, R, R, T, G, A, Y, A, P, C, T, F, V, D, E, Q, S, G, T, A, L, Y, K, A, D, F, D, P, Y, Q, F, L, L, I, P, L, A, R, I, T, G, I, E, D, ...]
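To try to isolate the problem, I also numericalized one tokenized row by hand with the same vocab (as far as I understand, this mirrors what the DataLoaders do internally):

num = Numericalize(vocab=make_vocab(count, min_freq=1))
num(df_tok.text[0][:10])
# given the vocab above, [GO, M, A, N, Y, T, A, A, D, I] should map to
# TensorText([21, 17,  2, 13, 16, 11,  2,  2, 10,  7])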
However, something goes wrong, either during numericalization or when the batches are generated:
xx, yy = dls_lm.one_batch()
xx[:5]
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
device='cuda:1')
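The remaining rows look identical; checking the unique values over the whole batch confirms it is all zeros:

xx.unique(), yy.unique()
# (tensor([0], device='cuda:1'), tensor([0], device='cuda:1'))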
All my data gets numericalized to the zero token, which maps to 'xxpad' in my vocab. Does anyone see what I am doing wrong?