Thanks so much, Jeremy and Rob. I'm having some issues with the AWS clip command, but here is the content of the notebook:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from fastai.learner import *
import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling
from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *
import dill as pickle
import pandas as pd
import numpy as np
Warning: no model found for 'en'
Only loading the 'en' tokenizer.
In [2]:
PATH='data/spooky-author-identification/'
%ls {PATH}
models/ test.csv tmp/ train.zip
sample_submission.csv testData.txt train.csv
sample_submission.zip test.zip trainData.txt
In [3]:
def save_data(df, file_train):
    trainData = ""
    for idx, row in df.iterrows():
        data = row['text']
        if trainData == "":
            trainData = data
        else:
            trainData = trainData + " " + data
    file_train.write(trainData)
    file_train.close()
    return trainData
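As an aside, concatenating in a loop re-copies the whole accumulated string on every row, so it scales quadratically with the number of rows; pandas can produce the same single-space-joined output in one call. A minimal sketch of an equivalent helper (`save_data_fast` is a hypothetical name, not part of the notebook):

def save_data_fast(df, file_train):
    # Join every row of the 'text' column with single spaces in one pass,
    # instead of re-copying a growing string on each iteration.
    text = " ".join(df['text'].astype(str))
    file_train.write(text)
    file_train.close()
    return text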
In [4]:
file_train = open(f'{PATH}trainData.txt', 'w')
In [5]:
df_train = pd.read_csv(f'{PATH}train.csv')
In [6]:
train_data= save_data(df_train,file_train)
In [7]:
df_test = pd.read_csv(f'{PATH}test.csv')
In [8]:
file_test = open(f'{PATH}testData.txt', 'w')
In [9]:
test_data= save_data(df_test,file_test)
In [ ]:
' '.join(spacy_tok(train_data))
In [11]:
TEXT = data.Field(lower=True, tokenize=spacy_tok)
In [12]:
TRN_PATH = 'trainData.txt'
VAL_PATH = 'testData.txt'
TRN = f'{PATH}trainData.txt'
VAL = f'{PATH}testData.txt'
In [13]:
VAL_PATH
Out[13]:
'testData.txt'
In [14]:
bs=2; bptt=70
In [15]:
FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
In [16]:
md = LanguageModelData(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=10)
In [17]:
pickle.dump(TEXT, open(f'{PATH}models/TEXT.pkl', 'wb'))
In [18]:
len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)
Out[18]:
(4250, 4930, 1, 595187)
In [19]:
TEXT.vocab.itos[:12]
Out[19]:
['<unk>', '<pad>', ',', 'the', 'of', '.', 'and', 'to', 'i', 'a', 'in', 'was']
In [20]:
# 'stoi': 'string to int'
TEXT.vocab.stoi['the']
Out[20]:
3
In [21]:
md.trn_ds[0].text[:12]
Out[21]:
['this',
 'process',
 ',',
 'however',
 ',',
 'afforded',
 'me',
 'no',
 'means',
 'of',
 'ascertaining',
 'the']
In [22]:
TEXT.numericalize([md.trn_ds[0].text[:12]])
Out[22]:
Variable containing:
31
3075
2
153
2
1385
27
42
304
4
0
3
[torch.cuda.LongTensor of size 12x1 (GPU 0)]
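Note the 0 in the eleventh row: with min_freq=10, tokens seen fewer than ten times are mapped to index 0, the '<unk>' special, which is presumably what happened to 'ascertaining'. A quick sanity check, assuming torchtext's stoi, which returns 0 for any out-of-vocabulary string:

TEXT.vocab.stoi['ascertaining']  # 0, i.e. '<unk>': the token fell below min_freq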
In [ ]:
next(iter(md.trn_dl))
In [24]:
em_sz = 200 # size of each embedding vector
nh = 50 # number of hidden activations per layer
nl = 3 # number of layers
In [25]:
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))
In [26]:
learner = md.get_model(opt_fn, em_sz, nh, nl,
               dropouti=0.05, dropout=0.05, wdrop=0.1, dropoute=0.02, dropouth=0.05)
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
learner.clip=0.3
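For reference, learner.clip=0.3 caps the gradient norm each optimizer step inside fastai's training loop (you can see clip=self.clip being forwarded in the traceback near the end of this post). A rough sketch of what that amounts to, assuming the PyTorch-0.3-era API and fastai's trainable_params helper:

import torch.nn as nn
# Roughly what fastai's Stepper does each step when learner.clip is set
# (a sketch, not fastai's exact code):
nn.utils.clip_grad_norm(trainable_params(learner.model), 0.3)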
In [ ]:
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=1)
In [ ]:
learner.save_encoder('adam1_enc')
In [ ]:
learner.load_encoder('adam1_enc')
In [ ]:
TEXT = pickle.load(open(f'{PATH}models/TEXT.pkl', 'rb'))
In [ ]:
IMDB_LABEL = data.Field(sequential=False)
In [ ]:
m=learner.model
ss=""". So, it wasn’t quite was I was expecting, but I really liked it anyway! The best"""
s = [spacy_tok(ss)]
t=TEXT.numericalize(s)
' '.join(s[0])
In [ ]:
# Set batch size to 1
m[0].bs=1
# Turn off dropout
m.eval()
# Reset hidden state
m.reset()
# Get predictions from model
res,*_ = m(t)
# Put the batch size back to what it was
m[0].bs=bs
In [ ]:
nexts = torch.topk(res[-1], 10)[1]
[TEXT.vocab.itos[o] for o in to_np(nexts)]
In [ ]:
print(ss,"\n")
for i in range(50):
    n=res[-1].topk(2)[1]
    n = n[1] if n.data[0]==0 else n[0]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    res,*_ = m(n[0].unsqueeze(0))
print('...')
In [22]:
TEXT = pickle.load(open(f'{PATH}models/TEXT.pkl', 'rb'))
In [ ]:
TEXT.vocab.itos
In [18]:
import spacy
spacy_en = spacy.load('en')
def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]
TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, fix_length=150)
LABEL = data.Field(sequential=False, use_vocab=False)
Warning: no model found for 'en'
Only loading the 'en' tokenizer.
In [19]:
train, val, test = data.TabularDataset.splits(
    PATH, train='train.csv',
    validation='test.csv', test='test.csv', format='csv',
    fields=[('Text', TEXT), ('Label', LABEL)])
In [27]:
splits=data.TabularDataset.splits(
    PATH, train='train.csv',
    validation='test.csv', test='test.csv', format='csv',
    fields=[('Text', TEXT)])
In [28]:
TEXT = pickle.load(open(f'{PATH}models/TEXT.pkl', 'rb'))
In [29]:
LABEL = data.Field(sequential=False, use_vocab=True)
In [30]:
splits = data.TabularDataset.splits(
    PATH, train='train.csv',
    validation='test.csv', test='test.csv', format='csv',
    fields=[('text', TEXT), ('label', LABEL)])
In [23]:
TEXT.init_token
Out[23]:
<bound method Field.build_vocab of <torchtext.data.field.Field object at 0x7f068117a438>>
In [24]:
trn_iter,val_iter = torchtext.data.BucketIterator.splits(splits, batch_size=bs)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input> in <module>()
----> 1 trn_iter,val_iter = torchtext.data.BucketIterator.splits(splits, batch_size=bs)

ValueError: too many values to unpack (expected 2)
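That ValueError is just an unpacking mismatch: data.TabularDataset.splits was given train, validation, and test names, so splits holds three datasets and BucketIterator.splits returns three iterators. A minimal sketch of the fix, unpacking all three:

# splits contains (train, validation, test), so unpack three iterators:
trn_iter, val_iter, tst_iter = torchtext.data.BucketIterator.splits(
    splits, batch_size=bs)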
In [31]:
train_iter, val_iter, test_iter = torchtext.data.Iterator.splits(
    (train, val, test), sort_key=lambda x: len(x.Text),
    batch_sizes=(32, 256, 256))
In [32]:
def from_splits(cls, path, splits, bs, text_name='text', label_name='label'):
    text_fld = splits[0].fields[text_name]
    print(text_fld)
    label_fld = splits[0].fields[label_name]
    print(label_fld)
    label_fld.build_vocab(splits[0])
    print(splits[0])
    #trn_iter,val_iter = torchtext.data.BucketIterator.splits(splits, batch_size=bs)
    trn_iter, val_iter, test_iter = torchtext.data.Iterator.splits(
        (train, val, test), sort_key=lambda x: len(x.Text), batch_sizes=(32, 256, 256))
    trn_dl = TextDataLoader(trn_iter, text_name, label_name)
    val_dl = TextDataLoader(val_iter, text_name, label_name)
    obj = TextData.from_dls(path, trn_dl, val_dl)
    obj.bs = bs
    obj.pad_idx = text_fld.vocab.stoi[text_fld.pad_token]
    obj.nt = len(text_fld.vocab)
    obj.c = len(label_fld.vocab)
    return obj
In [33]:
md2=from_splits(TEXT, PATH, splits, bs, text_name='text', label_name='label')
<torchtext.data.field.Field object at 0x7f06802f8b70>
<torchtext.data.field.Field object at 0x7f067f49c160>
<torchtext.data.dataset.TabularDataset object at 0x7f068392c748>
In [34]:
md2.c
Out[34]:
19581
In [35]:
md2.nt
Out[35]:
4930
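One thing worth flagging here: md2.c is the number of label classes, and this dataset has only three authors (EAP, HPL, MWS), so 19581 is a red flag. With format='csv', torchtext assigns the fields list to columns positionally, and train.csv starts with an id column, so ('text', TEXT) may be landing on id and ('label', LABEL) on the actual text, which would explain a 19581-entry label vocabulary. A quick hypothetical check:

# If the label vocab were right, this would show the three author codes plus
# torchtext's '<unk>'; thousands of entries means it was built over raw text.
print(len(LABEL.vocab), LABEL.vocab.itos[:5])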
In [84]:
??TextData.from_dls
In [72]:
??TextData.from_splits
In [36]:
md3 = md2.get_model(opt_fn, 1500, bptt, emb_sz=em_sz, n_hid=nh, n_layers=nl,
           dropout=0.1, dropouti=0.4, wdrop=0.5, dropoute=0.05, dropouth=0.3)
In [37]:
md3.summary
Out[37]:
<bound method Learner.summary of SequentialRNN (
  (0): MultiBatchRNN (
    (encoder): Embedding(4930, 200, padding_idx=1)
    (rnns): ModuleList (
      (0): WeightDrop (
        (module): LSTM(200, 50, dropout=0.3)
      )
      (1): WeightDrop (
        (module): LSTM(50, 50, dropout=0.3)
      )
      (2): WeightDrop (
        (module): LSTM(50, 200, dropout=0.3)
      )
    )
    (dropouti): LockedDropout (
    )
    (dropouth): LockedDropout (
    )
  )
  (1): PoolingLinearClassifier (
    (decoder): Linear (600 -> 19581)
    (dropout): LockedDropout (
    )
  )
)>
In [40]:
md3.fit(1e01, 1, metrics=[accuracy])
  0%|          | 0/611 [00:00<?, ?it/s]
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input> in <module>()
----> 1 md3.fit(1e01, 1, metrics=[accuracy])

~/fastai/courses/dl1/fastai/learner.py in fit(self, lrs, n_cycle, wds, **kwargs)
     97         self.sched = None
     98         layer_opt = self.get_layer_opt(lrs, wds)
---> 99         self.fit_gen(self.model, self.data, layer_opt, n_cycle, **kwargs)
    100
    101     def lr_find(self, start_lr=1e-5, end_lr=10, wds=None):

~/fastai/courses/dl1/fastai/learner.py in fit_gen(self, model, data, layer_opt, n_cycle, cycle_len, cycle_mult, cycle_save_name, metrics, callbacks, **kwargs)
     87         n_epoch = sum_geom(cycle_len if cycle_len else 1, cycle_mult, n_cycle)
     88         fit(model, data, n_epoch, layer_opt.opt, self.crit,
---> 89             metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, **kwargs)
     90
     91     def get_layer_groups(self): return self.models.get_layer_groups()

~/fastai/courses/dl1/fastai/model.py in fit(model, data, epochs, opt, crit, metrics, callbacks, **kwargs)
     80         stepper.reset(True)
     81         t = tqdm(iter(data.trn_dl), leave=False, total=len(data.trn_dl))
---> 82         for (*x,y) in t:
     83             batch_num += 1
     84             loss = stepper.step(V(x),V(y))

~/src/anaconda3/envs/fastai/lib/python3.6/site-packages/tqdm/_tqdm.py in __iter__(self)
    951                 """, fp_write=getattr(self.fp, 'write', sys.stderr.write))
    952
--> 953             for obj in iterable:
    954                 yield obj
    955                 # Update and possibly print the progressbar.

~/fastai/courses/dl1/fastai/dataset.py in __next__(self)
    219         if self.i>=len(self.dl): raise StopIteration
    220         self.i+=1
--> 221         return next(self.it)
    222
    223     @property

~/fastai/courses/dl1/fastai/nlp.py in __iter__(self)
    213         it = iter(self.src)
    214         for i in range(len(self)):
--> 215             b = next(it)
    216             yield getattr(b, self.x_fld), getattr(b, self.y_fld)
    217

~/src/anaconda3/envs/fastai/lib/python3.6/site-packages/torchtext/data/iterator.py in __iter__(self)
    176                     minibatch.sort(key=self.sort_key, reverse=True)
    177                 yield Batch(minibatch, self.dataset, self.device,
--> 178                             self.train)
    179             if not self.repeat:
    180                 raise StopIteration

~/src/anaconda3/envs/fastai/lib/python3.6/site-packages/torchtext/data/batch.py in __init__(self, data, dataset, device, train)
     20             if field is not None:
     21                 batch = [x.__dict__[name] for x in data]
---> 22                 setattr(self, name, field.process(batch, device=device, train=train))
     23
     24     @classmethod

~/src/anaconda3/envs/fastai/lib/python3.6/site-packages/torchtext/data/field.py in process(self, batch, device, train)
    182         """
    183         padded = self.pad(batch)
--> 184         tensor = self.numericalize(padded, device=device, train=train)
    185         return tensor
    186

~/src/anaconda3/envs/fastai/lib/python3.6/site-packages/torchtext/data/field.py in numericalize(self, arr, device, train)
    274         if self.use_vocab:
    275             if self.sequential:
--> 276                 arr = [[self.vocab.stoi[x] for x in ex] for ex in arr]
    277             else:
    278                 arr = [self.vocab.stoi[x] for x in arr]

~/src/anaconda3/envs/fastai/lib/python3.6/site-packages/torchtext/data/field.py in <listcomp>(.0)
    274         if self.use_vocab:
    275             if self.sequential:
--> 276                 arr = [[self.vocab.stoi[x] for x in ex] for ex in arr]
    277             else:
    278                 arr = [self.vocab.stoi[x] for x in arr]

~/src/anaconda3/envs/fastai/lib/python3.6/site-packages/torchtext/data/field.py in <listcomp>(.0)
    274         if self.use_vocab:
    275             if self.sequential:
--> 276                 arr = [[self.vocab.stoi[x] for x in ex] for ex in arr]
    277             else:
    278                 arr = [self.vocab.stoi[x] for x in arr]

AttributeError: 'Field' object has no attribute 'vocab'
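For what it's worth, that final AttributeError means one of the Fields attached to the datasets being iterated never had build_vocab called. from_splits builds a vocab for the label field of splits[0], but the Iterator inside it runs over the global train, val, test from cell In [19], and the TEXT field those were built with (cell In [18]) has no vocab. A minimal sketch of one way around it, assuming you meant to batch the splits from cell In [30], whose fields do have vocabs:

# Iterate over the datasets in `splits` (built with the pickled TEXT, which
# already has a vocab, and with LABEL, whose vocab from_splits builds) rather
# than the earlier train/val/test whose TEXT field has no vocab.
trn_iter, val_iter, test_iter = torchtext.data.Iterator.splits(
    tuple(splits), sort_key=lambda x: len(x.text),
    batch_sizes=(32, 256, 256))

Alternatively, call build_vocab on the TEXT field from cell In [18] (e.g. TEXT.build_vocab(train, min_freq=10)) before creating any iterator over those datasets.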