Invalid literal for int() with base 10: 'd'

I would like to redo the notebook from lesson 4 with my own data. So far I have managed to create and train the language model.

I create the splits:

TWITTER_LABELS = data.Field(sequential=False, use_vocab=False)
splits = TwitterDataset.splits(TEXT, TWITTER_LABELS, PATH_TWITTER, train='trn', test='val')

and the new model:

md2 = TextData.from_splits(PATH_TWITTER, splits, bs, text_name="SentimentText\n", label_name="Sentiment")
m3 = md2.get_model(opt_fn, 1500, bptt, emb_sz=em_sz, n_hid=nh, n_layers=nl, 
       dropout=0.1, dropouti=0.4, wdrop=0.5, dropoute=0.05, dropouth=0.3)

m3.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
m3.load_encoder(f'adam3_20_enc')

set the new parameters and freeze up to the last layer:

m3.clip=25.
lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])
m3.freeze_to(-1)

But when I now run the fit method, this error appears:

m3.fit(lrs/2, 1, metrics=[accuracy])

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-46-5a8d207cc5de> in <module>()
----> 1 m3.fit(lrs/2, 1, metrics=[accuracy])

~/Dokumente/Projekte/fastai/courses/dl1/fastai/learner.py in fit(self, lrs, n_cycle, wds, **kwargs)
    302         self.sched = None
    303         layer_opt = self.get_layer_opt(lrs, wds)
--> 304         return self.fit_gen(self.model, self.data, layer_opt, n_cycle, **kwargs)
    305 
    306     def warm_up(self, lr, wds=None):

~/Dokumente/Projekte/fastai/courses/dl1/fastai/learner.py in fit_gen(self, model, data, layer_opt, n_cycle, cycle_len, cycle_mult, cycle_save_name, best_save_name, use_clr, use_clr_beta, metrics, callbacks, use_wd_sched, norm_wds, wds_sched_mult, use_swa, swa_start, swa_eval_freq, **kwargs)
    249             metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, fp16=self.fp16,
    250             swa_model=self.swa_model if use_swa else None, swa_start=swa_start,
--> 251             swa_eval_freq=swa_eval_freq, **kwargs)
    252 
    253     def get_layer_groups(self): return self.models.get_layer_groups()

~/Dokumente/Projekte/fastai/courses/dl1/fastai/model.py in fit(model, data, n_epochs, opt, crit, metrics, callbacks, stepper, swa_model, swa_start, swa_eval_freq, visualize, **kwargs)
    136         if all_val: val_iter = IterBatch(cur_data.val_dl)
    137 
--> 138         for (*x,y) in t:
    139             batch_num += 1
    140             for cb in callbacks: cb.on_batch_begin()

~/anaconda3/envs/fastai/lib/python3.6/site-packages/tqdm/_tqdm.py in __iter__(self)
    929 """, fp_write=getattr(self.fp, 'write', sys.stderr.write))
    930 
--> 931             for obj in iterable:
    932                 yield obj
    933                 # Update and possibly print the progressbar.

~/Dokumente/Projekte/fastai/courses/dl1/fastai/nlp.py in __iter__(self)
    323         it = iter(self.src)
    324         for i in range(len(self)):
--> 325             b = next(it)
    326             yield getattr(b, self.x_fld).data, getattr(b, self.y_fld).data
    327 

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torchtext/data/iterator.py in __iter__(self)
    149                         minibatch.sort(key=self.sort_key, reverse=True)
    150                 yield Batch(minibatch, self.dataset, self.device,
--> 151                             self.train)
    152             if not self.repeat:
    153                 return

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torchtext/data/batch.py in __init__(self, data, dataset, device, train)
     25                 if field is not None:
     26                     batch = [getattr(x, name) for x in data]
---> 27                     setattr(self, name, field.process(batch, device=device, train=train))
     28 
     29     @classmethod

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torchtext/data/field.py in process(self, batch, device, train)
    186         """
    187         padded = self.pad(batch)
--> 188         tensor = self.numericalize(padded, device=device, train=train)
    189         return tensor
    190 

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torchtext/data/field.py in numericalize(self, arr, device, train)
    304             if not self.sequential:
    305                 arr = [numericalization_func(x) if isinstance(x, six.string_types)
--> 306                        else x for x in arr]
    307             if self.postprocessing is not None:
    308                 arr = self.postprocessing(arr, None, train)

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torchtext/data/field.py in <listcomp>(.0)
    304             if not self.sequential:
    305                 arr = [numericalization_func(x) if isinstance(x, six.string_types)
--> 306                        else x for x in arr]
    307             if self.postprocessing is not None:
    308                 arr = self.postprocessing(arr, None, train)

ValueError: invalid literal for int() with base 10: 'd'

So I found the source of the error. I had a bug in the Dataset Class. I have cleaned this up and now get a new error when running the fit method of m3:

Apparently the label field expects an int value, but the method is receiving the sentiment text instead.

---------------------------------------------------------------------------

ValueError Traceback (most recent call last)
in ()
----> 1 m3.fit(lrs/2, 1, metrics=[accuracy])

~/Dokumente/Projekte/fastai/courses/dl1/fastai/learner.py in fit(self, lrs, n_cycle, wds, **kwargs)
302 self.sched = None
303 layer_opt = self.get_layer_opt(lrs, wds)
–> 304 return self.fit_gen(self.model, self.data, layer_opt, n_cycle, **kwargs)
305
306 def warm_up(self, lr, wds=None):

~/Dokumente/Projekte/fastai/courses/dl1/fastai/learner.py in fit_gen(self, model, data, layer_opt, n_cycle, cycle_len, cycle_mult, cycle_save_name, best_save_name, use_clr, use_clr_beta, metrics, callbacks, use_wd_sched, norm_wds, wds_sched_mult, use_swa, swa_start, swa_eval_freq, **kwargs)
249 metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, fp16=self.fp16,
250 swa_model=self.swa_model if use_swa else None, swa_start=swa_start,
–> 251 swa_eval_freq=swa_eval_freq, **kwargs)
252
253 def get_layer_groups(self): return self.models.get_layer_groups()

~/Dokumente/Projekte/fastai/courses/dl1/fastai/model.py in fit(model, data, n_epochs, opt, crit, metrics, callbacks, stepper, swa_model, swa_start, swa_eval_freq, visualize, **kwargs)
136 if all_val: val_iter = IterBatch(cur_data.val_dl)
137
–> 138 for (*x,y) in t:
139 batch_num += 1
140 for cb in callbacks: cb.on_batch_begin()

~/anaconda3/envs/fastai/lib/python3.6/site-packages/tqdm/_tqdm.py in iter(self)
929 “”", fp_write=getattr(self.fp, ‘write’, sys.stderr.write))
930
–> 931 for obj in iterable:
932 yield obj
933 # Update and possibly print the progressbar.

~/Dokumente/Projekte/fastai/courses/dl1/fastai/nlp.py in iter(self)
323 it = iter(self.src)
324 for i in range(len(self)):
–> 325 b = next(it)
326 yield getattr(b, self.x_fld).data, getattr(b, self.y_fld).data
327

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torchtext/data/iterator.py in iter(self)
149 minibatch.sort(key=self.sort_key, reverse=True)
150 yield Batch(minibatch, self.dataset, self.device,
–> 151 self.train)
152 if not self.repeat:
153 return

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torchtext/data/batch.py in init(self, data, dataset, device, train)
25 if field is not None:
26 batch = [getattr(x, name) for x in data]
—> 27 setattr(self, name, field.process(batch, device=device, train=train))
28
29 @classmethod

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torchtext/data/field.py in process(self, batch, device, train)
186 “”"
187 padded = self.pad(batch)
–> 188 tensor = self.numericalize(padded, device=device, train=train)
189 return tensor
190

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torchtext/data/field.py in numericalize(self, arr, device, train)
304 if not self.sequential:
305 arr = [numericalization_func(x) if isinstance(x, six.string_types)
–> 306 else x for x in arr]
307 if self.postprocessing is not None:
308 arr = self.postprocessing(arr, None, train)

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torchtext/data/field.py in (.0)
304 if not self.sequential:
305 arr = [numericalization_func(x) if isinstance(x, six.string_types)
–> 306 else x for x in arr]
307 if self.postprocessing is not None:
308 arr = self.postprocessing(arr, None, train)

ValueError: invalid literal for int() with base 10: “@ddlovato @selenagomez I wish I could see you guys at the princess protection program premeire tomorrow but I can’t go Oh wel…HAVE FUN!”

Problem solved:

class TwitterDataset(torchtext.data.Dataset):
    """Binary-sentiment tweet dataset.

    Expects one tweet per ``*.txt`` file, laid out as ``<path>/1/*.txt``
    (positive) and ``<path>/0/*.txt`` (negative); the directory name doubles
    as the class label.
    """

    def __init__(self, path, text_field, label_field, **kwargs):
        # Use the fields passed in rather than the module-level globals
        # TEXT / TWITTER_LABELS, so the class works with any Field objects.
        # NOTE(review): the text field name carries a literal '\n' to match
        # text_name="SentimentText\n" used with TextData.from_splits above —
        # confirm this is intentional and strip it in both places if not.
        datafields = [('Sentiment', label_field), ('SentimentText\n', text_field)]
        examples = []
        for label in ['1', '0']:
            fnames = glob(os.path.join(path, label, '*.txt'))
            assert fnames, f"no *.txt files found under {path}/{label}"
            for fname in fnames:
                # Each file holds a single tweet on its first line.
                with open(fname, 'r') as f:
                    text = f.readline()
                # Value order must match `datafields`: label first, then text.
                examples.append(data.Example.fromlist([label, text], datafields))
        super().__init__(examples, datafields, **kwargs)

    @staticmethod
    def sort_key(ex):
        # Sort by tweet length. The attribute name must match the field name
        # declared in __init__ — the Example has no `text` attribute, so the
        # original `len(ex.text)` would raise AttributeError.
        return len(getattr(ex, 'SentimentText\n'))

    @classmethod
    def splits(cls, text_field, label_field, root='.data',
               train='train', test='test', **kwargs):
        """Create train/test splits rooted at *root* (no validation set)."""
        return super().splits(
            root, text_field=text_field, label_field=label_field,
            train=train, validation=None, test=test, **kwargs)

In the original code this line is:

examples.append(data.Example.fromlist([text, label], datafields))     

I had to change this. Now it works!