Lesson 3 In-Class Discussion ✅

ricknta · November 27, 2018, 10:59pm

Can someone tell me what’s wrong with this? I get

TypeError: must be str, not int

from

.label_from_df(cols='label')

I get the same error if I use (cols=3), which works in the IMDB notebook.

data = (TextList.from_csv(path, 'fake_or_real_news.csv', col='text')
                .random_split_by_pct(0.2)
                .label_from_df(cols='label')
                .databunch())

TypeError                                 Traceback (most recent call last)
/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/ops.py in na_op(x, y)
   1011         try:
-> 1012             result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs)
   1013         except TypeError:

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/computation/expressions.py in evaluate(op, op_str, a, b, use_numexpr, **eval_kwargs)
    204     if use_numexpr:
--> 205         return _evaluate(op, op_str, a, b, **eval_kwargs)
    206     return _evaluate_standard(op, op_str, a, b)

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/computation/expressions.py in _evaluate_numexpr(op, op_str, a, b, truediv, reversed, **eval_kwargs)
    119     if result is None:
--> 120         result = _evaluate_standard(op, op_str, a, b)
    121 

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/computation/expressions.py in _evaluate_standard(op, op_str, a, b, **eval_kwargs)
     64     with np.errstate(all='ignore'):
---> 65         return op(a, b)
     66 

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/ops.py in radd(left, right)
    112 def radd(left, right):
--> 113     return right + left
    114 

TypeError: must be str, not int

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/ops.py in safe_na_op(lvalues, rvalues)
   1032             with np.errstate(all='ignore'):
-> 1033                 return na_op(lvalues, rvalues)
   1034         except Exception:

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/ops.py in na_op(x, y)
   1022                 mask = notna(x)
-> 1023                 result[mask] = op(x[mask], y)
   1024 

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/ops.py in radd(left, right)
    112 def radd(left, right):
--> 113     return right + left
    114 

TypeError: must be str, not int

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-62-09c1b499b3ab> in <module>
      1 data = (TextList.from_csv(path, 'fake_or_real_news.csv', col='text')
      2                 .random_split_by_pct(0.2)
----> 3                 .label_from_df(cols='label')
      4                 .databunch())

/opt/conda/envs/fastai/lib/python3.6/site-packages/fastai/data_block.py in _inner(*args, **kwargs)
    316             self.valid = fv(*args, **kwargs)
    317             self.__class__ = LabelLists
--> 318             self.process()
    319             return self
    320         return _inner

/opt/conda/envs/fastai/lib/python3.6/site-packages/fastai/data_block.py in process(self)
    360     def process(self):
    361         xp,yp = self.get_processors()
--> 362         for i,ds in enumerate(self.lists): ds.process(xp, yp, filter_missing_y=i==0)
    363         return self
    364 

/opt/conda/envs/fastai/lib/python3.6/site-packages/fastai/data_block.py in process(self, xp, yp, filter_missing_y)
    428             filt = array([o is None for o in self.y])
    429             if filt.sum()>0: self.x,self.y = self.x[~filt],self.y[~filt]
--> 430         self.x.process(xp)
    431         return self
    432 

/opt/conda/envs/fastai/lib/python3.6/site-packages/fastai/data_block.py in process(self, processor)
     58         if processor is not None: self.processor = processor
     59         self.processor = listify(self.processor)
---> 60         for p in self.processor: p.process(self)
     61         return self
     62 

/opt/conda/envs/fastai/lib/python3.6/site-packages/fastai/text/data.py in process(self, ds)
    275     def process_one(self, item):  return self.tokenizer._process_all_1([item])[0]
    276     def process(self, ds):
--> 277         ds.items = _join_texts(ds.items, self.mark_fields)
    278         tokens = []
    279         for i in progress_bar(range(0,len(ds),self.chunksize), leave=False):

/opt/conda/envs/fastai/lib/python3.6/site-packages/fastai/text/data.py in _join_texts(texts, mark_fields)
    334     if is1d(texts): texts = texts[:,None]
    335     df = pd.DataFrame({i:texts[:,i] for i in range(texts.shape[1])})
--> 336     text_col = f'{BOS} {FLD} {1} ' + df[0] if mark_fields else  f'{BOS} ' + df[0]
    337     for i in range(1,len(df.columns)):
    338         text_col += (f' {FLD} {i+1} ' if mark_fields else ' ') + df[i]

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/ops.py in wrapper(left, right)
   1067             rvalues = rvalues.values
   1068 
-> 1069         result = safe_na_op(lvalues, rvalues)
   1070         return construct_result(left, result,
   1071                                 index=left.index, name=res_name, dtype=None)

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/ops.py in safe_na_op(lvalues, rvalues)
   1035             if is_object_dtype(lvalues):
   1036                 return libalgos.arrmap_object(lvalues,
-> 1037                                               lambda x: op(x, rvalues))
   1038             raise
   1039 

pandas/_libs/algos_common_helper.pxi in pandas._libs.algos.arrmap_object()

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/ops.py in <lambda>(x)
   1035             if is_object_dtype(lvalues):
   1036                 return libalgos.arrmap_object(lvalues,
-> 1037                                               lambda x: op(x, rvalues))
   1038             raise
   1039 

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/ops.py in radd(left, right)
    111 
    112 def radd(left, right):
--> 113     return right + left
    114 
    115 

TypeError: must be str, not int

sam2 · November 27, 2018, 11:41pm

@ricknta

Does your csv file have a first row that labels the columns as 'text' and 'label'? Please check.

I am guessing this because of the `if result is None:` line in the traceback.

ricknta · November 27, 2018, 11:59pm

Yes, first row has those labels.

ricknta · November 29, 2018, 2:21am

My data looks like this:

I don’t see why that would be a problem but am not sure.

I thought this could be a data problem of some sort since the exact same syntax works fine on the IMDB data, so I cleaned up a subset of the data (it looked like the csv file had some problems) but still get similar errors.

ricknta · November 29, 2018, 2:33am

With the cleaned-up data, I get the same error (TypeError: must be str, not int) but now apparently from the next line (.label_from_df):

data = (TextList.from_csv(path, 'fake_or_real_news_clean_4000-2.csv', col='text')
                .random_split_by_pct(0.2)
                .label_from_df(cols=3)
                .databunch())

TypeError                                 Traceback (most recent call last)
/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/ops.py in na_op(x, y)
   1011         try:
-> 1012             result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs)
   1013         except TypeError:

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/computation/expressions.py in evaluate(op, op_str, a, b, use_numexpr, **eval_kwargs)
    204     if use_numexpr:
--> 205         return _evaluate(op, op_str, a, b, **eval_kwargs)
    206     return _evaluate_standard(op, op_str, a, b)

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/computation/expressions.py in _evaluate_numexpr(op, op_str, a, b, truediv, reversed, **eval_kwargs)
    119     if result is None:
--> 120         result = _evaluate_standard(op, op_str, a, b)
    121 

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/computation/expressions.py in _evaluate_standard(op, op_str, a, b, **eval_kwargs)
     64     with np.errstate(all='ignore'):
---> 65         return op(a, b)
     66 

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/ops.py in radd(left, right)
    112 def radd(left, right):
--> 113     return right + left
    114 

TypeError: must be str, not int

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/ops.py in safe_na_op(lvalues, rvalues)
   1032             with np.errstate(all='ignore'):
-> 1033                 return na_op(lvalues, rvalues)
   1034         except Exception:

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/ops.py in na_op(x, y)
   1022                 mask = notna(x)
-> 1023                 result[mask] = op(x[mask], y)
   1024 

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/ops.py in radd(left, right)
    112 def radd(left, right):
--> 113     return right + left
    114 

TypeError: must be str, not int

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-36-5c6735438b12> in <module>
      1 data = (TextList.from_csv(path, 'fake_or_real_news_clean_4000-2.csv', col='text')
      2                 .random_split_by_pct(0.2)
----> 3                 .label_from_df(cols=3)
      4                 .databunch())

/opt/conda/envs/fastai/lib/python3.6/site-packages/fastai/data_block.py in _inner(*args, **kwargs)
    316             self.valid = fv(*args, **kwargs)
    317             self.__class__ = LabelLists
--> 318             self.process()
    319             return self
    320         return _inner

/opt/conda/envs/fastai/lib/python3.6/site-packages/fastai/data_block.py in process(self)
    360     def process(self):
    361         xp,yp = self.get_processors()
--> 362         for i,ds in enumerate(self.lists): ds.process(xp, yp, filter_missing_y=i==0)
    363         return self
    364 

/opt/conda/envs/fastai/lib/python3.6/site-packages/fastai/data_block.py in process(self, xp, yp, filter_missing_y)
    428             filt = array([o is None for o in self.y])
    429             if filt.sum()>0: self.x,self.y = self.x[~filt],self.y[~filt]
--> 430         self.x.process(xp)
    431         return self
    432 

/opt/conda/envs/fastai/lib/python3.6/site-packages/fastai/data_block.py in process(self, processor)
     58         if processor is not None: self.processor = processor
     59         self.processor = listify(self.processor)
---> 60         for p in self.processor: p.process(self)
     61         return self
     62 

/opt/conda/envs/fastai/lib/python3.6/site-packages/fastai/text/data.py in process(self, ds)
    275     def process_one(self, item):  return self.tokenizer._process_all_1([item])[0]
    276     def process(self, ds):
--> 277         ds.items = _join_texts(ds.items, self.mark_fields)
    278         tokens = []
    279         for i in progress_bar(range(0,len(ds),self.chunksize), leave=False):

/opt/conda/envs/fastai/lib/python3.6/site-packages/fastai/text/data.py in _join_texts(texts, mark_fields)
    334     if is1d(texts): texts = texts[:,None]
    335     df = pd.DataFrame({i:texts[:,i] for i in range(texts.shape[1])})
--> 336     text_col = f'{BOS} {FLD} {1} ' + df[0] if mark_fields else  f'{BOS} ' + df[0]
    337     for i in range(1,len(df.columns)):
    338         text_col += (f' {FLD} {i+1} ' if mark_fields else ' ') + df[i]

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/ops.py in wrapper(left, right)
   1067             rvalues = rvalues.values
   1068 
-> 1069         result = safe_na_op(lvalues, rvalues)
   1070         return construct_result(left, result,
   1071                                 index=left.index, name=res_name, dtype=None)

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/ops.py in safe_na_op(lvalues, rvalues)
   1035             if is_object_dtype(lvalues):
   1036                 return libalgos.arrmap_object(lvalues,
-> 1037                                               lambda x: op(x, rvalues))
   1038             raise
   1039 

pandas/_libs/algos_common_helper.pxi in pandas._libs.algos.arrmap_object()

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/ops.py in <lambda>(x)
   1035             if is_object_dtype(lvalues):
   1036                 return libalgos.arrmap_object(lvalues,
-> 1037                                               lambda x: op(x, rvalues))
   1038             raise
   1039 

/opt/conda/envs/fastai/lib/python3.6/site-packages/pandas/core/ops.py in radd(left, right)
    111 
    112 def radd(left, right):
--> 113     return right + left
    114 
    115 

TypeError: must be str, not int

ricknta · November 29, 2018, 2:37am

Also if I try .label_for_lm with the cleaned-up data:

data_lm = (TextList.from_csv(path, 'fake_or_real_news_clean_4000-2.csv', col='text')
            .random_split_by_pct(0.2)
           #We randomly split and keep 10% for validation
            .label_for_lm()
           #We want to do a language model so we label accordingly
            .databunch())
data_lm.save('tmp_lm')

Then I once again get the error at the previous line (.random_split_by_pct):

During handling of the above exception, another exception occurred:

<ipython-input-34-6e7bbfaafc36> in <module>
      1 data_lm = (TextList.from_csv(path, 'fake_or_real_news_clean_4000-2.csv', col='text')
----> 2             .random_split_by_pct(0.2)
      3            #We randomly split and keep 10% for validation
      4             .label_for_lm()
      5            #We want to do a language model so we label accordingly

ricknta · November 29, 2018, 2:39am

I don’t really know how to read the exception traceback, so I don’t understand what’s going on.

@lesscomfortable - Any chance you could take a look at this? Am I missing something obvious?

ricknta · November 29, 2018, 4:51am

Now I got this to work by changing the cleaned-up data file - removing 2 columns and moving the label col to pos 0:

but I have no idea why that fixed the problem! Help!

lesscomfortable · November 30, 2018, 8:01pm

Hey! Can you try df['Unnamed: 0']=df['Unnamed: 0'].astype(int) + 1 on your original table? This casts every item in this column to an integer (and then adds 1 to each value).

ricknta · December 1, 2018, 3:13am

Thanks! I don’t think that Unnamed col was a problem - at that point the problem with creating the databunch was fixed; that Unnamed col was just a leftover empty col from when I was manipulating the csv in Excel.

Now I’m working with the df and everything runs OK, but the TextLMDataBunch seems to be reading labels instead of the text:

df = pd.read_csv(path/'fake_or_real_news.csv', usecols=["label", "text"])[["label", "text"]]
df.head(10)

data_lm = (TextList.from_df(df, col='text')
                .random_split_by_pct(0.2)
                .label_for_lm()
                .databunch())
data_lm.save('tmp_lm')

data_lm = TextLMDataBunch.load(path, 'tmp_lm')

Why would it do that??

lesscomfortable · December 1, 2018, 4:17am

Can you try running this instead?

data_lm = (TextList.from_df(df, cols=1)
                .random_split_by_pct(0.2)
                .label_for_lm()
                .databunch())
data_lm.save('tmp_lm')

data_lm = TextLMDataBunch.load(path, 'tmp_lm')

ricknta · December 1, 2018, 4:56am

Thanks, I tried that (very hopeful!) but got almost the same result - slight differences but same basic pattern:

shyampagadi · December 1, 2018, 7:42am

Hi,
I am trying to create an ImageDataBunch from a CSV file and display sample images, but I am receiving the error below. Can someone please help?

data = ImageDataBunch.from_csv(path, csv_labels='train.csv',folder='train',tfms=tfms)
data.show_batch(rows=2, figsize=(9,7))

RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 465 and 498 in dimension 2 at /pytorch/aten/src/TH/generic/THTensorMoreMath.cpp:1325

lesscomfortable · December 3, 2018, 5:40pm

What about:

data_lm = (TextList.from_df(df, cols='text')
                .random_split_by_pct(0.2)
                .label_for_lm()
                .databunch())
data_lm.save('tmp_lm')

data_lm = TextLMDataBunch.load(path, 'tmp_lm')

prratek · December 3, 2018, 8:07pm

I’m having some trouble labeling my images using the data block API. Since it’s a multi-class classification problem where most classes have just one example, I want to duplicate images from the underrepresented classes. I now have a DataFrame with image names and labels, including duplicates. My images are in path/train. Here’s my code:

src = (ImageItemList.from_df(df=train_df, path=path, cols='Name', folder='train')
       .split_by_valid_func(lambda o: o in val_n)
       .label_from_df(cols='Id'))

However, label_from_df throws the following error:
IndexError: index 0 is out of bounds for axis 0 with size 0

Any thoughts? Here’s the full traceback:

gist.github.com

https://gist.github.com/prratek/deb8c0627afb1df590d57aa38c50e60d

datablock_label.py

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-65-a21b1f7faa1d> in <module>
      2 src = (ImageItemList.from_df(df=train_df, path=path, cols='Name', folder='train')
      3        .split_by_valid_func(lambda o: o in val_n)
----> 4        .label_from_df(cols='Id'))

/opt/anaconda3/lib/python3.6/site-packages/fastai/data_block.py in _inner(*args, **kwargs)
    346             self.train = ft(*args, **kwargs)
    347             assert isinstance(self.train, LabelList)

This file has been truncated. show original

ricknta · December 3, 2018, 11:39pm

Thanks, but that doesn’t work, either! Same basic pattern:

lesscomfortable · December 4, 2018, 1:05am

What happens if you run:

data_lm = (TextList.from_df(df, cols=[0,1])
                .random_split_by_pct(0.2)
                .label_for_lm()
                .databunch())
data_lm.save('tmp_lm')

data_lm = TextLMDataBunch.load(path, 'tmp_lm')

and

data_lm = (TextList.from_df(df, cols=['text','label'])
                .random_split_by_pct(0.2)
                .label_for_lm()
                .databunch())
data_lm.save('tmp_lm')

data_lm = TextLMDataBunch.load(path, 'tmp_lm')

ricknta · December 4, 2018, 1:40am

Yes! Both worked. So it looks like, for some reason, I have to specify both cols, maybe because the df has a few more cols I’m not using? Does this make sense? I want to make sure I learn from it! Thanks.

lesscomfortable · December 4, 2018, 1:52am

I don’t really know what’s going on. This might give us more info. Try:

data_lm = (TextList.from_df(df, cols=[0])
                .random_split_by_pct(0.2)
                .label_for_lm()
                .databunch())
data_lm.save('tmp_lm')

data_lm = TextLMDataBunch.load(path, 'tmp_lm')

and

data_lm = (TextList.from_df(df, cols=['label'])
                .random_split_by_pct(0.2)
                .label_for_lm()
                .databunch())
data_lm.save('tmp_lm')

data_lm = TextLMDataBunch.load(path, 'tmp_lm')

wyquek · December 4, 2018, 2:26am

I suspect you have an index column that took up column 0, hence pushing ['label'] to column 1 and ['text'] to column 2.

Maybe try removing the index?