AttributeError: 'Series' object has no attribute [X] when preparing DataBlock

PabloMC · December 13, 2020, 4:52pm

I am having trouble running some basic code. I have a DataFrame called papers with a column called abstract, and I am trying to create a DataBlock to load it into a model.
I prepare the data as a DataFrame (in a Kaggle notebook with the arXiv dataset) as follows:

import json

data_file = '../input/arxiv/arxiv-metadata-oai-snapshot.json'


def get_metadata(path=None):
    """Lazily yield the lines of the metadata snapshot file.

    The file is streamed line by line rather than read into memory at once.

    Args:
        path: File to read. When omitted, the module-level ``data_file``
            is used.

    Yields:
        Each line of the file as a ``str``, trailing newline included.
    """
    # Resolve the default at call time so a later reassignment of
    # data_file is still honoured.
    source = data_file if path is None else path
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(source, 'r', encoding='utf-8') as f:
        yield from f
            
# Parallel per-paper lists: the same index in each list refers to the same paper.
titles = []
abstracts = []
years = []
refs = []
categories = []

metadata = get_metadata()
for paper in metadata:
    paper_dict = json.loads(paper)
    ref = paper_dict.get('journal-ref')
    if ref is None:
        # No journal reference, so there is no year to extract.
        continue
    try:
        # The last four characters of the journal reference are read as the year.
        year = int(ref[-4:])
    except (TypeError, ValueError):
        # The reference does not end in a parseable integer: skip this paper.
        continue
    # Keep only papers whose extracted year is 2011-2020 inclusive.
    if 2010 < year < 2021:
        years.append(year)
        titles.append(paper_dict.get('title'))
        abstracts.append(paper_dict.get('abstract'))
        refs.append(ref)
        categories.append(paper_dict.get('categories'))

# Assemble the collected per-paper lists into a single table,
# pairing each column name with the list that holds its values.
papers = pd.DataFrame(
    dict(
        zip(
            ['title', 'abstract', 'year', 'ref', 'category'],
            [titles, abstracts, years, refs, categories],
        )
    )
)

I have printed its head and everything looks quite right.

Then I attempt to create a DataBlock as

# Language-model DataBlock: TextBlock.from_df is pointed at the 'abstract'
# column with is_lm=True, get_x reads the 'abstract' column via ColReader,
# and RandomSplitter holds out valid_pct=0.2 of the items with no fixed seed.
arxiv_lm = DataBlock(blocks=TextBlock.from_df(text_cols = 'abstract', is_lm=True),
                    get_x=ColReader('abstract'),
                    splitter = RandomSplitter(valid_pct=0.2, seed=None))

# Build the DataLoaders from the papers DataFrame (bs=64, seq_len=64).
# This is the call that raises the AttributeError shown in the traceback below.
dls = arxiv_lm.dataloaders(papers, bs=64, seq_len=64)
# Display up to six samples from a batch.
dls.show_batch(max_n=6)

but I get the error

AttributeError: 'Series' object has no attribute 'abstract'

The full error is

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-41-d464bdc30ac8> in <module>
      3                     splitter = RandomSplitter(valid_pct=0.2, seed=None))
      4 
----> 5 dls = arxiv_lm.dataloaders(papers, bs=64, seq_len=64)
      6 dls.show_batch(max_n=6)

/opt/conda/lib/python3.7/site-packages/fastai/data/block.py in dataloaders(self, source, path, verbose, **kwargs)
    111 
    112     def dataloaders(self, source, path='.', verbose=False, **kwargs):
--> 113         dsets = self.datasets(source, verbose=verbose)
    114         kwargs = {**self.dls_kwargs, **kwargs, 'verbose': verbose}
    115         return dsets.dataloaders(path=path, after_item=self.item_tfms, after_batch=self.batch_tfms, **kwargs)

/opt/conda/lib/python3.7/site-packages/fastai/data/block.py in datasets(self, source, verbose)
    108         splits = (self.splitter or RandomSplitter())(items)
    109         pv(f"{len(splits)} datasets of sizes {','.join([str(len(s)) for s in splits])}", verbose)
--> 110         return Datasets(items, tfms=self._combine_type_tfms(), splits=splits, dl_type=self.dl_type, n_inp=self.n_inp, verbose=verbose)
    111 
    112     def dataloaders(self, source, path='.', verbose=False, **kwargs):

/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in __init__(self, items, tfms, tls, n_inp, dl_type, **kwargs)
    308     def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
    309         super().__init__(dl_type=dl_type)
--> 310         self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
    311         self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
    312 

/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in <listcomp>(.0)
    308     def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
    309         super().__init__(dl_type=dl_type)
--> 310         self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
    311         self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
    312 

/opt/conda/lib/python3.7/site-packages/fastcore/foundation.py in __call__(cls, x, *args, **kwargs)
    120     def __call__(cls, x=None, *args, **kwargs):
    121         if not args and not kwargs and x is not None and isinstance(x,cls): return x
--> 122         return super().__call__(x, *args, **kwargs)
    123 
    124 # Cell

/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in __init__(self, items, tfms, use_list, do_setup, split_idx, train_setup, splits, types, verbose, dl_type)
    234         if do_setup:
    235             pv(f"Setting up {self.tfms}", verbose)
--> 236             self.setup(train_setup=train_setup)
    237 
    238     def _new(self, items, split_idx=None, **kwargs):

/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in setup(self, train_setup)
    256             for f in self.tfms.fs:
    257                 self.types.append(getattr(f, 'input_types', type(x)))
--> 258                 x = f(x)
    259             self.types.append(type(x))
    260         types = L(t if is_listy(t) else [t] for t in self.types).concat().unique()

/opt/conda/lib/python3.7/site-packages/fastai/data/transforms.py in __call__(self, o, **kwargs)
    198 
    199     def __call__(self, o, **kwargs):
--> 200         if len(self.cols) == 1: return self._do_one(o, self.cols[0])
    201         return L(self._do_one(o, c) for c in self.cols)
    202 

/opt/conda/lib/python3.7/site-packages/fastai/data/transforms.py in _do_one(self, r, c)
    192 
    193     def _do_one(self, r, c):
--> 194         o = r[c] if isinstance(c, int) else r[c] if c=='name' else getattr(r, c)
    195         if len(self.pref)==0 and len(self.suff)==0 and self.label_delim is None: return o
    196         if self.label_delim is None: return f'{self.pref}{o}{self.suff}'

/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name)
   5139             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5140                 return self[name]
-> 5141             return object.__getattribute__(self, name)
   5142 
   5143     def __setattr__(self, name: str, value) -> None:

AttributeError: 'Series' object has no attribute 'abstract'

Do you know what I may be doing wrong?

muellerzr · December 13, 2020, 5:15pm

After tokenization, your tokenized items are stored in a column named text, not in your original abstract column. So while your TextBlock uses text_cols='abstract', your get_x should be ColReader('text'). The parameter that controls the name of that output column is tok_text_col.

PabloMC · December 13, 2020, 5:32pm

Wow, that was quick and useful @muellerzr! Thanks a lot, it works.