I am having trouble running some basic code. I have a DataFrame called papers
with one column called abstracts
, and I am trying to create a DataBlock to load it in a model.
I prepare the data (in a Kaggle notebook with the Arxiv dataset) as a Dataframe as
import json
data_file = '../input/arxiv/arxiv-metadata-oai-snapshot.json'
def get_metadata():
with open(data_file, 'r') as f:
for line in f:
yield line
metadata = get_metadata()
titles = []
abstracts = []
years = []
refs = []
categories= []
metadata = get_metadata()
for paper in metadata:
paper_dict = json.loads(paper)
ref = paper_dict.get('journal-ref')
try:
year = int(ref[-4:])
if 2010 < year < 2021 and ref is not None:
years.append(year)
titles.append(paper_dict.get('title'))
abstracts.append(paper_dict.get('abstract'))
refs.append(ref)
categories.append(paper_dict.get('categories'))
except:
pass
papers = pd.DataFrame({
'title': titles,
'abstract': abstracts,
'year': years,
'ref': refs,
'category': categories
})
I have printed its head and everything looks quite right.
Then I attempt to create a DataBlock as
arxiv_lm = DataBlock(blocks=TextBlock.from_df(text_cols = 'abstract', is_lm=True),
get_x=ColReader('abstract'),
splitter = RandomSplitter(valid_pct=0.2, seed=None))
dls = arxiv_lm.dataloaders(papers, bs=64, seq_len=64)
dls.show_batch(max_n=6)
but I get error
AttributeError: 'Series' object has no attribute 'abstract'
The full error is
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-41-d464bdc30ac8> in <module>
3 splitter = RandomSplitter(valid_pct=0.2, seed=None))
4
----> 5 dls = arxiv_lm.dataloaders(papers, bs=64, seq_len=64)
6 dls.show_batch(max_n=6)
/opt/conda/lib/python3.7/site-packages/fastai/data/block.py in dataloaders(self, source, path, verbose, **kwargs)
111
112 def dataloaders(self, source, path='.', verbose=False, **kwargs):
--> 113 dsets = self.datasets(source, verbose=verbose)
114 kwargs = {**self.dls_kwargs, **kwargs, 'verbose': verbose}
115 return dsets.dataloaders(path=path, after_item=self.item_tfms, after_batch=self.batch_tfms, **kwargs)
/opt/conda/lib/python3.7/site-packages/fastai/data/block.py in datasets(self, source, verbose)
108 splits = (self.splitter or RandomSplitter())(items)
109 pv(f"{len(splits)} datasets of sizes {','.join([str(len(s)) for s in splits])}", verbose)
--> 110 return Datasets(items, tfms=self._combine_type_tfms(), splits=splits, dl_type=self.dl_type, n_inp=self.n_inp, verbose=verbose)
111
112 def dataloaders(self, source, path='.', verbose=False, **kwargs):
/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in __init__(self, items, tfms, tls, n_inp, dl_type, **kwargs)
308 def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
309 super().__init__(dl_type=dl_type)
--> 310 self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
311 self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
312
/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in <listcomp>(.0)
308 def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
309 super().__init__(dl_type=dl_type)
--> 310 self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
311 self.n_inp = ifnone(n_inp, max(1, len(self.tls)-1))
312
/opt/conda/lib/python3.7/site-packages/fastcore/foundation.py in __call__(cls, x, *args, **kwargs)
120 def __call__(cls, x=None, *args, **kwargs):
121 if not args and not kwargs and x is not None and isinstance(x,cls): return x
--> 122 return super().__call__(x, *args, **kwargs)
123
124 # Cell
/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in __init__(self, items, tfms, use_list, do_setup, split_idx, train_setup, splits, types, verbose, dl_type)
234 if do_setup:
235 pv(f"Setting up {self.tfms}", verbose)
--> 236 self.setup(train_setup=train_setup)
237
238 def _new(self, items, split_idx=None, **kwargs):
/opt/conda/lib/python3.7/site-packages/fastai/data/core.py in setup(self, train_setup)
256 for f in self.tfms.fs:
257 self.types.append(getattr(f, 'input_types', type(x)))
--> 258 x = f(x)
259 self.types.append(type(x))
260 types = L(t if is_listy(t) else [t] for t in self.types).concat().unique()
/opt/conda/lib/python3.7/site-packages/fastai/data/transforms.py in __call__(self, o, **kwargs)
198
199 def __call__(self, o, **kwargs):
--> 200 if len(self.cols) == 1: return self._do_one(o, self.cols[0])
201 return L(self._do_one(o, c) for c in self.cols)
202
/opt/conda/lib/python3.7/site-packages/fastai/data/transforms.py in _do_one(self, r, c)
192
193 def _do_one(self, r, c):
--> 194 o = r[c] if isinstance(c, int) else r[c] if c=='name' else getattr(r, c)
195 if len(self.pref)==0 and len(self.suff)==0 and self.label_delim is None: return o
196 if self.label_delim is None: return f'{self.pref}{o}{self.suff}'
/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name)
5139 if self._info_axis._can_hold_identifiers_and_holds_name(name):
5140 return self[name]
-> 5141 return object.__getattribute__(self, name)
5142
5143 def __setattr__(self, name: str, value) -> None:
AttributeError: 'Series' object has no attribute 'abstract'
Do you know what I may be doing wrong?