For anyone interested, I edited get_arxiv.py from the brundage bot project to fetch the arXiv language model data for lessons 4 & 5. I converted the original code from Python 2 to Python 3, wrapped everything in a class, and fixed it so that it works even if you do not have an existing arXiv pickle file. The only required package outside of the fastai environment is feedparser. If you don’t have it installed already, just do a quick pip install feedparser. Then run GetArXiv.update('data/all_arxiv.pkl') to download the data and save it. You can also call GetArXiv.load('data/all_arxiv.pkl') to load previously saved data.
import os, requests, time
import feedparser
import pandas as pd
class GetArXiv(object):
    """Download arXiv abstracts through the export API and cache them in a pickle.

    The abstracts are held in a pandas DataFrame with the columns produced by
    ``get_entry_dict`` (title, authors, published, summary, link, category).
    """

    def __init__(self, pickle_path, categories=None):
        """
        :param pickle_path (str): path to the pickle data file to save/load; if it
            is an existing directory, ``all_arxiv.pkl`` inside it is used
        :param categories (list): arXiv categories to query; a default set is
            used when this is empty or not given
        """
        if os.path.isdir(pickle_path):
            pickle_path = os.path.join(pickle_path, 'all_arxiv.pkl')
        if not categories:
            categories = ['cs*', 'cond-mat.dis-nn', 'q-bio.NC', 'stat.CO', 'stat.ML']
        self.categories = categories
        self.pickle_path = pickle_path
        self.base_url = 'http://export.arxiv.org/api/query'

    @staticmethod
    def build_qs(categories):
        """Build query string from categories"""
        return '+OR+'.join('cat:' + c for c in categories)

    @staticmethod
    def get_entry_dict(entry):
        """Return a dictionary with the items we want from a feedparser entry.

        :param entry: mapping-like feedparser entry
        :return: dict of the selected fields, or None if a required key is missing
        """
        try:
            return dict(title=entry['title'], authors=[a['name'] for a in entry['authors']],
                        published=pd.Timestamp(entry['published']), summary=entry['summary'],
                        link=entry['link'], category=entry['category'])
        except KeyError:
            print('Missing keys in row: {}'.format(entry))
            return None

    @staticmethod
    def strip_version(link):
        """Strip a trailing version suffix (``v1``, ``v12``, ...) from an arXiv paper link.

        Links that do not end in ``v<digits>`` are returned unchanged.
        """
        base, sep, version = link.rpartition('v')
        if sep and version.isdigit():
            return base
        return link

    def fetch_updated_data(self, max_retry=5, pg_offset=0, pg_size=1000, wait_time=15):
        """
        Get new papers from arXiv server, merge them into the pickle and save it

        :param max_retry: max number of time to retry a request that returned no entries
        :param pg_offset: number of pages to offset
        :param pg_size: num abstracts to fetch per request
        :param wait_time: num seconds to wait between requests
        :return: DataFrame of all abstracts (previously saved plus newly fetched)
        """
        page, retry = pg_offset, 0
        df = pd.DataFrame()
        past_links = set()
        # NOTE(review): unpickling executes arbitrary code; only point this at
        # pickle files you created yourself.
        if os.path.isfile(self.pickle_path):
            df = pd.read_pickle(self.pickle_path)
            if len(df) > 0:
                past_links = set(df.link.apply(self.strip_version))
        n_existing = len(df)
        query = self.build_qs(self.categories)

        while True:
            params = dict(search_query=query, sortBy='submittedDate',
                          start=pg_size * page, max_results=pg_size)
            # The query is passed as a pre-joined string rather than a dict,
            # presumably so the '+OR+' separators reach the server verbatim.
            response = requests.get(self.base_url,
                                    params='&'.join(f'{k}={v}' for k, v in params.items()))
            entries = feedparser.parse(response.text).entries
            if len(entries) < 1:
                # An empty page is retried a few times before it is taken to
                # mean that there is nothing more to fetch.
                if retry < max_retry:
                    retry += 1
                    time.sleep(wait_time)
                    continue
                break
            retry = 0

            # get_entry_dict returns None for entries with missing keys; drop them
            # so they cannot corrupt the DataFrame built from this page.
            rows = [row for row in map(self.get_entry_dict, entries) if row is not None]
            if rows:
                results_df = pd.DataFrame(rows)
                max_date = results_df.published.max().date()
                new_links = ~results_df.link.apply(self.strip_version).isin(past_links)
                print(f'{page}. Fetched {len(results_df)} abstracts published {max_date} and earlier')
                # A page with nothing new means we have caught up with the saved data.
                if not new_links.any():
                    break
                df = pd.concat((df, results_df.loc[new_links]), ignore_index=True)
            page += 1
            time.sleep(wait_time)

        print(f'Downloaded {len(df) - n_existing} new abstracts')
        if len(df) > 0:
            # Newest first, keeping a single row per link.
            df = (df.sort_values('published', ascending=False)
                    .drop_duplicates(subset='link')
                    .reset_index(drop=True))
        df.to_pickle(self.pickle_path)
        return df

    @classmethod
    def load(cls, pickle_path):
        """Load the saved DataFrame from the pickle file (or from ``all_arxiv.pkl``
        inside ``pickle_path`` when it is a directory)"""
        # NOTE(review): unpickling executes arbitrary code; only load trusted files.
        return pd.read_pickle(cls(pickle_path).pickle_path)

    @classmethod
    def update(cls, pickle_path, categories=None, **kwargs):
        """
        Update arXiv data pickle with the latest abstracts

        :param pickle_path (str): path to the pickle data file (or its directory)
        :param categories (list): arXiv categories to query
        :param kwargs: forwarded to ``fetch_updated_data``
        :return: True once the update has completed
        """
        cls(pickle_path, categories).fetch_updated_data(**kwargs)
        return True