Where to download the data for lessons 4 & 5 (arXiv, Wikipedia, etc.)?

alecrubin · May 22, 2018, 5:46am

For anyone interested, I edited get_arxiv.py from the brundage bot project to get the arXiv language model data for lessons 4 & 5. I converted the original code from Python 2 to Python 3, wrapped everything in a class, and fixed it to work if you do not have an existing arXiv pickle file. The only required package outside of the fastai environment is feedparser. If you don’t have it installed already, just do a quick pip install feedparser. Then you just do GetArXiv.update('data/all_arxiv.pkl') to run it. You can also do GetArXiv.load('data/all_arxiv.pkl') to load the data.

import os, requests, time
import feedparser
import pandas as pd


class GetArXiv(object):
    """Download arXiv abstracts through the export API and cache them in a pickle.

    Typical use goes through the two classmethods:
    ``GetArXiv.update('data/all_arxiv.pkl')`` fetches new abstracts and saves
    them, ``GetArXiv.load('data/all_arxiv.pkl')`` reads the saved DataFrame.
    """

    def __init__(self, pickle_path, categories=None):
        """
        :param pickle_path (str): path of the pickle file to save/load; when an
            existing directory is given, ``all_arxiv.pkl`` inside it is used
        :param categories (list): arXiv categories to query, e.g.
            ``['cs.CV', 'cs.AI', 'cs.LG', 'cs.CL']``; a built-in default set is
            used when empty or None
        """
        if os.path.isdir(pickle_path):
            pickle_path = os.path.join(pickle_path, 'all_arxiv.pkl')
        if not categories:
            categories = ['cs*', 'cond-mat.dis-nn', 'q-bio.NC', 'stat.CO', 'stat.ML']

        # Copy so later changes to the caller's list do not alter this instance.
        self.categories = list(categories)
        self.pickle_path = pickle_path
        self.base_url = 'http://export.arxiv.org/api/query'

    @staticmethod
    def build_qs(categories):
        """Build the ``search_query`` value: ``cat:<c>`` terms joined by ``+OR+``."""
        return '+OR+'.join('cat:' + c for c in categories)

    @staticmethod
    def get_entry_dict(entry):
        """
        Return a dictionary with the items we want from a feedparser entry

        :param entry: mapping with the keys title, authors, published, summary,
            link and category
        :return: dict of the selected fields, or None (after printing a message)
            when one of the keys is missing
        """
        try:
            return dict(title=entry['title'], authors=[a['name'] for a in entry['authors']],
                        published=pd.Timestamp(entry['published']), summary=entry['summary'],
                        link=entry['link'], category=entry['category'])
        except KeyError:
            print('Missing keys in row: {}'.format(entry))
            return None

    @staticmethod
    def strip_version(link):
        """
        Strip the trailing version suffix (``v1``, ``v12``, ...) from an arXiv paper link

        Links that do not end in ``v<digits>`` are returned unchanged, and
        multi-digit versions are removed completely.
        """
        base, sep, version = link.rpartition('v')
        if sep and version.isdigit():
            return base
        return link

    def fetch_updated_data(self, max_retry=5, pg_offset=0, pg_size=1000, wait_time=15):
        """
        Get new papers from arXiv server, merge them with the saved ones and
        write the result back to ``self.pickle_path``

        Paging stops when a page contains no link that is not already saved, or
        when the server keeps returning empty pages after ``max_retry`` retries.

        :param max_retry: max number of time to retry request
        :param pg_offset: number of pages to offset
        :param pg_size: num abstracts to fetch per request
        :param wait_time: num seconds to wait between requests
        :return: DataFrame of all abstracts (saved + new), newest first, one row per link
        """
        page, retry = pg_offset, 0
        df = pd.DataFrame()
        if os.path.isfile(self.pickle_path):
            # NOTE: unpickling executes code from the file; only load pickles you created.
            df = pd.read_pickle(self.pickle_path)
        n_before = len(df)
        # A set gives constant-time membership checks for every fetched page.
        past_links = set(df.link.apply(self.strip_version)) if n_before > 0 else set()

        query = self.build_qs(self.categories)
        new_frames = []
        while True:
            params = dict(search_query=query,
                          sortBy='submittedDate', start=pg_size * page, max_results=pg_size)
            # Passed as a pre-built string so the '+OR+' separators are sent as
            # written instead of being percent-encoded from a dict.
            response = requests.get(self.base_url, params='&'.join(f'{k}={v}' for k, v in params.items()))
            entries = feedparser.parse(response.text).entries
            if len(entries) < 1:
                # An empty page may be a transient server hiccup: retry a few times.
                if retry < max_retry:
                    retry += 1
                    time.sleep(wait_time)
                    continue
                break

            # get_entry_dict returns None for malformed entries; drop those rows.
            rows = [row for row in map(self.get_entry_dict, entries) if row is not None]
            if not rows:
                # Nothing usable on this page; move on to the next one.
                page += 1
                retry = 0
                time.sleep(wait_time)
                continue

            results_df = pd.DataFrame(rows)
            max_date = results_df.published.max().date()
            new_links = ~results_df.link.apply(self.strip_version).isin(past_links)
            print(f'{page}. Fetched {len(results_df)} abstracts published {max_date} and earlier')
            if not new_links.any():
                break

            new_frames.append(results_df.loc[new_links])
            page += 1
            retry = 0
            time.sleep(wait_time)

        if new_frames:
            # Concatenate once at the end rather than re-copying the frame per page.
            frames = ([df] if n_before > 0 else []) + new_frames
            df = pd.concat(frames, ignore_index=True)

        print(f'Downloaded {len(df) - n_before} new abstracts')
        if len(df) > 0:
            # Keep the most recently published row for each link.
            df = (df.sort_values('published', ascending=False)
                    .drop_duplicates(subset='link', keep='first')
                    .reset_index(drop=True))
        df.to_pickle(self.pickle_path)
        return df

    @classmethod
    def load(cls, pickle_path):
        """
        Load the saved arXiv DataFrame from pickle

        :param pickle_path (str): pickle file, or a directory containing ``all_arxiv.pkl``
        :return: the DataFrame exactly as stored in the file
        """
        # NOTE: unpickling executes code from the file; only load pickles you created.
        return pd.read_pickle(cls(pickle_path).pickle_path)

    @classmethod
    def update(cls, pickle_path, categories=None, **kwargs):
        """
        Update arXiv data pickle with the latest abstracts

        :param pickle_path (str): pickle file, or a directory containing ``all_arxiv.pkl``
        :param categories (list): arXiv categories to query (defaults used when empty/None)
        :param kwargs: forwarded to ``fetch_updated_data``
        :return: True once the update has finished
        """
        cls(pickle_path, categories).fetch_updated_data(**kwargs)
        return True