FWIW, on this I am going to try to pull down a larger Spanish-language Twitter data set for language modeling (or fine-tuning). The script is pretty simple if you have Tweepy installed and have an “authorized” account (it is just a matter of requesting one). The code is below in case it helps anyone else out. Note that the resulting text file will only be good for LM training: it is a single column containing just the full tweet text.
import tweepy
from config_twitter import Config_Twitter
import json
import os
from ipdb import set_trace
# Load the Twitter API credentials from the local configuration object.
my_config = Config_Twitter()
consumer_key, consumer_secret = (
    my_config.CONSUMER_KEY,
    my_config.CONSUMER_SECRET,
)
access_token, access_token_secret = (
    my_config.ACCESS_TOKEN,
    my_config.ACCESS_TOKEN_SECRET,
)

# Build the OAuth handler: consumer credentials first, then the access token.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
class StdOutListener(tweepy.streaming.StreamListener):
    """Stream listener that prints extended tweets and appends them to a file.

    Each tweet that carries an ``extended_tweet`` payload is echoed to stdout
    and appended to the output file as one double-quoted field. The file
    starts with a single ``tweet_text`` header line.
    """

    def __init__(self, out_fn):
        """Open *out_fn* for appending and write the header if the file is new.

        Args:
            out_fn: Path of the output text file. It is opened in append mode
                so that repeated runs accumulate tweets in the same file.
        """
        # Only a missing or empty file gets the header; otherwise every
        # restart would append another "tweet_text" line in the middle of
        # the data.
        needs_header = (
            not os.path.exists(out_fn) or os.path.getsize(out_fn) == 0
        )
        # Explicit UTF-8: the platform default encoding may be unable to
        # encode accented characters or emoji found in tweets.
        self.fhOut = open(out_fn, 'a', encoding='utf-8')
        if needs_header:
            # Use "\n", not os.linesep: text mode already translates "\n"
            # to the platform line ending, so os.linesep would produce
            # "\r\r\n" on Windows.
            self.fhOut.write("tweet_text" + "\n")
        super().__init__()

    def close(self):
        """Flush and close the output file. Safe to call more than once."""
        if not self.fhOut.closed:
            self.fhOut.close()

    # This function gets called every time a new message is received on the
    # stream.
    def on_data(self, data):
        """Write the full text of an extended tweet to stdout and the file.

        Messages that are not valid JSON, are not JSON objects, or do not
        contain ``extended_tweet.full_text`` as a string are skipped.

        Args:
            data: Raw JSON text of one stream message.

        Returns:
            Always ``True`` so the stream keeps running.
        """
        try:
            payload = json.loads(data)
        except ValueError:
            # A frame that is not valid JSON must not kill the stream.
            return True
        if not isinstance(payload, dict):
            return True

        extended = payload.get('extended_tweet')
        if not isinstance(extended, dict):
            return True
        text = extended.get('full_text')
        if not isinstance(text, str):
            return True

        print(text)
        # Double any embedded quotes (CSV convention) so the quoted
        # single-column format stays parseable.
        self.fhOut.write('"' + text.replace('"', '""') + '"' + "\n")
        return True

    def on_error(self, status):
        """Report a stream error and decide whether to stay connected.

        Args:
            status: HTTP status code reported by the stream.

        Returns:
            ``False`` for status 420 (rate limited) so the stream
            disconnects instead of retrying; ``True`` otherwise.
        """
        print("ERROR")
        print(status)
        # Retrying while rate limited only lengthens the enforced wait.
        return status != 420
# Stream Spanish-language tweets matching a few very common words until the
# user interrupts with Ctrl-C.
l = None
try:
    l = StdOutListener('out_file.txt')
    stream = tweepy.Stream(auth, l)
    stream.filter(languages=["es"], track=['el', 'su', 'lo', 'y', 'en'])
except KeyboardInterrupt:
    # Ctrl-C is the expected way to stop the collection.
    pass
finally:
    # Close the output file so buffered tweets are flushed to disk, whether
    # the stream ended normally, failed, or was interrupted.
    if l is not None:
        l.fhOut.close()