NLP challenge project

bfarzin · May 2, 2019, 10:08pm

FWIW, on this I am going to try to pull down a larger Twitter (Spanish) data set for language modeling (or fine-tuning). The script is pretty simple if you have Tweepy installed and have an “authorized” account (just a matter of requesting it). Code is below if it helps anyone else out. The text file will only be good for LM training (a single column with just the full tweet text in it).

import tweepy
from config_twitter import Config_Twitter
import json
import os

from ipdb import set_trace

# Twitter API credentials are read from the local config object, then used
# to build the OAuth handler that the stream authenticates with.
my_config = Config_Twitter()
consumer_key, consumer_secret = (
    my_config.CONSUMER_KEY,
    my_config.CONSUMER_SECRET,
)
access_token, access_token_secret = (
    my_config.ACCESS_TOKEN,
    my_config.ACCESS_TOKEN_SECRET,
)

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)


class StdOutListener(tweepy.streaming.StreamListener):
    """Stream listener that appends extended tweet text to a one-column file.

    Every message carrying an ``extended_tweet`` payload is echoed to stdout
    and appended to the output file as a double-quoted field under a
    ``tweet_text`` header line.
    """

    def __init__(self, out_fn):
        """Open *out_fn* for appending, writing the header only if it is new.

        Args:
            out_fn: Path of the output text file.
        """
        super().__init__()
        # Only a missing/empty file gets the header, so restarting the script
        # does not drop a second "tweet_text" row into the middle of the data.
        needs_header = (
            not os.path.exists(out_fn) or os.path.getsize(out_fn) == 0
        )
        # Explicit UTF-8 so accented characters and emoji do not depend on the
        # platform's default encoding.
        self.fhOut = open(out_fn, 'a', encoding='utf-8')
        if needs_header:
            # Text mode already translates "\n" to the platform line ending;
            # writing os.linesep here would yield "\r\r\n" on Windows.
            self.fhOut.write("tweet_text\n")

    def on_data(self, data):
        """Handle one raw message received on the stream.

        Messages that are not valid JSON, or that carry no string
        ``extended_tweet.full_text``, are ignored.

        Args:
            data: Raw JSON text of the message.
        """
        try:
            j = json.loads(data)
        except ValueError:
            # One malformed message should not take the whole stream down.
            return
        if not isinstance(j, dict):
            return
        extended = j.get('extended_tweet')
        if not isinstance(extended, dict):
            return
        text = extended.get('full_text')
        if not isinstance(text, str):
            return
        print(text)
        # Double any embedded quotes (CSV convention) so the quoted field
        # stays parseable when the tweet itself contains '"'.
        self.fhOut.write('"' + text.replace('"', '""') + '"\n')

    def on_error(self, status):
        """Report a stream error; disconnect when rate limited.

        Args:
            status: HTTP status code reported by the stream.

        Returns:
            ``False`` for status 420 (rate limited), which tells tweepy to
            disconnect instead of retrying; ``None`` otherwise.
        """
        print("ERROR")
        print(status)
        if status == 420:
            return False

    def close(self):
        """Close the output file."""
        self.fhOut.close()

# Stream Spanish-language tweets into out_file.txt until interrupted (Ctrl-C).
listener = StdOutListener('out_file.txt')
try:
    stream = tweepy.Stream(auth, listener)
    # The tracked terms are very common Spanish words, used to match as many
    # Spanish tweets as possible.
    stream.filter(languages=["es"], track=['el', 'su', 'lo', 'y', 'en'])
except KeyboardInterrupt:
    # Ctrl-C is the expected way to stop collecting.
    pass
finally:
    # Always flush and release the output file, however the stream ended.
    listener.fhOut.close()