-
Notifications
You must be signed in to change notification settings - Fork 3
/
tweet-lookup.py
166 lines (131 loc) · 4.78 KB
/
tweet-lookup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/env python3
# Script Information
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""
PURPOSE:
- Script to scrape tweets with the Twitter V2 get tweets endpoint
INPUT:
- A file of tweet IDs where each line contains one id
OUTPUT:
- tweet_data--{timestamp}.json : a file where each line
represents one JSON-encoded tweet
- tweet_errors--{timestamp}.json : a file which records any
errors received (one per line). You can then learn why certain ids
were not returned.
- {timestamp} is the time the data pull starts, formatted as
YYYY-MM-DD_HH-MM
Author: Matthew R. DeVerna, Kaicheng Yang
"""
# Import packages
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
import argparse
import os
import json
from datetime import datetime as dt
import osometweet
from osometweet.utils import chunker
# Create Functions.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def parse_cl_args(argv=None):
    """
    Parse the command-line arguments and return the tweet IDs file path.

    Parameters:
    - argv (list of str, optional) : the argument strings to parse. When
        left as None, argparse reads them from the command line.

    Returns:
    - The value given to the required -f/--file flag.

    Note: argparse prints a usage message and exits the program
    (SystemExit) when the flag is missing.
    """
    # Initiate the parser
    parser = argparse.ArgumentParser(
        description="Script to scrape tweet information."
    )

    # The flag uses option syntax but is mandatory (required=True).
    # The help text names TWEET IDS because that is what this script
    # looks up.
    parser.add_argument(
        "-f", "--file",
        metavar='File',
        help="Full path to the file containing the "
        "TWEET IDS you would like to scrape.",
        required=True
    )

    # Read parsed arguments into "args" and hand back the file path
    args = parser.parse_args(argv)
    return args.file
def load_tweet_ids(ids_file):
    """
    Load all tweet ids from `ids_file` and split them into chunks of at
    most 100 tweet ids.

    Parameters:
    - ids_file : path to a text file with one tweet id per line.

    Returns:
    - The result of `chunker` applied to the cleaned list of ids
        (presumably a list of lists - confirm against osometweet.utils).

    Surrounding whitespace (including Windows line endings) is removed
    from each id, and blank lines are skipped so that empty ids are never
    sent to Twitter.
    """
    # 100 is the maximum number of ids we can query Twitter for in
    # one call.
    max_query_length = 100

    with open(ids_file, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        tweet_ids = [tweet_id for tweet_id in stripped_lines if tweet_id]

    # This allows us to iterate through a long list of tweet ids
    # 100 tweets at a time.
    chunked_list = chunker(
        seq=tweet_ids,
        size=max_query_length
    )
    return chunked_list
def load_bearer_token():
    """
    Read the Twitter bearer token from the local environment.

    Returns:
    - The value of the TWITTER_BEARER_TOKEN environment variable, or
        None when that variable is not set.
    """
    # Set the variable in your terminal before running this script,
    # for example:
    #   export 'TWITTER_BEARER_TOKEN'='<your_twitter_bearer_token>'
    # The name on the left of the equal sign must match the string
    # looked up below.
    return os.getenv("TWITTER_BEARER_TOKEN")
def _response_field(response, key):
    """Return response[key], or None when the response has no such key."""
    try:
        return response[key]
    except KeyError:
        return None


def gather_data(
    bearer_token,
    chunked_list
):
    """
    Gather tweets based on the chunked list of tweet IDs with the provided
    bearer_token.

    Parameters:
    - bearer_token : Twitter bearer token passed to osometweet.OAuth2.
    - chunked_list : iterable of tweet id chunks; each chunk is sent to
        Twitter in a single tweet_lookup call.

    Output (written to the current working directory):
    - tweet_data--{timestamp}.json : one JSON-encoded tweet per line
    - tweet_errors--{timestamp}.json : one JSON-encoded error per line

    A chunk whose response has no "data" (or no "errors") entry, or has
    None there, only prints a notice. Any other failure, such as an
    entry that cannot be serialized, is raised rather than being
    reported as "no data".
    """
    print("Gathering Data...")

    oauth2 = osometweet.OAuth2(bearer_token=bearer_token)
    ot = osometweet.OsomeTweet(oauth2)

    # Add all tweet fields
    all_tweet_fields = osometweet.TweetFields(everything=True)

    # Timestamp shared by both output file names
    today = dt.today().strftime("%Y-%m-%d_%H-%M")

    # Open two files. One for good data, the other for tweet errors.
    with open(f"tweet_data--{today}.json", 'w') as data_file, \
            open(f"tweet_errors--{today}.json", 'w') as error_file:

        # Iterate through the list of lists
        for one_hundred_tweets in chunked_list:
            response = ot.tweet_lookup(
                tids=one_hundred_tweets,
                fields=all_tweet_fields
            )

            # Either entry may be missing or None (i.e. no data/errors),
            # so look them up without assuming the key exists.
            data = _response_field(response, "data")
            errors = _response_field(response, "errors")

            if data is None:
                print(
                    "No data found in this set of tweets, "
                    "skipping to the next set."
                )
            else:
                data_file.writelines(
                    f"{json.dumps(line)}\n" for line in data
                )

            if errors is None:
                print(
                    "No problematic tweets found in this set of tweets, "
                    "skipping to the next set."
                )
            else:
                error_file.writelines(
                    f"{json.dumps(line)}\n" for line in errors
                )
# Execute the program
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == "__main__":
    # Resolve the inputs: the ids file path from the command line, the
    # chunked tweet ids read from that file, then the bearer token from
    # the environment.
    tweet_ids_path = parse_cl_args()
    id_chunks = load_tweet_ids(tweet_ids_path)
    token = load_bearer_token()

    # Query Twitter chunk by chunk and write the data/error files.
    gather_data(bearer_token=token, chunked_list=id_chunks)

    print("Data pull complete.")