forked from 10xac/Twitter-Data-Analysis-Template
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_dataframe.py
More file actions
277 lines (241 loc) · 10.2 KB
/
extract_dataframe.py
File metadata and controls
277 lines (241 loc) · 10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
import json
import pandas as pd
import numpy as np
from textblob import TextBlob
from defaults import *
def read_json(json_file: str) -> tuple:
    """
    json file reader to open and read a line-delimited json file into a list

    Every non-blank line of the file is parsed as one standalone json
    document. Blank lines are skipped instead of raising a decode error.

    Args:
    -----
    json_file: str - path of a json file

    Returns
    -------
    tuple of (number of parsed records, list of the parsed json objects)

    Raises
    ------
    json.JSONDecodeError if a non-blank line is not valid json
    """
    tweets_data = []
    # The context manager guarantees the handle is closed even when a line
    # fails to parse; utf-8 is explicit so decoding does not depend on the
    # platform's default encoding.
    with open(json_file, 'r', encoding='utf-8') as json_lines:
        for raw_line in json_lines:
            if not raw_line.strip():
                continue
            tweets_data.append(json.loads(raw_line))
    return len(tweets_data), tweets_data
class TweetDfExtractor:
    """
    this class will parse tweets json into a pandas dataframe

    Every ``find_*`` method (and ``is_sensitive``) returns a list with one
    entry per tweet, in the same order as ``tweets_list``. ``get_tweet_df``
    combines those lists into the final dataframe.

    Return
    ------
    dataframe
    """

    def __init__(self, tweets_list):
        """
        The initializer for the TweetDf Extractor class

        Args:
        -----
        tweets_list: list of tweet dictionaries (parsed json objects)
        """
        self.tweets_list = tweets_list

    def find_statuses_count(self) -> list:
        """
        a function to find and return the statuses count of each tweet's user
        """
        return [x['user']['statuses_count'] for x in self.tweets_list]

    def find_full_text(self) -> list:
        """
        a function to find and return the full text of each tweet

        Tweets without a 'full_text' key are reported as the string 'NA'.
        """
        return [x.get('full_text', 'NA') for x in self.tweets_list]

    def find_sentiments(self, text) -> tuple:
        """
        a function to find and return polarity and subjectivity of tweets

        Args:
        -----
        text: iterable of str - the tweet texts to analyse

        Returns
        -------
        tuple of (polarity list, subjectivity list), aligned with text
        """
        # Analyse every text once and read both scores from the same result
        # instead of building two TextBlob objects per tweet.
        sentiments = [TextBlob(x).sentiment for x in text]
        polarity = [s.polarity for s in sentiments]
        subjectivity = [s.subjectivity for s in sentiments]
        return (polarity, subjectivity)

    def find_created_time(self) -> list:
        """
        a function to find and return the date each tweet was created at
        """
        return [x['created_at'] for x in self.tweets_list]

    def find_source(self) -> list:
        """
        a function to find and return the source of a tweet
        """
        return [x['source'] for x in self.tweets_list]

    def find_screen_name(self) -> list:
        """
        a function to find and return the screen name from where the
        tweet originated
        """
        return [x['user']['screen_name'] for x in self.tweets_list]

    def find_followers_count(self) -> list:
        """
        function to find and return the follower count of each tweet's user
        """
        return [x['user']['followers_count'] for x in self.tweets_list]

    def find_friends_count(self) -> list:
        """
        function to find and return the friends count of each tweet's user
        """
        return [x['user']['friends_count'] for x in self.tweets_list]

    def is_sensitive(self) -> list:
        """
        function to find and return the possible sensitivity of a tweet

        The value is looked up per tweet so that one tweet without the
        'possibly_sensitive' key yields None for that tweet only.
        """
        return [tweet.get('possibly_sensitive') for tweet in self.tweets_list]

    def find_favorite_count(self) -> list:
        """
        function to find and return the favorite count of a tweet

        The count is read from the 'retweeted_status' entry; tweets without
        one are reported as 0.
        """
        # NOTE(review): a tweet that is not a retweet gets 0 rather than its
        # own 'favorite_count' - confirm this is the intended behaviour.
        return [
            tweet['retweeted_status']['favorite_count']
            if 'retweeted_status' in tweet else 0
            for tweet in self.tweets_list
        ]

    def find_retweet_count(self) -> list:
        """
        function to find and return the retweet count of a tweet

        The count is read from the 'retweeted_status' entry; tweets without
        one are reported as 0.
        """
        # NOTE(review): a tweet that is not a retweet gets 0 rather than its
        # own 'retweet_count' - confirm this is the intended behaviour.
        return [
            tweet['retweeted_status']['retweet_count']
            if 'retweeted_status' in tweet else 0
            for tweet in self.tweets_list
        ]

    def find_hashtags(self) -> list:
        """
        function to find and return the hashtags of a tweet
        """
        return [x['entities']['hashtags'] for x in self.tweets_list]

    def find_mentions(self) -> list:
        """
        function to find and return the mentions of a tweet
        """
        return [x['entities']['user_mentions'] for x in self.tweets_list]

    def find_location(self) -> list:
        """
        function to find and return the location of a tweet

        The location is taken from the tweet's 'user' entry; None is used
        when either the user or its location is missing.
        """
        return [x.get('user', {}).get('location', None)
                for x in self.tweets_list]

    def find_lang(self) -> list:
        """
        function to find and return the language of a tweet
        """
        return [x['lang'] for x in self.tweets_list]

    def find_authors(self) -> list:
        """
        function to find and return authors of tweets

        The author of a tweet is the screen name of its user, which is the
        same value get_tweet_df stores in the 'original_author' column.
        """
        # Previously a placeholder that returned the integers 0..21999
        # regardless of the tweets held by the instance.
        return self.find_screen_name()

    def get_tweet_df(self, save: bool = False,
                     save_as: str = 'processed_tweet_data',
                     as_csv: bool = False) -> pd.DataFrame:
        """
        build the dataframe holding the required columns of every tweet

        Args:
        -----
        save: bool - when True the dataframe is also written to disk
        save_as: str - file name (without extension) used under 'data/'
        as_csv: bool - write a csv file when True, a json file otherwise

        Returns
        -------
        dataframe with one row per tweet
        """
        text = self.find_full_text()
        polarity, subjectivity = self.find_sentiments(text)

        # Insertion order of this dict defines the column order of the
        # resulting dataframe.
        sel_data = {
            'created_at': self.find_created_time(),
            'source': self.find_source(),
            'original_text': text,
            'polarity': polarity,
            'subjectivity': subjectivity,
            'lang': self.find_lang(),
            'favorite_count': self.find_favorite_count(),
            'status_count': self.find_statuses_count(),
            'retweet_count': self.find_retweet_count(),
            'screen_name': self.find_screen_name(),
            'original_author': self.find_authors(),
            'followers_count': self.find_followers_count(),
            'friends_count': self.find_friends_count(),
            'possibly_sensitive': self.is_sensitive(),
            'hashtags': self.find_hashtags(),
            'user_mentions': self.find_mentions(),
            'place': self.find_location(),
        }
        final_dataframe = pd.DataFrame(data=sel_data)

        if save:
            # The 'data/' directory is expected to exist already.
            if as_csv:
                data_path = 'data/' + save_as + '.csv'
                final_dataframe.to_csv(data_path, index=False)
            else:
                data_path = 'data/' + save_as + '.json'
                final_dataframe.to_json(data_path, indent=4)
            print(f'File {save_as} successfully saved as {data_path}')

        return final_dataframe
if __name__ == "__main__":
    # Columns the finished dataset is required to provide; be creative and
    # add more features.
    columns = [
        'created_at', 'source', 'original_text', 'clean_text', 'sentiment',
        'polarity', 'subjectivity', 'lang', 'favorite_count',
        'retweet_count', 'original_author', 'screen_count',
        'followers_count', 'friends_count', 'possibly_sensitive', 'hashtags',
        'user_mentions', 'place', 'place_coord_boundaries',
    ]

    # Global data set. `global_data` is not defined in this file; presumably
    # it is a file path supplied by `from defaults import *` - confirm.
    total_tweets, global_tweet_list = read_json(global_data)

    # Report the record count to make sure all the data was passed through.
    print(f"Total number of data: {total_tweets}")

    global_tweet = TweetDfExtractor(global_tweet_list)
    global_tweet_df = global_tweet.get_tweet_df(
        save=True,
        save_as='processed_global_tweet_data',
    )
    print(global_tweet_df)

    # African data set (currently disabled):
    # _, african_tweet_list = read_json(african_data)
    # # to make sure all the data is passed through
    # print(f"Total number of data: {_}")
    # african_tweet = TweetDfExtractor(african_tweet_list)
    # african_tweet_df = african_tweet.get_tweet_df(save = True, save_as='processed_african_tweet_data')
    # print(african_tweet_df)

    # TODO : use all defined functions to generate a dataframe with the specified columns above