Twitter-Data-Analysis/extract_dataframe.py at main · Fisseha-Estifanos/Twitter-Data-Analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
import json
import pandas as pd
import numpy as np
from textblob import TextBlob
from defaults import *


def read_json(json_file: str) -> tuple:
    """
    Read a line-delimited json file (one json document per line) into a list.

    Args:
    -----
    json_file: str - path of a json file

    Returns
    -------
    tuple (number of json documents read, list of the parsed documents)

    Raises
    ------
    OSError if the file cannot be opened, json.JSONDecodeError if a
    non-blank line is not valid json.
    """
    # the context manager closes the handle even when a line fails to parse
    with open(json_file, 'r', encoding='utf-8') as tweets_file:
        # blank lines (e.g. a trailing newline at the end of the file)
        # carry no document, so they are skipped rather than parsed
        tweets_data = [json.loads(line) for line in tweets_file
                       if line.strip()]
    return len(tweets_data), tweets_data

class TweetDfExtractor:
    """
    Parse a list of tweet json objects (dicts) into a pandas dataframe.

    Every find_* method returns one entry per tweet, in the same order as
    tweets_list, so the results can be used directly as dataframe columns.
    """
    def __init__(self, tweets_list):
        """
        The initializer for the TweetDf Extractor class

        Args:
        -----
        tweets_list: list - parsed tweet json objects (dicts)
        """
        self.tweets_list = tweets_list

    def find_statuses_count(self) -> list:
        """
        a function to find and return the statuses count of the user of
        each tweet
        """
        return [x['user']['statuses_count'] for x in self.tweets_list]

    def find_full_text(self) -> list:
        """
        a function to find and return the full text of each tweet;
        'NA' is used for tweets that have no 'full_text' key
        """
        return [x.get('full_text', 'NA') for x in self.tweets_list]

    def find_sentiments(self, text) -> tuple:
        """
        a function to find and return polarity and subjectivity of tweets

        Args:
        -----
        text: list - the tweet texts to analyse

        Returns
        -------
        tuple (list of polarities, list of subjectivities), both in the
        same order as text
        """
        polarity = []
        subjectivity = []
        for x in text:
            # analyse each text once and read both scores from the result
            sentiment = TextBlob(x).sentiment
            polarity.append(sentiment.polarity)
            subjectivity.append(sentiment.subjectivity)
        return polarity, subjectivity

    def find_created_time(self) -> list:
        """
        a function to find and return the date each tweet was created at
        """
        return [x['created_at'] for x in self.tweets_list]

    def find_source(self) -> list:
        """
        a function to find and return the source of each tweet
        """
        return [x['source'] for x in self.tweets_list]

    def find_screen_name(self) -> list:
        """
        a function to find and return the screen name of the user from
        where each tweet originated
        """
        return [x['user']['screen_name'] for x in self.tweets_list]

    def find_followers_count(self) -> list:
        """
        function to find and return the follower count of the user of
        each tweet
        """
        return [x['user']['followers_count'] for x in self.tweets_list]

    def find_friends_count(self) -> list:
        """
        function to find and return the friends count of the user of
        each tweet
        """
        return [x['user']['friends_count'] for x in self.tweets_list]

    def is_sensitive(self) -> list:
        """
        function to find and return the possible sensitivity of each tweet;
        None is used for tweets that have no 'possibly_sensitive' key
        """
        return [tweet.get('possibly_sensitive') for tweet in self.tweets_list]

    def find_favorite_count(self) -> list:
        """
        function to find and return the favorite count of each tweet

        The count is read from the 'retweeted_status' object; tweets
        without one are reported as 0.
        """
        favorite_count = []
        for tweet in self.tweets_list:
            if 'retweeted_status' in tweet:
                favorite_count.append(
                    tweet['retweeted_status']['favorite_count'])
            else:
                favorite_count.append(0)
        return favorite_count

    def find_retweet_count(self) -> list:
        """
        function to find and return the retweet count of each tweet

        The count is read from the 'retweeted_status' object; tweets
        without one are reported as 0.
        """
        retweet_count = []
        for tweet in self.tweets_list:
            if 'retweeted_status' in tweet:
                retweet_count.append(
                    tweet['retweeted_status']['retweet_count'])
            else:
                retweet_count.append(0)
        return retweet_count

    def find_hashtags(self) -> list:
        """
        function to find and return the hashtag entities of each tweet
        """
        return [x['entities']['hashtags'] for x in self.tweets_list]

    def find_mentions(self) -> list:
        """
        function to find and return the user mention entities of each tweet
        """
        return [x['entities']['user_mentions'] for x in self.tweets_list]

    def find_location(self) -> list:
        """
        function to find and return the location of the user of each tweet;
        None is used when the user or its location is missing
        """
        return [x.get('user', {}).get('location', None)
                for x in self.tweets_list]

    def find_lang(self) -> list:
        """
        function to find and return the language of each tweet
        """
        return [x['lang'] for x in self.tweets_list]

    def find_authors(self) -> list:
        """
        function to find and return the author of each tweet

        The author is the screen name of the user that posted the tweet.
        This is the value get_tweet_df stores in the 'original_author'
        column, and it keeps the result aligned with tweets_list (one
        entry per tweet).
        """
        return self.find_screen_name()

    def get_tweet_df(self, save: bool = False,
                     save_as: str = 'processed_tweet_data',
                     as_csv: bool = False) -> pd.DataFrame:
        """
        Build a dataframe with one row per tweet from the find_* methods.

        Args:
        -----
        save: bool - when True the dataframe is also written to disk
        save_as: str - file name (without extension) used under 'data/'
        as_csv: bool - when saving, write a csv file instead of a json file

        Returns
        -------
        dataframe with the columns created_at, source, original_text,
        polarity, subjectivity, lang, favorite_count, status_count,
        retweet_count, screen_name, original_author, followers_count,
        friends_count, possibly_sensitive, hashtags, user_mentions, place
        """
        # local import: only needed for the optional save step below
        import os

        text = self.find_full_text()
        polarity, subjectivity = self.find_sentiments(text)

        # each column name sits next to its data so the two cannot drift
        # apart; dict insertion order gives the dataframe column order
        sel_data = {
            'created_at': self.find_created_time(),
            'source': self.find_source(),
            'original_text': text,
            'polarity': polarity,
            'subjectivity': subjectivity,
            'lang': self.find_lang(),
            'favorite_count': self.find_favorite_count(),
            'status_count': self.find_statuses_count(),
            'retweet_count': self.find_retweet_count(),
            'screen_name': self.find_screen_name(),
            'original_author': self.find_authors(),
            'followers_count': self.find_followers_count(),
            'friends_count': self.find_friends_count(),
            'possibly_sensitive': self.is_sensitive(),
            'hashtags': self.find_hashtags(),
            'user_mentions': self.find_mentions(),
            'place': self.find_location(),
        }

        final_dataframe = pd.DataFrame(data=sel_data)

        if save:
            extension = '.csv' if as_csv else '.json'
            data_path = 'data/' + save_as + extension
            # the writers below do not create missing directories
            os.makedirs(os.path.dirname(data_path), exist_ok=True)
            if as_csv:
                final_dataframe.to_csv(data_path, index=False)
            else:
                final_dataframe.to_json(data_path, indent=4)
            print(f'File {save_as} successfully saved as {data_path}')
        return final_dataframe


if __name__ == "__main__":
    # required columns to be generated; you should be creative and add more
    # features (kept as a reference list, it is not used by the code below)
    columns = ['created_at', 'source', 'original_text','clean_text', 'sentiment','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count',
    'original_author', 'screen_count', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place', 'place_coord_boundaries']

    # for the global data set
    # NOTE(review): global_data is not defined in this file; presumably it
    # comes from `from defaults import *` - confirm
    tweet_count, global_tweet_list = read_json(global_data)
    # report how many tweets were read, to make sure all the data is passed
    # on to the extractor
    print(f"Total number of data: {tweet_count}")
    global_tweet = TweetDfExtractor(global_tweet_list)
    global_tweet_df = global_tweet.get_tweet_df(save=True, save_as='processed_global_tweet_data')
    print(global_tweet_df)

    # for the african data set (disabled)
    # tweet_count, african_tweet_list = read_json(african_data)
    # print(f"Total number of data: {tweet_count}")
    # african_tweet = TweetDfExtractor(african_tweet_list)
    # african_tweet_df = african_tweet.get_tweet_df(save=True, save_as='processed_african_tweet_data')
    # print(african_tweet_df)

    # TODO : use all defined functions to generate a dataframe with the specified columns above