# Twitter-Data-Analysis/clean_tweets_dataframe.py at main · Fisseha-Estifanos/Twitter-Data-Analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import re
import pandas as pd
from defaults import *

class Clean_Tweets:
    """
    Cleaning helpers for a pandas DataFrame of tweets.

    The frame handed to the constructor is kept in ``self.df``. The
    methods from ``drop_unwanted_column`` to ``find_hashtags`` work on
    that stored frame and return it, so the returned value always
    reflects the cleaning that was just applied. The remaining helpers
    (``fill_missing``, ``replace_empty_string``, ``remove_characters``,
    ``extract_device_name``) work on the frame passed to them.
    """

    # Columns coerced to a numeric dtype by convert_to_numbers, in order.
    _NUMERIC_COLUMNS = (
        'id',
        'subjectivity',
        'listed_count',
        'retweet_count',
        'friends_count',
        'favorite_count',
        'statuses_count',
        'followers_count',
        'polarity',
    )

    # '#' + one or more letters + one or more letters/digits/'-'/'_'.
    _HASHTAG_RE = re.compile(r'(#[A-Za-z]+[A-Za-z0-9-_]+)')

    # Anything that is not alphanumeric, whitespace, underscore or hyphen.
    _DISALLOWED_CHARS_RE = re.compile(r"[^a-zA-Z0-9\s_-]")

    # A markup tag together with the whitespace around it.
    _TAG_PATTERN = r"(\s*\<.*?\>\s*)"

    def __init__(self, df: pd.DataFrame):
        """
        store the frame to clean and announce that the cleaner is ready
        """
        self.df = df
        print('Automation in Action...!!!')

    def drop_unwanted_column(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        remove rows that repeat the column names as values (the
        'retweet_count' cell equals 'retweet_count' or the 'polarity'
        cell equals 'polarity'). This error originated from the data
        collection stage.

        df is accepted for backward compatibility; the rows are removed
        from self.df, which is then returned.
        """
        header_rows = self.df[
            self.df['retweet_count'] == 'retweet_count'].index
        self.df.drop(header_rows, inplace=True)
        # Boolean indexing builds a new frame, so self.df is rebound here;
        # returning self.df (not the argument) hands back the final result.
        self.df = self.df[self.df['polarity'] != 'polarity']
        return self.df

    def drop_duplicate(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        drop rows of self.df whose 'original_text' repeats an earlier row

        df is accepted for backward compatibility; self.df is modified in
        place and returned.
        """
        self.df.drop_duplicates(subset='original_text', inplace=True)
        return self.df

    def convert_to_datetime(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        convert the 'created_at' column of self.df to datetime; values
        that cannot be parsed become NaT (errors='coerce')

        df is accepted for backward compatibility; self.df is returned.
        """
        self.df['created_at'] = pd.to_datetime(self.df['created_at'],
                                               errors='coerce')
        return self.df

    def convert_to_numbers(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        convert columns like polarity, subjectivity, retweet_count
        favorite_count etc to numbers; values that cannot be parsed
        become NaN (errors='coerce')

        df is accepted for backward compatibility; self.df is returned.
        A KeyError is raised if one of the columns is missing.
        """
        for column in self._NUMERIC_COLUMNS:
            self.df[column] = pd.to_numeric(self.df[column],
                                            errors='coerce')
        return self.df

    def remove_non_english_tweets(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        keep only the rows of self.df whose 'lang' value is 'en'

        df is accepted for backward compatibility; self.df is filtered in
        place and returned.
        """
        self.df.query("lang == 'en'", inplace=True)
        return self.df

    def drop_nulls(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        drop every row of self.df that has a null in any column

        df is accepted for backward compatibility; self.df is rebound to
        the filtered frame, which is returned.
        """
        self.df = self.df.dropna(axis=0, how='any', inplace=False)
        return self.df

    def find_hashtags(self, df: pd.DataFrame,
                      text_column: str = 'original_text',
                      target_column: str = 'hashtags') -> pd.DataFrame:
        """
        Method to find hashtags from tweets

        Every string in self.df[text_column] is scanned for hashtags and
        the list of matches is stored in self.df[target_column] (created,
        or overwritten if it already exists). Cells that are not strings
        (for example NaN) get an empty list.

        df is accepted for backward compatibility; self.df is returned.
        """
        find_all = self._HASHTAG_RE.findall
        self.df[target_column] = self.df[text_column].apply(
            lambda text: find_all(text) if isinstance(text, str) else [])
        return self.df

    def text_category(self, series: pd.Series) -> list:
        """
        function that return positive, negative or neutral based on polarity

        Returns one label per element of series: "positive" for values
        above zero, "negative" for values below zero, "neutral" for zero
        and 'UNK' for missing values (None / NaN).
        """
        polarities = []
        for pol in series:
            if pd.isna(pol):
                polarities.append('UNK')
            elif pol > 0:
                polarities.append("positive")
            elif pol < 0:
                polarities.append("negative")
            else:
                polarities.append("neutral")
        return polarities

    def fill_missing(self, df: pd.DataFrame, column: str,
                     value: str = 'unknown') -> pd.DataFrame:
        """
        fill null values of a specific column with the provided value

        The column of df is replaced in place and df is returned.
        """
        df[column] = df[column].fillna(value)
        return df

    def replace_empty_string(self, df: pd.DataFrame, column: str,
                             value: str) -> pd.DataFrame:
        """
        replace empty strings in a specific column with the provided value

        The column of df is replaced in place and df is returned.
        """
        df[column] = df[column].apply(lambda x: value if x == "" else x)
        return df

    def remove_characters(self, df: pd.DataFrame,
                          column: str) -> pd.DataFrame:
        """
        removes non-alphanumeric characters with the exception of
        underscore hyphen and space from the specified column

        Cells that are not strings (for example NaN) are left untouched.
        The column of df is replaced in place and df is returned.
        """
        strip_chars = self._DISALLOWED_CHARS_RE.sub
        df[column] = df[column].apply(
            lambda text: strip_chars("", text)
            if isinstance(text, str) else text)
        return df

    def extract_device_name(self, df: pd.DataFrame) -> pd.Series:
        """
        returns device name from source text

        Every tag-like '<...>' span in df["source"] (with its surrounding
        whitespace) is replaced by a single space and the result is
        stripped. The cleaned values are written back to df["source"] and
        that column is returned.
        """
        df["source"] = df["source"].str.replace(
            self._TAG_PATTERN, " ", regex=True).str.strip()
        return df["source"]

if __name__ == "__main__":
    # Script entry point: read each twitter dataset as line-delimited JSON
    # and pass the resulting frame to its own Clean_Tweets instance.
    # NOTE(review): global_data and african_data are not defined in this
    # file; presumably they come from `from defaults import *` - confirm.

    # Global tweets.
    global_tweet_df = pd.read_json(global_data, lines=True)
    global_cleaner = Clean_Tweets(global_tweet_df)

    # African tweets.
    african_tweet_df = pd.read_json(african_data, lines=True)
    african_cleaner = Clean_Tweets(african_tweet_df)