# clean_tweets_dataframe.py
# Forked from 10xac/Twitter-Data-Analysis-Template.
import re
import pandas as pd
from defaults import *
class Clean_Tweets:
    """
    Cleaning steps for a DataFrame of tweets.

    Every method takes the frame to clean as ``df`` and returns the
    cleaned frame.  The methods that previously worked on ``self.df``
    (``drop_unwanted_column`` through ``find_hashtags``) also store their
    result on ``self.df`` so that both calling styles -- chaining the
    return values or reading ``cleaner.df`` afterwards -- observe the
    same, cleaned data.
    """

    # Columns coerced to a numeric dtype by ``convert_to_numbers``.
    _NUMERIC_COLUMNS = (
        'id',
        'subjectivity',
        'listed_count',
        'retweet_count',
        'friends_count',
        'favorite_count',
        'statuses_count',
        'followers_count',
        'polarity',
    )

    # A '#' followed by letters, then at least one more letter, digit,
    # hyphen or underscore.
    _HASHTAG_PATTERN = r'(#[A-Za-z]+[A-Za-z0-9-_]+)'

    # Smallest polarity magnitude that counts as positive / negative.
    _POLARITY_EPSILON = 0.00000000001

    def __init__(self, df: pd.DataFrame):
        """
        Keep a reference to the frame to be cleaned.

        Parameters
        ----------
        df : the tweets DataFrame; stored as ``self.df`` without copying.
        """
        self.df = df
        print('Automation in Action...!!!')

    def drop_unwanted_column(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Remove rows that merely repeat the column names.

        This error originated from the data collection stage.  A row is
        dropped when its ``retweet_count`` cell is the string
        ``'retweet_count'`` or its ``polarity`` cell is the string
        ``'polarity'``.  The rows are dropped from ``df`` in place.

        Returns
        -------
        The same ``df`` object without the header rows.
        """
        is_header_row = (
            (df['retweet_count'] == 'retweet_count')
            | (df['polarity'] == 'polarity')
        )
        df.drop(df[is_header_row].index, inplace=True)
        self.df = df
        return df

    def drop_duplicate(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Drop rows whose ``original_text`` duplicates an earlier row.

        The first occurrence is kept; ``df`` is modified in place and
        returned.
        """
        df.drop_duplicates(subset='original_text', inplace=True)
        self.df = df
        return df

    def convert_to_datetime(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Convert the ``created_at`` column to datetime.

        Values that cannot be parsed become ``NaT`` (``errors='coerce'``).
        ``df`` is modified in place and returned.
        """
        df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
        self.df = df
        return df

    def convert_to_numbers(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Convert columns like polarity, subjectivity, retweet_count,
        favorite_count etc. to numbers.

        Values that cannot be parsed become ``NaN`` (``errors='coerce'``).
        ``df`` is modified in place and returned.

        Raises
        ------
        KeyError : if any of the expected columns is missing from ``df``.
        """
        for column in self._NUMERIC_COLUMNS:
            df[column] = pd.to_numeric(df[column], errors='coerce')
        self.df = df
        return df

    def remove_non_english_tweets(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Keep only the rows whose ``lang`` column equals ``'en'``.

        ``df`` is modified in place and returned.
        """
        df.query("lang == 'en'", inplace=True)
        self.df = df
        return df

    def drop_nulls(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Drop every row that has a null in any column.

        Unlike the other steps this does not modify ``df`` in place; the
        filtered frame is a new object, which is returned and stored on
        ``self.df``.
        """
        without_nulls = df.dropna(axis=0, how='any')
        self.df = without_nulls
        return without_nulls

    def find_hashtags(self, df: pd.DataFrame,
                      column: str = 'original_text',
                      target: str = 'hashtags') -> pd.DataFrame:
        """
        Extract the hashtags of every tweet into a new column.

        Parameters
        ----------
        df : frame holding the tweet text.
        column : name of the text column to scan.
        target : name of the column that receives, per row, the list of
            hashtags found in ``column`` (an empty list when there are
            none; null text stays null).

        Returns
        -------
        The same ``df`` object with the ``target`` column added.
        """
        # ``re.findall`` only accepts a single string, so the pattern is
        # applied row by row through the vectorised string accessor.
        df[target] = df[column].str.findall(self._HASHTAG_PATTERN)
        self.df = df
        return df

    def text_category(self, series: pd.Series) -> list:
        """
        Label each polarity as ``"positive"``, ``"negative"`` or
        ``"neutral"``.

        Values that match none of the comparisons (e.g. NaN) are labelled
        ``'UNK'``.

        Returns
        -------
        A list of labels, one per element of ``series``, in order.
        """
        # NOTE(review): non-zero values whose magnitude is below the
        # epsilon also end up as 'UNK' -- confirm this gap is intended.
        epsilon = self._POLARITY_EPSILON
        polarities = []
        for pol in series:
            if pol >= epsilon:
                polarities.append("positive")
            elif pol == 0.00000000000:
                polarities.append("neutral")
            elif pol <= -epsilon:
                polarities.append("negative")
            else:
                polarities.append('UNK')
        return polarities

    def fill_missing(self, df: pd.DataFrame, column: str,
                     value: str = 'unknown') -> pd.DataFrame:
        """
        Fill null values of a specific column with the provided value.

        ``df`` is modified in place and returned.
        """
        df[column] = df[column].fillna(value)
        return df

    def replace_empty_string(self, df: pd.DataFrame, column: str,
                             value: str) -> pd.DataFrame:
        """
        Replace empty strings in a specific column with the provided value.

        ``df`` is modified in place and returned.
        """
        df[column] = df[column].apply(lambda x: value if x == "" else x)
        return df

    def remove_characters(self, df: pd.DataFrame,
                          column: str) -> pd.DataFrame:
        """
        Remove non-alphanumeric characters, with the exception of
        underscore, hyphen and whitespace, from the specified column.

        Cells that are not strings (e.g. nulls) are left untouched instead
        of raising.  ``df`` is modified in place and returned.
        """
        disallowed = re.compile(r"[^a-zA-Z0-9\s_-]")

        def strip_disallowed(text):
            # re.sub raises TypeError on NaN/None, so pass those through.
            if not isinstance(text, str):
                return text
            return disallowed.sub("", text)

        df[column] = df[column].apply(strip_disallowed)
        return df

    def extract_device_name(self, df: pd.DataFrame) -> pd.Series:
        """
        Reduce the ``source`` column to the device name.

        Every ``<...>`` tag (with its surrounding whitespace) is replaced
        by a single space and the result is stripped, which leaves the
        text that sat between the tags.  ``df['source']`` is overwritten
        in place.

        Returns
        -------
        The updated ``source`` column.
        """
        df["source"] = (
            df["source"]
            .str.replace(r"(\s*\<.*?\>\s*)", " ", regex=True)
            .str.strip()
        )
        return df["source"]
if __name__ == "__main__":
    # Script entry point: load each line-delimited JSON twitter dataset
    # and hand the resulting frame to a Clean_Tweets instance.
    # NOTE(review): global_data and african_data are not defined in this
    # file -- presumably they come from `from defaults import *`; confirm.

    # Global dataset.
    global_tweet_df = pd.read_json(global_data, lines=True)
    global_cleaner = Clean_Tweets(global_tweet_df)

    # African dataset.
    african_tweet_df = pd.read_json(african_data, lines=True)
    african_cleaner = Clean_Tweets(african_tweet_df)