-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathprocess_tweet.py
More file actions
138 lines (108 loc) · 3.76 KB
/
process_tweet.py
File metadata and controls
138 lines (108 loc) · 3.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import re
from langdetect import detect
import csv
import codecs
import nltk
#def filterEnglish(lists): #filters out words that are not in English
# return [tweet for tweet in lists if detect(tweet) == 'en']
# Module-level vocabulary of feature words; read by extract_features() and
# (re)populated during bulk feature extraction over the training CSV.
featureList = []
def processTweet(tweet):
    """Normalize a raw tweet string for feature extraction.

    Lower-cases the text, replaces URLs with the token 'URL' and
    @mentions with 'AT_USER', collapses whitespace runs, strips the
    leading '#' from hashtags, and trims surrounding quote characters.
    Returns the normalized string.
    """
    # Convert to lower case
    tweet = tweet.lower()
    # Convert www.* or https?://* to URL (raw strings: '\s'/'\.' in a
    # non-raw literal is an invalid escape and warns on modern Python)
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    # Convert @username to AT_USER
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
    # Collapse additional white space to single spaces
    tweet = re.sub(r'[\s]+', ' ', tweet)
    # Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # Trim surrounding quotes
    return tweet.strip('\'"')
def handleRepeatedLetters(s):
    """Collapse any run of three or more identical characters to exactly two.

    E.g. 'coooool' -> 'cool'. Runs of length one or two are left untouched.
    """
    run_pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
    return run_pattern.sub(lambda m: m.group(1) * 2, s)
def stopWords(file):
    """Load stop words from *file* (one word per line).

    Always prepends the synthetic tokens 'AT_USER' and 'URL' produced by
    processTweet(). Returns the list of stop words.
    """
    stop_words = ['AT_USER', 'URL']
    # Bug fix: the original iterated fp.readline() — i.e. the CHARACTERS of
    # the first line — instead of the lines of the file.
    with open(file, 'r') as fp:
        for line in fp:
            word = line.strip()
            if word:  # skip blank lines
                stop_words.append(word)
    return stop_words
def getFeatureVector(tweet):
    """Tokenize a preprocessed tweet into cleaned, lower-cased feature words.

    Each whitespace-separated token has repeated letters collapsed and
    surrounding punctuation stripped; only tokens that start with a letter
    and contain just letters/digits are kept.

    NOTE(review): despite the original "ignore if it is a stop word"
    comment, no stop-word filtering is actually performed here.
    """
    features = []
    for token in tweet.split():
        # Collapse 3+ repeated letters down to two
        token = handleRepeatedLetters(token)
        # Strip leading/trailing quotes and punctuation
        token = token.strip('\'"?,.')
        # Keep only purely alphanumeric tokens that start with a letter
        if re.match(r"[a-zA-Z][a-zA-Z0-9]*$", token):
            features.append(token.lower())
    return features
def extract_features(tweet):
    """Build the NLTK feature dict for one tokenized tweet.

    For every word in the module-level ``featureList`` vocabulary, records
    whether that word occurs in *tweet* (an iterable of tokens).
    """
    present = set(tweet)
    return {'contains(e%s)' % word: word in present for word in featureList}
def bulkFeatureExtraction(fileName):
    """Read a CSV of (sentiment, tweet) rows and build an NLTK training set.

    Rebuilds the module-level ``featureList`` vocabulary as a side effect
    (extract_features reads it at classification time) and returns the
    lazy training set from nltk.classify.util.apply_features.

    Fixes vs. original: the ``def`` line had been commented out (leaving a
    bare ``return`` at module level), and the vocabulary was built by
    appending whole feature-vector LISTS, so ``list(set(featureList))``
    raised TypeError (lists are unhashable) — we extend with words instead.
    """
    global featureList  # shared with extract_features()
    tweets = []
    all_words = []
    with codecs.open(fileName, encoding='utf-8', errors='replace') as f:
        reader = csv.reader(f)
        for row in reader:
            sentiment = row[0]
            tweet = row[1]
            processedTweet = processTweet(tweet)
            featureVector = getFeatureVector(processedTweet)
            all_words.extend(featureVector)
            tweets.append((featureVector, sentiment))
    # Remove duplicate feature words from the vocabulary
    featureList = list(set(all_words))
    # Extract the feature vector for all tweets in one shot (lazy mapping)
    return nltk.classify.util.apply_features(extract_features, tweets)
def addTweetsToDict(fileName):
    """Load a CSV of (sentiment, tweet) rows into a {tweet: sentiment} dict.

    Each row's column 0 is the sentiment label and column 1 the tweet text;
    later duplicate tweets overwrite earlier ones.

    Fixes vs. original: ``open(file, 'r')`` referenced the undefined name
    ``file`` (NameError) instead of the ``fileName`` parameter, and the
    file is now opened via a context manager.
    """
    tweets_dictionary = {}
    with open(fileName, 'r') as f:
        for row in csv.reader(f):
            tweet = row[1]
            sentiment = row[0]
            tweets_dictionary[tweet] = sentiment
    return tweets_dictionary
'''def plot():
labels = 'Positive', 'Negative'
sizes = [.34, .66]
colors = ['yellowgreen', 'mediumpurple', 'lightskyblue', 'lightcoral']
explode = (0, 0.1) # proportion with which to offset each wedge
plt.pie(sizes, # data
explode=explode, # offset parameters
labels=labels, # slice labels
colors=colors, # array of colours
autopct='%1.1f%%', # print the values inside the wedges
shadow=True, # enable shadow
startangle=70 # starting angle
)
plt.axis('equal')
plt.savefig('trump.png')'''