-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathprocess_tweet.py
More file actions
138 lines (108 loc) · 3.76 KB
/
process_tweet.py
File metadata and controls
138 lines (108 loc) · 3.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import re
from langdetect import detect
import csv
import codecs
import nltk
#def filterEnglish(lists): #filters out words that are not in English
# return [tweet for tweet in lists if detect(tweet) == 'en']
# Module-level vocabulary of feature words; read by extract_features() and
# (re)populated during bulk feature extraction over the training CSV.
featureList = []
def processTweet(tweet):
    """Normalize a raw tweet string for feature extraction.

    Lower-cases the text, replaces URLs with the token 'URL' and
    @mentions with 'AT_USER', collapses whitespace runs, strips the
    leading '#' from hashtags, and trims surrounding quote characters.
    Returns the normalized string.
    """
    # Convert to lower case
    tweet = tweet.lower()
    # Convert www.* or https?://* to URL (raw strings: '\s'/'\.' in a
    # non-raw literal is an invalid escape and warns on modern Python)
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    # Convert @username to AT_USER
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
    # Collapse additional white space to single spaces
    tweet = re.sub(r'[\s]+', ' ', tweet)
    # Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # Trim surrounding quotes
    return tweet.strip('\'"')
def handleRepeatedLetters(s):
    """Collapse any run of three or more identical characters to exactly two.

    E.g. 'coooool' -> 'cool'. Runs of length one or two are left untouched.
    """
    run_pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
    return run_pattern.sub(lambda m: m.group(1) * 2, s)
def stopWords(file):
    """Load stop words from *file* (one word per line).

    Always prepends the synthetic tokens 'AT_USER' and 'URL' produced by
    processTweet(). Returns the list of stop words.
    """
    stop_words = ['AT_USER', 'URL']
    # Bug fix: the original iterated fp.readline() — i.e. the CHARACTERS of
    # the first line — instead of the lines of the file.
    with open(file, 'r') as fp:
        for line in fp:
            word = line.strip()
            if word:  # skip blank lines
                stop_words.append(word)
    return stop_words
def getFeatureVector(tweet):
    """Tokenize a preprocessed tweet into cleaned, lower-cased feature words.

    Each whitespace-separated token has repeated letters collapsed and
    surrounding punctuation stripped; only tokens that start with a letter
    and contain just letters/digits are kept.

    NOTE(review): despite the original "ignore if it is a stop word"
    comment, no stop-word filtering is actually performed here.
    """
    features = []
    for token in tweet.split():
        # Collapse 3+ repeated letters down to two
        token = handleRepeatedLetters(token)
        # Strip leading/trailing quotes and punctuation
        token = token.strip('\'"?,.')
        # Keep only purely alphanumeric tokens that start with a letter
        if re.match(r"[a-zA-Z][a-zA-Z0-9]*$", token):
            features.append(token.lower())
    return features
def extract_features(tweet):
    """Build the NLTK feature dict for one tokenized tweet.

    For every word in the module-level ``featureList`` vocabulary, records
    whether that word occurs in *tweet* (an iterable of tokens).
    """
    present = set(tweet)
    return {'contains(e%s)' % word: word in present for word in featureList}
def bulkFeatureExtraction(fileName):
    """Read a CSV of (sentiment, tweet) rows and build an NLTK training set.

    Rebuilds the module-level ``featureList`` vocabulary as a side effect
    (extract_features reads it at classification time) and returns the
    lazy training set from nltk.classify.util.apply_features.

    Fixes vs. original: the ``def`` line had been commented out (leaving a
    bare ``return`` at module level), and the vocabulary was built by
    appending whole feature-vector LISTS, so ``list(set(featureList))``
    raised TypeError (lists are unhashable) — we extend with words instead.
    """
    global featureList  # shared with extract_features()
    tweets = []
    all_words = []
    with codecs.open(fileName, encoding='utf-8', errors='replace') as f:
        reader = csv.reader(f)
        for row in reader:
            sentiment = row[0]
            tweet = row[1]
            processedTweet = processTweet(tweet)
            featureVector = getFeatureVector(processedTweet)
            all_words.extend(featureVector)
            tweets.append((featureVector, sentiment))
    # Remove duplicate feature words from the vocabulary
    featureList = list(set(all_words))
    # Extract the feature vector for all tweets in one shot (lazy mapping)
    return nltk.classify.util.apply_features(extract_features, tweets)
def addTweetsToDict(fileName):
    """Load a CSV of (sentiment, tweet) rows into a {tweet: sentiment} dict.

    Each row's column 0 is the sentiment label and column 1 the tweet text;
    later duplicate tweets overwrite earlier ones.

    Fixes vs. original: ``open(file, 'r')`` referenced the undefined name
    ``file`` (NameError) instead of the ``fileName`` parameter, and the
    file is now opened via a context manager.
    """
    tweets_dictionary = {}
    with open(fileName, 'r') as f:
        for row in csv.reader(f):
            tweet = row[1]
            sentiment = row[0]
            tweets_dictionary[tweet] = sentiment
    return tweets_dictionary
'''def plot():
labels = 'Positive', 'Negative'
sizes = [.34, .66]
colors = ['yellowgreen', 'mediumpurple', 'lightskyblue', 'lightcoral']
explode = (0, 0.1) # proportion with which to offset each wedge
plt.pie(sizes, # data
explode=explode, # offset parameters
labels=labels, # slice labels
colors=colors, # array of colours
autopct='%1.1f%%', # print the values inside the wedges
shadow=True, # enable shadow
startangle=70 # starting angle
)
plt.axis('equal')
plt.savefig('trump.png')'''