Live-Twitter-Sentiment-Analysis/Classifier_Trainer.py at master · sarthak0797/Live-Twitter-Sentiment-Analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews,stopwords
import pickle
import gzip
import random
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
import string


# Load the raw corpora. Context managers guarantee the input files are closed
# even if a read fails (the original left both handles open).
with open("positive.txt", "r") as pos_file:
    short_pos = pos_file.read()
with open("negative.txt", "r") as neg_file:
    short_neg = neg_file.read()

# Build the labelled corpus: one (text, label) pair per line, positives first.
# NOTE: split('\n') is kept deliberately (a trailing newline yields one empty
# document per file) because the train/test split later in this script slices
# the resulting list by fixed positions.
document = [(line, "pos") for line in short_pos.split('\n')]
document.extend((line, "neg") for line in short_neg.split('\n'))

# Persist the labelled documents so other scripts can reload them.
with open("pickled_algos/documents.pickle", "wb") as save_doc:
    pickle.dump(document, save_doc)


# Tokenize both corpora; these token lists feed the vocabulary below.
short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)

# Punctuation characters collected as a stop-word set.
# NOTE(review): stop_words is built but never applied to the vocabulary in
# this script - confirm whether filtering was intended.
punctuation = list(string.punctuation)
stop_words = set(punctuation)

# Case-insensitive frequency distribution over every token in both corpora.
all_words = nltk.FreqDist(
    token.lower() for token in short_pos_words + short_neg_words
)

# Keep the 5000 most frequent words as unigram features. FreqDist is a
# Counter subclass, so slicing list(all_words.keys()) would return the first
# 5000 words *encountered*, not the 5000 most common ones.
word_features = [word for word, _count in all_words.most_common(5000)]

# Persist the vocabulary so feature extraction can be reproduced elsewhere.
with open("pickled_algos/word_features.pickle", "wb") as save_doc:
    pickle.dump(word_features, save_doc)

def find(documents):
    """Build the feature dictionary for a single piece of text.

    Args:
        documents: The raw text of one document (a string); it is tokenized
            with ``word_tokenize``.

    Returns:
        A dict with one boolean entry per word in the module-level
        ``word_features`` (True when the word occurs in the text), plus one
        ``True`` entry for each of the text's top two bigrams (as tuples)
        ranked by the chi-squared association measure.
    """
    tokens = word_tokenize(documents)

    # word_features is built from lowercased tokens, so membership must be
    # tested against lowercased tokens too; a set makes each of the (up to
    # 5000) lookups O(1) instead of a scan over the token list.
    seen = {token.lower() for token in tokens}
    temp = {w: (w in seen) for w in word_features}

    # Add the two strongest collocations of this text as extra features.
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    for bigram in bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2):
        temp[bigram] = True

    return temp

# Turn every labelled document into a (feature_dict, label) pair.
features = [(find(text), label) for (text, label) in document]

# Fixed positional train/test split. Which documents land in which set
# depends on how many lines each input file contributed to `document`.
training_set = features[:4600] + features[5000:10300]
testing_set = features[4600:5000] + features[10300:]

# Train the Naive Bayes model and persist it; the context manager closes the
# file even if pickling fails.
classifier = NaiveBayesClassifier.train(training_set)
with open("pickled_algos/classifier.pickle", 'wb') as classify_buffer:
    pickle.dump(classifier, classify_buffer)

# Report accuracy on the held-out slice and the most discriminative features.
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

classifier.show_most_informative_features(15)