-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstep4_topic_modeling_nmf.py
More file actions
88 lines (64 loc) · 2.47 KB
/
step4_topic_modeling_nmf.py
File metadata and controls
88 lines (64 loc) · 2.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# -*- coding: utf-8 -*-
"""
Step 4: topic modeling of preprocessed tweets with NMF.

Created on Sun Mar 26 21:27:53 2017
Reference/Source: Prof. Gene Lee's codes from Dropbox code files. Modified.

Pipeline:
  1. Read one raw tweet per line from 'processed_tweets_10000.json'.
  2. Lowercase each line, drop stopwords and tokens of <= 2 chars,
     Lancaster-stem the remaining words.
  3. Build a TF-IDF document-term matrix over the cleaned corpus.
  4. Fit NMF with 10 topics and print the top words per topic.
  5. Sweep topic counts 5, 10, ..., 45 and print the reconstruction
     error for each, to help pick a topic count.
"""
import numpy as np  # a conventional alias
from nltk.stem.lancaster import LancasterStemmer
ls = LancasterStemmer()
import json
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from sklearn import decomposition

# English stopwords plus corpus-specific noise: JSON fragments ('"id',
# 'null', 'false'), unicode-escape residue ('ud', 'udc', 'uddf', 'ude'),
# and Trump-related tokens that would otherwise dominate every topic.
# A set gives O(1) membership tests; the original list was O(n) per word
# and contained duplicate entries ('udc', 'uddf').
stopwords = set(nltk.corpus.stopwords.words('english') + [
    '"rt', 'trump', 'donald', 'amp', 'uddf', '"trump', '"en', 'url', 'pbs',
    'lang', 'null', '"id', 'false', 'udc', 'trump"', 'spi', 'admin', 'https',
    'thi', ' trump', 'trump ', 'trump\"', 'trump\",', 'ud', 'ude', 'lik',
    'know',
])

# Read one tweet per line; 'with' guarantees the handle is closed
# (the original opened the file and never closed it).
with open('processed_tweets_10000.json', 'r') as infile:
    content = infile.readlines()

fnl_tweets = []
for tweet in content:
    tweet = tweet.lower()
    # Keep stemmed words longer than 2 chars that are not stopwords.
    cln = [ls.stem(word) for word in tweet.split()
           if word not in stopwords and len(word) > 2]
    # join is linear; the original '+=' loop was quadratic and left a
    # leading space, which the vectorizer's tokenizer ignores anyway.
    fnl_tweets.append(' '.join(cln))

corpus = fnl_tweets

print('num of documents, num of unique words')
# min_df=2 drops words appearing in fewer than 2 documents.
vectorizer = TfidfVectorizer(stop_words='english', min_df=2)
dtm = vectorizer.fit_transform(corpus)
print(dtm.shape)
print(len(corpus))

# get_feature_names() was removed in scikit-learn 1.2; prefer the modern
# get_feature_names_out() and fall back for old installations.
try:
    vocab = vectorizer.get_feature_names_out()
except AttributeError:
    vocab = vectorizer.get_feature_names()
print(len(vocab), '# of unique words')
print(vocab[-10:])
print(vocab[:10])

# Fit NMF with a fixed topic count; random_state pins the factorization.
num_topics = 10
clf = decomposition.NMF(n_components=num_topics, random_state=1)
doctopic = clf.fit_transform(dtm)
print(num_topics, clf.reconstruction_err_)

# Collect the highest-weight words for each topic.
topic_words = []
num_top_words = 5
for topic in clf.components_:
    # argsort ascending, reversed -> indexes of the largest weights first.
    word_idx = np.argsort(topic)[::-1][:num_top_words]
    topic_words.append([vocab[i] for i in word_idx])

print("**" * 10)
for t, words in enumerate(topic_words):
    print("Topic {}: {}".format(t, ' '.join(words[:15])))

# Sweep topic counts (5, 10, ..., 45) and report reconstruction error
# for each, to compare model fits. (Duplicate 'from sklearn import
# decomposition' and an unused num_top_words assignment were removed.)
print(dtm.shape)
for n in range(1, 10):
    num_topics = 5 * n
    clf = decomposition.NMF(n_components=num_topics, random_state=1)
    doctopic = clf.fit_transform(dtm)
    print(num_topics, clf.reconstruction_err_)