-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathproject_Solutions.txt
More file actions
131 lines (90 loc) · 3.17 KB
/
project_Solutions.txt
File metadata and controls
131 lines (90 loc) · 3.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
Task 1
# Importing modules
import pandas as pd
papers = pd.read_csv('datasets/papers.csv')
print(papers.head())
Task 2
# Remove the columns
papers.drop(['id', 'event_type', 'pdf_name'],axis=1,inplace=True)
print(papers.head())
Task 3
# Group the papers by year
groups = papers.groupby('year')
# Determine the size of each group
counts = groups.size()
# Visualise the counts as a bar plot
import matplotlib.pyplot
%matplotlib inline
counts.plot.bar()
Task 4
https://docs.python.org/3/library/re.html
# Load the regular expression library
import re
# Print the titles of the first rows
print(papers['title'].head())
# Remove punctuation
papers['title_processed'] = papers['title'].map(lambda x: re.sub('[,\.!?]', '', x))
papers['title_processed'] = papers['title_processed'].str.lower()
print(papers['title_processed'].head())
Task 5
http://amueller.github.io/word_cloud/
https://github.com/amueller/word_cloud
# Import the wordcloud library
import wordcloud
# Join the different processed titles together.
long_string = ''.join(papers['title_processed'].str.cat(sep=' '))
# Create a WordCloud object
wordcloud = wordcloud.WordCloud()
# Generate a word cloud
wordcloud.generate(long_string)
# Visualize the word cloud
wordcloud.to_image()
Task 6
# Load the library with the CountVectorizer method
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# Helper function
def plot_10_most_common_words(count_data, count_vectorizer):
import matplotlib.pyplot as plt
words = count_vectorizer.get_feature_names()
total_counts = np.zeros(len(words))
for t in count_data:
total_counts+=t.toarray()[0]
count_dict = (zip(words, total_counts))
count_dict = sorted(count_dict, key=lambda x:x[1], reverse=True)[0:10]
words = [w[0] for w in count_dict]
counts = [w[1] for w in count_dict]
x_pos = np.arange(len(words))
plt.bar(x_pos, counts,align='center')
plt.xticks(x_pos, words, rotation=90)
plt.xlabel('words')
plt.ylabel('counts')
plt.title('10 most common words')
plt.show()
# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english')
# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(papers['title'])
# Visualise the 10 most common words
plot_10_most_common_words(count_data,count_vectorizer)
Task 7
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
words = count_vectorizer.get_feature_names()
for topic_idx, topic in enumerate(model.components_):
print("\nTopic #%d:" % topic_idx)
print(" ".join([words[i]
for i in topic.argsort()[:-n_top_words - 1:-1]]))
# Tweak the two parameters below (use int values below 15)
number_topics = 10
number_words = 10
# Create and fit the LDA model
lda = LDA(n_components=number_topics)
lda.fit(count_data)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)