-
Notifications
You must be signed in to change notification settings - Fork 18
Expand file tree
/
Copy pathp16-course-clustering.py
More file actions
70 lines (54 loc) · 1.88 KB
/
p16-course-clustering.py
File metadata and controls
70 lines (54 loc) · 1.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from pathlib import Path

import matplotlib.pyplot as plot
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
def course_to_level(num: int) -> int:
    """Map a course number to a coarse level between 1 and 5.

    The mapping is:

    * below 200      -> 1
    * 200 to 299     -> 2
    * 300 to 399     -> 3
    * 400 to 499     -> 4
    * 500 to 999     -> 5
    * 1000 and above -> 1

    Args:
        num: The numeric course number.

    Returns:
        The level (1-5) assigned to ``num``.
    """
    # Numbers of 1000 and above are grouped with level 1 rather than
    # continuing the hundreds-based scale.
    if num >= 1000:
        return 1

    # Each upper bound closes one level; the first bound that ``num`` falls
    # under decides the level.
    for level, upper_bound in enumerate((200, 300, 400, 500), start=1):
        if num < upper_bound:
            return level

    # Everything from 500 up to 999 shares the top level.
    return 5
# Load the course catalog (JSON Lines: one course object per line). The
# "number", "title" and "description" columns are the ones used below.
df = pd.read_json("data/midd_cs_courses.jsonl", lines=True)
print(df[["number", "title"]])

# Represent every course description as a dense TF-IDF vector (one row each).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df.description).toarray()

# Course numbers label the points; levels provide a color per point.
numbers = df.number.to_numpy()
levels = [course_to_level(n) for n in df.number.to_list()]

## TODO: improve this visualization of CS courses.
#
## IDEAS: edges
#    Create edges between courses that are in the same cluster,
#    or that are prerequisites of each other (number mentioned in text?).
#    'plot([x1, x2], [y1, y2])' draws a line...
## IDEAS: compare PCA to TSNE
#    PCA doesn't have a perplexity parameter.
#    What does TSNE do better on this dataset?
## IDEAS: kmeans
#    Create a kmeans clustering of the courses.
#    Then apply colors based on the kmeans prediction to the t-SNE graph below.

# Project the TF-IDF vectors to 2-D. The seed is fixed so reruns give the same
# layout. NOTE: scikit-learn requires perplexity < number of samples.
perplexity = 15
viz = TSNE(perplexity=perplexity, random_state=42)
V = viz.fit_transform(X)

# Right now, let's assign colors to our class-nodes based on their number.
color_values = levels  # TODO swap this.

plot.title(f"T-SNE(Courses), perplexity={perplexity}")
plot.scatter(V[:, 0], V[:, 1], alpha=1, s=10, c=color_values, cmap="turbo")

# Annotate the scattered points with their course number.
for course_num, (x, y) in zip(numbers, V):
    plot.annotate(str(course_num), (x, y))

# savefig does not create missing directories, so make sure "graphs/" exists.
out_path = Path("graphs") / f"p16-tsne-courses-p{perplexity}.png"
out_path.parent.mkdir(parents=True, exist_ok=True)

# Save before show(): the figure may be cleared once the window is closed.
plot.savefig(out_path)
plot.show()