-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path01_Word2Vec_Demo.py
More file actions
167 lines (136 loc) · 5.98 KB
/
01_Word2Vec_Demo.py
File metadata and controls
167 lines (136 loc) · 5.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""
01_Word2Vec_Demo.py — Word Embedding Training & Visualization
Demonstrates:
1. Training Word2Vec (Skip-gram) on a small corpus using Gensim
2. Exploring word similarity and analogy (king - man + woman = queen)
3. Visualizing embeddings in 2D via t-SNE
Prerequisites: pip install gensim matplotlib scikit-learn
"""
import numpy as np
# ---------------------------------------------------------------------------
# 1. Corpus & Training
# ---------------------------------------------------------------------------
# Small demo corpus — in production, use Wikipedia / BookCorpus
corpus = [
"the king ruled the kingdom with wisdom",
"the queen ruled the kingdom with grace",
"the prince trained with the knight in the castle",
"the princess studied language and science",
"a man and a woman walked through the village",
"the knight defended the castle from invaders",
"wisdom and grace are virtues of a good ruler",
"the kingdom prospered under fair rule",
"science and language open doors to knowledge",
"the village celebrated the harvest festival",
"a king and queen attended the royal banquet",
"the prince and princess explored the forest",
"knowledge is power said the wise scholar",
"the castle stood tall above the green valley",
"the knight rode his horse across the kingdom",
]
tokenized = [sentence.split() for sentence in corpus]
try:
from gensim.models import Word2Vec
model = Word2Vec(
sentences=tokenized,
vector_size=50, # embedding dimension
window=3, # context window
min_count=1, # include all words
sg=1, # 1 = Skip-gram, 0 = CBOW
epochs=200, # more epochs for small corpus
seed=42,
)
# ------------------------------------------------------------------
# 2. Similarity & Analogy
# ------------------------------------------------------------------
print("=" * 60)
print("Word2Vec Demo — Similarity & Analogy")
print("=" * 60)
print("\n[Most similar to 'king']:")
for word, score in model.wv.most_similar("king", topn=5):
print(f" {word:15s} {score:.4f}")
print("\n[Most similar to 'castle']:")
for word, score in model.wv.most_similar("castle", topn=5):
print(f" {word:15s} {score:.4f}")
# Analogy: king - man + woman ≈ queen
print("\n[Analogy] king - man + woman = ?")
try:
results = model.wv.most_similar(
positive=["king", "woman"], negative=["man"], topn=3
)
for word, score in results:
print(f" {word:15s} {score:.4f}")
except KeyError as e:
print(f" Word not in vocabulary: {e}")
# ------------------------------------------------------------------
# 3. t-SNE Visualization
# ------------------------------------------------------------------
try:
import matplotlib
matplotlib.use("Agg") # non-interactive backend
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
words = list(model.wv.key_to_index.keys())
vectors = np.array([model.wv[w] for w in words])
tsne = TSNE(n_components=2, random_state=42, perplexity=min(5, len(words) - 1))
coords = tsne.fit_transform(vectors)
plt.figure(figsize=(12, 8))
plt.scatter(coords[:, 0], coords[:, 1], s=40, alpha=0.6)
for i, word in enumerate(words):
plt.annotate(word, (coords[i, 0], coords[i, 1]), fontsize=9)
plt.title("Word2Vec Embeddings — t-SNE Projection")
plt.tight_layout()
out_path = "word2vec_tsne.png"
plt.savefig(out_path, dpi=150)
print(f"\n[Visualization saved to {out_path}]")
except ImportError:
print("\n[Skipping visualization — install matplotlib & scikit-learn]")
except ImportError:
print("Gensim not installed. Run: pip install gensim")
print("\nFalling back to manual Skip-gram illustration...\n")
# ------------------------------------------------------------------
# Fallback: Pure NumPy Skip-gram (educational)
# ------------------------------------------------------------------
vocab = sorted(set(w for s in tokenized for w in s))
w2i = {w: i for i, w in enumerate(vocab)}
V = len(vocab)
D = 10 # small dimension for demo
np.random.seed(42)
W_in = np.random.randn(V, D) * 0.1 # input embeddings
W_out = np.random.randn(D, V) * 0.1 # output embeddings
def softmax(x):
e = np.exp(x - x.max())
return e / e.sum()
lr = 0.05
window = 2
for epoch in range(50):
loss = 0.0
for sentence in tokenized:
for i, target in enumerate(sentence):
for j in range(max(0, i - window), min(len(sentence), i + window + 1)):
if i == j:
continue
context = sentence[j]
t_idx = w2i[target]
c_idx = w2i[context]
hidden = W_in[t_idx] # (D,)
scores = hidden @ W_out # (V,)
probs = softmax(scores) # (V,)
loss -= np.log(probs[c_idx] + 1e-9)
# Gradient
grad_out = probs.copy()
grad_out[c_idx] -= 1.0 # (V,)
W_out -= lr * np.outer(hidden, grad_out)
W_in[t_idx] -= lr * (W_out @ grad_out)
if (epoch + 1) % 10 == 0:
print(f" Epoch {epoch+1:3d} Loss: {loss:.2f}")
# Cosine similarity
def cosine(a, b):
return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9)
print("\n[Cosine similarities from pure NumPy Skip-gram]:")
for pair in [("king", "queen"), ("king", "castle"), ("man", "woman")]:
if pair[0] in w2i and pair[1] in w2i:
sim = cosine(W_in[w2i[pair[0]]], W_in[w2i[pair[1]]])
print(f" {pair[0]:10s} ↔ {pair[1]:10s} sim = {sim:.4f}")
if __name__ == "__main__":
pass