Skip to content

Commit 6e9df02

Browse files
Russell StewartRussell Stewart
authored and committed
Merge pull request #7 from ferhtaydn/feature/word2vec_distance_analogy
word2vec's distance and word-analogy scripts are ported to python for…
2 parents dae5ca3 + 9cc2c13 commit 6e9df02

File tree

2 files changed

+156
-0
lines changed

2 files changed

+156
-0
lines changed

eval/python/distance.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import argparse
2+
import numpy as np
3+
import sys
4+
5+
def generate():
    """Load the vocabulary and word vectors named on the command line and
    return row-normalized embeddings.

    Command-line flags:
        --vocab_file:   one "<word> <count>" entry per line (default vocab.txt).
        --vectors_file: one "<word> v1 v2 ..." entry per line (default vectors.txt).

    Returns:
        (W_norm, vocab, ivocab) where W_norm is a (vocab_size, dim) ndarray of
        unit-length word vectors, vocab maps word -> row index, and ivocab maps
        row index -> word.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--vocab_file', default='vocab.txt', type=str)
    parser.add_argument('--vectors_file', default='vectors.txt', type=str)
    args = parser.parse_args()

    with open(args.vocab_file, 'r') as f:
        # Each vocab line is "<word> <count>"; keep only the word.
        words = [line.rstrip().split(' ')[0] for line in f]
    with open(args.vectors_file, 'r') as f:
        vectors = {}
        for line in f:
            vals = line.rstrip().split(' ')
            vectors[vals[0]] = [float(x) for x in vals[1:]]

    vocab_size = len(words)
    vocab = {w: idx for idx, w in enumerate(words)}
    ivocab = {idx: w for idx, w in enumerate(words)}

    vector_dim = len(vectors[ivocab[0]])
    W = np.zeros((vocab_size, vector_dim))
    for word, v in vectors.items():
        if word == '<unk>':
            continue
        W[vocab[word], :] = v

    # Normalize each word vector to unit length.  Rows that are all zeros
    # (e.g. '<unk>', which is skipped above) would otherwise divide by zero
    # and fill the row with NaN, poisoning every later dot product; clamp
    # their norm to 1 so they simply stay zero.
    d = np.sqrt(np.sum(W ** 2, axis=1))
    d[d == 0] = 1.0
    W_norm = (W.T / d).T
    return (W_norm, vocab, ivocab)
35+
36+
37+
def distance(W, vocab, ivocab, input_term, N=100):
    """Print the N vocabulary words closest (by cosine similarity) to the
    sum of the vectors of the words in input_term.

    Args:
        W: (vocab_size, dim) ndarray of row-normalized word vectors.
        vocab: dict mapping word -> row index into W.
        ivocab: dict mapping row index -> word.
        input_term: space-separated word(s); their vectors are summed.
        N: number of nearest neighbors to display (default 100, matching
           the module-level global the script previously relied on).

    Prints a message and returns early if any word is out of vocabulary.
    """
    vec_result = None
    for idx, term in enumerate(input_term.split(' ')):
        if term in vocab:
            print('Word: %s Position in vocabulary: %i' % (term, vocab[term]))
            if idx == 0:
                # Copy: W[row, :] is a view, and += on it would silently
                # overwrite the embedding matrix for multi-word queries.
                vec_result = np.array(W[vocab[term], :])
            else:
                vec_result += W[vocab[term], :]
        else:
            print('Word: %s Out of dictionary!\n' % term)
            return

    # Normalize the query vector; guard against an all-zero query.
    d = np.sqrt(np.sum(vec_result ** 2))
    vec_norm = vec_result / d if d != 0 else vec_result

    # Cosine similarity against every (unit-length) row of W.
    dist = np.dot(W, vec_norm.T)

    # Exclude the query words themselves from the ranking.
    # (-np.inf: the np.Inf alias was removed in NumPy 2.0.)
    for term in input_term.split(' '):
        dist[vocab[term]] = -np.inf

    a = np.argsort(-dist)[:N]

    print("\n Word Cosine distance\n")
    print("---------------------------------------------------------\n")
    for x in a:
        print("%35s\t\t%f\n" % (ivocab[x], dist[x]))
65+
66+
67+
if __name__ == "__main__":
68+
N = 100; # number of closest words that will be shown
69+
W, vocab, ivocab = generate()
70+
while True:
71+
input_term = raw_input("\nEnter word or sentence (EXIT to break): ")
72+
if input_term == 'EXIT':
73+
break
74+
else:
75+
distance(W, vocab, ivocab, input_term)
76+

eval/python/word_analogy.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import argparse
2+
import numpy as np
3+
import sys
4+
5+
def generate():
    """Load the vocabulary and word vectors named on the command line and
    return row-normalized embeddings for the analogy task.

    Command-line flags:
        --vocab_file:   one "<word> <count>" entry per line (default vocab.txt).
        --vectors_file: one "<word> v1 v2 ..." entry per line (default vectors.txt).

    Returns:
        (W_norm, vocab, ivocab) where W_norm is a (vocab_size, dim) ndarray of
        unit-length word vectors, vocab maps word -> row index, and ivocab maps
        row index -> word.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--vocab_file', default='vocab.txt', type=str)
    parser.add_argument('--vectors_file', default='vectors.txt', type=str)
    args = parser.parse_args()

    with open(args.vocab_file, 'r') as f:
        # Each vocab line is "<word> <count>"; keep only the word.
        words = [line.rstrip().split(' ')[0] for line in f]
    with open(args.vectors_file, 'r') as f:
        vectors = {}
        for line in f:
            vals = line.rstrip().split(' ')
            vectors[vals[0]] = [float(x) for x in vals[1:]]

    vocab_size = len(words)
    vocab = {w: idx for idx, w in enumerate(words)}
    ivocab = {idx: w for idx, w in enumerate(words)}

    vector_dim = len(vectors[ivocab[0]])
    W = np.zeros((vocab_size, vector_dim))
    for word, v in vectors.items():
        if word == '<unk>':
            continue
        W[vocab[word], :] = v

    # Normalize each word vector to unit length.  An all-zero row (e.g.
    # '<unk>', skipped above) would divide by zero and become NaN; clamp
    # its norm to 1 so the row stays zero instead.
    d = np.sqrt(np.sum(W ** 2, axis=1))
    d[d == 0] = 1.0
    W_norm = (W.T / d).T
    return (W_norm, vocab, ivocab)
35+
36+
37+
def distance(W, vocab, ivocab, input_term, N=100):
    """Solve the word analogy "w1 : w2 :: w3 : ?" and print the N closest
    candidates by cosine similarity to (w2 - w1 + w3).

    Args:
        W: (vocab_size, dim) ndarray of row-normalized word vectors.
        vocab: dict mapping word -> row index into W.
        ivocab: dict mapping row index -> word.
        input_term: space-separated words; the first three are used as
            w1, w2, w3 (all entered words are excluded from the results).
        N: number of nearest neighbors to display (default 100, matching
           the module-level global the script previously relied on).

    Prints a message and returns early if fewer than three words are given
    or any word is out of vocabulary.
    """
    terms = input_term.split(' ')
    if len(terms) < 3:
        print("Only %i words were entered.. three words are needed at the input to perform the calculation\n" % len(terms))
        return

    vecs = {}
    for idx, term in enumerate(terms):
        if term in vocab:
            print('Word: %s Position in vocabulary: %i' % (term, vocab[term]))
            vecs[idx] = W[vocab[term], :]
        else:
            print('Word: %s Out of dictionary!\n' % term)
            return

    # Analogy arithmetic: w2 - w1 + w3 (creates a new array, W untouched).
    vec_result = vecs[1] - vecs[0] + vecs[2]

    # Normalize the query vector; guard against an all-zero result.
    d = np.sqrt(np.sum(vec_result ** 2))
    vec_norm = vec_result / d if d != 0 else vec_result

    # Cosine similarity against every (unit-length) row of W.
    dist = np.dot(W, vec_norm.T)

    # Exclude the entered words themselves from the ranking.
    # (-np.inf: the np.Inf alias was removed in NumPy 2.0.)
    for term in terms:
        dist[vocab[term]] = -np.inf

    a = np.argsort(-dist)[:N]

    print("\n Word Cosine distance\n")
    print("---------------------------------------------------------\n")
    for x in a:
        print("%35s\t\t%f\n" % (ivocab[x], dist[x]))
69+
70+
71+
if __name__ == "__main__":
72+
N = 100; # number of closest words that will be shown
73+
W, vocab, ivocab = generate()
74+
while True:
75+
input_term = raw_input("\nEnter three words (EXIT to break): ")
76+
if input_term == 'EXIT':
77+
break
78+
else:
79+
distance(W, vocab, ivocab, input_term)
80+

0 commit comments

Comments
 (0)