|
| 1 | +import argparse |
| 2 | +import numpy as np |
| 3 | +import sys |
| 4 | + |
def generate():
    """Load a GloVe vocab file and vectors file into a normalized matrix.

    Reads ``--vocab_file`` (one "word count" pair per line) and
    ``--vectors_file`` (one "word v1 v2 ..." line per word) from the
    command line.

    Returns:
        (W_norm, vocab, ivocab):
            W_norm: ndarray of shape (vocab_size, vector_dim) with each
                row scaled to unit L2 length, so a dot product between
                rows is a cosine similarity.
            vocab:  dict mapping word -> row index.
            ivocab: dict mapping row index -> word.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--vocab_file', default='vocab.txt', type=str)
    parser.add_argument('--vectors_file', default='vectors.txt', type=str)
    args = parser.parse_args()

    with open(args.vocab_file, 'r') as f:
        # First token of each line is the word; the rest (counts) is ignored.
        words = [line.rstrip().split(' ')[0] for line in f]
    with open(args.vectors_file, 'r') as f:
        vectors = {}
        for line in f:
            vals = line.rstrip().split(' ')
            vectors[vals[0]] = [float(x) for x in vals[1:]]

    vocab_size = len(words)
    vocab = {w: idx for idx, w in enumerate(words)}
    ivocab = {idx: w for idx, w in enumerate(words)}

    vector_dim = len(vectors[ivocab[0]])
    W = np.zeros((vocab_size, vector_dim))
    for word, v in vectors.items():
        # Skip vector entries with no vocab row (e.g. '<unk>'); indexing
        # vocab[word] for such words would raise KeyError.
        if word not in vocab:
            continue
        W[vocab[word], :] = v

    # Normalize each word vector to unit L2 length (NOT unit variance).
    # Rows that are all zero (words in the vocab but missing from the
    # vectors file) would divide by zero and fill the row with NaN, so
    # treat their norm as 1 and leave them as zero vectors.
    d = np.sqrt(np.sum(W ** 2, axis=1))
    d[d == 0] = 1.0
    W_norm = (W.T / d).T
    return (W_norm, vocab, ivocab)
| 36 | + |
def distance(W, vocab, ivocab, input_term, N=100):
    """Solve the word analogy a : b :: c : ? for three space-separated words.

    Computes the analogy vector ``b - a + c``, normalizes it, and ranks
    every vocabulary word by cosine similarity against the rows of W
    (assumed row-normalized), excluding the input words themselves.

    Args:
        W: (vocab_size, dim) row-normalized embedding matrix.
        vocab: dict word -> row index.
        ivocab: dict row index -> word.
        input_term: space-separated string of (at least) three in-vocab words.
        N: number of closest words to show (default 100, matching the
           former module-level global).

    Returns:
        List of (word, cosine_similarity) tuples for the top-N matches,
        or None if the input is too short or contains an OOV word.
    """
    terms = input_term.split(' ')
    if len(terms) < 3:
        print("Only %i words were entered.. three words are needed at the input to perform the calculation\n" % len(terms))
        return

    vecs = {}
    for idx, term in enumerate(terms):
        if term in vocab:
            print('Word: %s  Position in vocabulary: %i' % (term, vocab[term]))
            vecs[idx] = W[vocab[term], :]
        else:
            print('Word: %s  Out of dictionary!\n' % term)
            return

    # Analogy arithmetic: b - a + c.
    vec_result = vecs[1] - vecs[0] + vecs[2]

    # Normalize to unit length; guard the (degenerate) zero-vector case.
    d = np.sqrt(np.sum(vec_result ** 2))
    vec_norm = vec_result / d if d != 0 else vec_result

    # Cosine similarity of every vocab word against the analogy vector.
    dist = np.dot(W, vec_norm.T)

    # Exclude the query words themselves from the ranking.
    for term in terms:
        dist[vocab[term]] = -np.inf

    top = np.argsort(-dist)[:N]

    print("\n                               Word       Cosine distance\n")
    print("---------------------------------------------------------\n")
    results = [(ivocab[i], float(dist[i])) for i in top]
    for word, sim in results:
        print("%35s\t\t%f\n" % (word, sim))
    return results
| 70 | + |
if __name__ == "__main__":
    N = 100  # number of closest words that will be shown
    W, vocab, ivocab = generate()
    # Interactive loop: read three words per line until the user types EXIT.
    # Python 3: raw_input() was renamed to input().
    while True:
        input_term = input("\nEnter three words (EXIT to break): ")
        if input_term == 'EXIT':
            break
        distance(W, vocab, ivocab, input_term)
0 commit comments