-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathword2vec_gensim.py
More file actions
98 lines (85 loc) · 3.08 KB
/
word2vec_gensim.py
File metadata and controls
98 lines (85 loc) · 3.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import csv, pickle, time
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from utils import GeneSeg
from gensim.models.word2vec import Word2Vec
# Hyper-parameters and output paths for the word2vec run.
learning_rate = 0.1
vocabulary_size = 3000
batch_size = 128
embedding_size = 128
num_skips = 4
skip_window = 5
num_sampled = 64
num_iter = 5
plot_only = 100
log_dir = "word2vec.log"
plt_dir = "file\\word2vec.png"
vec_dir = "file\\word2vec.pickle"

start = time.time()

# Read every XSS payload, segment it into tokens, and collect the corpus.
words = []  # flat list of all tokens across every payload (for frequency counts)
datas = []  # one token list per payload, in file order
with open("data\\xssed.csv", "r", encoding="utf-8") as f:
    for row in csv.DictReader(f, fieldnames=["payload"]):
        tokens = GeneSeg(row["payload"])
        datas.append(tokens)
        words.extend(tokens)
# Build the dataset
def build_dataset(datas, words, vocab_size=None):
    """Re-map every token to itself or "UNK" based on corpus frequency.

    The (vocab_size - 1) most frequent tokens in `words` are kept; every
    other token is replaced by the "UNK" placeholder.

    datas: list of token lists (one per payload).
    words: flat list of all tokens, used only for frequency counting.
    vocab_size: vocabulary size including "UNK"; defaults to the
        module-level `vocabulary_size` so existing callers are unchanged.
    Returns: list of token lists mirroring `datas`, with rare tokens
        replaced by "UNK".
    """
    if vocab_size is None:
        vocab_size = vocabulary_size
    count = [["UNK", -1]]
    # most_common lists the (n-1) most frequent tokens, most frequent first.
    count.extend(Counter(words).most_common(vocab_size - 1))
    # Use a set: membership tests are O(1) instead of an O(V) list scan
    # per token (the original list version was O(tokens * V)).
    vocabulary = {c[0] for c in count}
    data_set = []
    for data in datas:
        d_set = []
        for word in data:
            if word in vocabulary:
                d_set.append(word)
            else:
                d_set.append("UNK")
                count[0][1] += 1  # tally out-of-vocabulary occurrences
        data_set.append(d_set)
    return data_set
# Replace rare tokens with "UNK" before training.
data_set = build_dataset(datas, words)
# Train the Word2Vec model on the tokenized payloads.
# NOTE(review): `size` and `iter` are the gensim < 4.0 keyword names; gensim
# >= 4.0 renamed them to `vector_size` and `epochs` — confirm the pinned version.
model = Word2Vec(data_set, size=embedding_size, window=skip_window, negative=num_sampled, iter=num_iter)
# Keep only the learned word vectors (word -> vector lookup).
embeddings = model.wv
def plot_with_labels(low_dim_embs, labels, filename=plt_dir):
    """Scatter-plot 2-D embeddings, annotate each point, and save the figure.

    low_dim_embs: array-like of shape (len(labels), 2); row i is the (x, y)
        position for labels[i].
    labels: sequence of words to annotate.
    filename: output image path (defaults to the module-level plt_dir).
    """
    plt.figure(figsize=(10, 10))
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2),
                     textcoords="offset points",
                     ha="right",
                     va="bottom")
    # Stamp the hyper-parameters on the figure for reproducibility.
    f_text = "vocabulary_size=%d;batch_size=%d;embedding_size=%d;skip_window=%d;num_iter=%d" % (
        vocabulary_size, batch_size, embedding_size, skip_window, num_iter
    )
    plt.figtext(0.03, 0.03, f_text, color="green", fontsize=10)
    # BUG FIX: save BEFORE show. plt.show() blocks until the window is
    # closed, and closing clears the current figure, so the original
    # savefig-after-show wrote a blank image.
    plt.savefig(filename)
    plt.show()
# Project word vectors to 2-D with t-SNE for visualization.
# NOTE(review): recent scikit-learn deprecated `n_iter` in favor of
# `max_iter` — confirm the pinned sklearn version.
tsne = TSNE(perplexity=30, n_components=2, init="pca", n_iter=5000)
# Visualize only the first `plot_only` vocabulary entries.
# NOTE(review): `index2word` is the gensim < 4.0 attribute name
# (renamed `index_to_key` in 4.x) — confirm the pinned gensim version.
plot_words = embeddings.index2word[:plot_only]
plot_embeddings = []
for word in plot_words:
    plot_embeddings.append(embeddings[word])
low_dim_embs = tsne.fit_transform(plot_embeddings)
plot_with_labels(low_dim_embs, plot_words)
def save(embeddings, path=None):
    """Pickle the embeddings plus word<->index lookup tables.

    embeddings: object exposing `index2word` (an ordered word list) and
        supporting word-keyed vector lookup (e.g. gensim KeyedVectors).
    path: output pickle path; defaults to the module-level `vec_dir`, so
        existing callers are unchanged.

    The pickle holds a dict with keys "dictionary" (word -> index),
    "reverse_dictionary" (index -> word) and "embeddings" (the object itself).
    """
    if path is None:
        path = vec_dir
    # enumerate gives (index, word) pairs directly — no manual range(len()).
    dictionary = {word: i for i, word in enumerate(embeddings.index2word)}
    reverse_dictionary = {i: word for word, i in dictionary.items()}
    word2vec = {"dictionary": dictionary, "embeddings": embeddings, "reverse_dictionary": reverse_dictionary}
    with open(path, "wb") as f:
        pickle.dump(word2vec, f)
# Persist the trained vectors, then report total run time.
save(embeddings)
elapsed = time.time() - start
print("Over job in ", elapsed)
print("Saved words vec to", vec_dir)