embedding.py
import os

import numpy as np

import config
from utils import load_object


def load_word_embedding(embed_file):
    """Load a whitespace-separated text embedding file into a {token: vector} dict.

    The original referenced an undefined global ``embed_file``; it is taken as
    a parameter here so the function is self-contained.
    """
    print("Start loading word embeddings...")
    word2vec = {}
    with open(embed_file, "r", encoding="utf-8") as f:
        for line in f:
            vec = line.split()
            word2vec[vec[0]] = np.array(vec[1:], dtype=float)
    print("Loaded %d word embeddings." % len(word2vec))
    return word2vec
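

# The loader above assumes a GloVe-style plain-text layout (an assumption
# inferred from the parsing, not stated by the source): one token per line,
# followed by its vector components, all whitespace-separated, e.g.
#   the 0.418 0.250 -0.412
#   of 0.709 0.571 -0.472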


def load_word_embedding_model(model_name="bert"):
    """Load a pickled {token: vector} dict saved as "<model_name>_embedding.obj"."""
    word2vec_file = "%s_embedding.obj" % model_name
    return load_object(os.path.join(config.word_embed_path, word2vec_file))
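

# Hedged sketch of how such an "<model>_embedding.obj" file might be produced.
# utils.load_object is assumed to be a plain pickle loader; save_embedding_dict
# below is a hypothetical helper for illustration, not part of this project.
def save_embedding_dict(word2vec, model_name, directory=None):
    import pickle
    directory = directory if directory is not None else config.word_embed_path
    path = os.path.join(directory, "%s_embedding.obj" % model_name)
    with open(path, "wb") as f:
        pickle.dump(word2vec, f)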


class Vocab:
    """Token/label vocabulary backed by pretrained word embeddings."""

    PAD, START, END, UNK = 0, 1, 2, 3

    def __init__(self):
        self._id2label = ["Normal", "Anomalous"]
        # Map label -> id (the original built id -> label, which broke label2id).
        self._label2id = {label: idx for idx, label in enumerate(self._id2label)}
        self._id2word = []
        self._word2id = {}
        self._embed_dim = -1
        self.embeddings = None
    def feed(self, word2vec, force_replace=False):
        """Build the vocabulary and embedding matrix from a {token: vector} dict."""
        if force_replace:
            self._id2word = []
            self._word2id = {}
            self._embed_dim = -1
        # Reserve ids 0-3 for the special tokens (PAD, START, END, UNK).
        for special_word in ['<pad>', '<bos>', '<eos>', '<oov>']:
            if special_word not in self._word2id:
                self._word2id[special_word] = len(self._word2id)
                self._id2word.append(special_word)
        for word, embed in word2vec.items():
            if self._embed_dim == -1:
                self._embed_dim = embed.shape[0]
                config.embedding_dim = self._embed_dim
            if word not in self._word2id:
                self._word2id[word] = len(self._word2id)
                self._id2word.append(word)
        word_num = len(self._id2word)
        print('Number of words: %d, dimension of embeddings: %d.' % (word_num, self._embed_dim))
        self.embeddings = np.zeros([word_num, self._embed_dim])
        for word, embed in word2vec.items():
            index = self._word2id.get(word)
            vector = np.array(embed, dtype=float)
            self.embeddings[index] = vector
            # Accumulate every pretrained vector into the UNK slot ...
            self.embeddings[self.UNK] += vector
        # ... so UNK becomes an average over the vocabulary (the divisor includes
        # the special tokens, as in the original).
        self.embeddings[self.UNK] /= max(word_num, 1)
        if config.use_normalization:
            # Rescale so the per-dimension standard deviations average to 1.
            avg_dim_std = max(self.embeddings.std(axis=0).mean(), 1e-8)
            self.embeddings /= avg_dim_std
        return self
    def __contains__(self, x):
        return x in self._word2id

    def word2id(self, xs):
        """Map a token (or list of tokens) to ids; unknown tokens map to UNK."""
        if isinstance(xs, list):
            return [self._word2id.get(x, self.UNK) for x in xs]
        return self._word2id.get(xs, self.UNK)

    def id2word(self, xs):
        if isinstance(xs, list):
            return [self._id2word[x] for x in xs]
        return self._id2word[xs]

    def label2id(self, xs):
        if isinstance(xs, list):
            return [self._label2id.get(x) for x in xs]
        return self._label2id.get(xs)

    def id2label(self, xs):
        if isinstance(xs, list):
            return [self._id2label[x] for x in xs]
        return self._id2label[xs]

    @property
    def vocab_size(self):
        return len(self._id2word)

    @property
    def label_size(self):
        return len(self._id2label)

    @property
    def word_dim(self):
        return self._embed_dim
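

# Hedged usage sketch: exercises Vocab on a tiny in-memory embedding table.
# config.use_normalization is read inside feed(); it is set here only so the
# demo runs standalone (an assumption about the config module, not its API).
if __name__ == "__main__":
    config.use_normalization = False
    toy_word2vec = {
        "error": np.array([0.1, 0.2, 0.3]),
        "login": np.array([0.4, 0.5, 0.6]),
    }
    vocab = Vocab().feed(toy_word2vec)
    print(vocab.word2id(["error", "unseen"]))  # -> [4, 3]; 3 is UNK ('<oov>')
    print(vocab.id2word(4))                    # -> 'error'
    print(vocab.label2id("Anomalous"))         # -> 1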