@@ -13,17 +13,16 @@

 import h5py
 import numpy as np
-from scipy.special import polygamma as pg

 from cusim import aux, IoUtils
-from cusim.culda.culda_bind import CuLDABind
-from cusim.config_pb2 import CuLDAConfigProto
+from cusim.cuw2v.cuw2v_bind import CuW2VBind
+from cusim.config_pb2 import CuW2VConfigProto

 EPS = 1e-10

-class CuLDA:
+class CuW2V:
   def __init__(self, opt=None):
-    self.opt = aux.get_opt_as_proto(opt or {}, CuLDAConfigProto)
+    self.opt = aux.get_opt_as_proto(opt or {}, CuW2VConfigProto)
     self.logger = aux.get_logger("culda", level=self.opt.py_log_level)

     tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
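For orientation, here is a minimal usage sketch of the renamed class. Every option key below corresponds to a `CuW2VConfigProto` field referenced somewhere in this diff, but the values, the path, and the package-level import are illustrative assumptions, not part of the commit:

```python
from cusim import CuW2V  # assumed package-level re-export

opt = {
  "processed_data_dir": "./processed",  # hypothetical path
  "num_dims": 100,      # embedding dimensionality
  "neg": 10,            # > 0: negative sampling; 0: hierarchical softmax
  "count_power": 0.75,  # exponent applied to word counts in init_model
  "epochs": 5,
  "batch_size": 100000,
}
model = CuW2V(opt)   # writes the merged config to a temp file for the binding
model.train_model()
```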
@@ -32,13 +31,13 @@ def __init__(self, opt=None):
     tmp.close()

     self.logger.info("opt: %s", opt_content)
-    self.obj = CuLDABind()
+    self.obj = CuW2VBind()
     assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}"
     os.remove(tmp.name)

-    self.words, self.num_words, self.num_docs = None, None, None
-    self.alpha, self.beta, self.grad_alpha, self.new_beta = \
+    self.words, self.word_count, self.num_words, self.num_docs = \
       None, None, None, None
+    self.emb_in, self.emb_out = None, None

   def preprocess_data(self):
     if self.opt.skip_preprocess:
@@ -52,9 +51,15 @@ def preprocess_data(self):
   def init_model(self):
     # load voca
     data_dir = self.opt.processed_data_dir
-    self.logger.info("load key from %s", pjoin(data_dir, "keys.txt"))
-    with open(pjoin(data_dir, "keys.txt"), "rb") as fin:
+    keys_path = pjoin(data_dir, "keys.txt")
+    count_path = pjoin(data_dir, "count.txt")
+    self.logger.info("load key, count from %s, %s", keys_path, count_path)
+    with open(keys_path, "rb") as fin:
       self.words = [line.strip() for line in fin]
+    with open(count_path, "rb") as fin:
+      self.word_count = np.array([float(line.strip()) for line in fin],
+                                 dtype=np.float32)
+    self.word_count = np.power(self.word_count, self.opt.count_power)
     self.num_words = len(self.words)

     # count number of docs
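The `count_power` exponent applied above flattens the word-frequency distribution before it is used for sampling; word2vec conventionally uses 0.75 so that rare words are drawn more often than their raw counts would allow. A standalone sketch of the effect, with made-up counts:

```python
import numpy as np

# Made-up raw word counts, most frequent first.
counts = np.array([1000.0, 100.0, 10.0, 1.0], dtype=np.float32)

for power in (1.0, 0.75, 0.0):
  smoothed = np.power(counts, power)
  print(power, (smoothed / smoothed.sum()).round(3))
# 1.0 keeps the raw distribution, 0.75 boosts rare words,
# and 0.0 collapses everything to uniform sampling.
```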
@@ -67,40 +72,33 @@ def init_model(self):

     # random initialize alpha and beta
     np.random.seed(self.opt.seed)
-    self.alpha = np.random.uniform( \
-      size=(self.opt.num_topics,)).astype(np.float32)
-    self.beta = np.random.uniform( \
-      size=(self.num_words, self.opt.num_topics)).astype(np.float32)
-    self.beta /= np.sum(self.beta, axis=0)[None, :]
-    self.logger.info("alpha %s, beta %s initialized",
-                     self.alpha.shape, self.beta.shape)
-
-    # zero initialize grad alpha and new beta
-    block_cnt = self.obj.get_block_cnt()
-    self.grad_alpha = np.zeros(shape=(block_cnt, self.opt.num_topics),
-                               dtype=np.float32)
-    self.new_beta = np.zeros(shape=self.beta.shape, dtype=np.float32)
-    self.logger.info("grad alpha %s, new beta %s initialized",
-                     self.grad_alpha.shape, self.new_beta.shape)
+    self.emb_in = np.random.normal( \
+      size=(self.num_words, self.opt.num_dims)).astype(np.float32)
+    out_words = self.num_words if self.opt.neg else self.num_words - 1
+    self.emb_out = np.random.uniform( \
+      size=(out_words, self.opt.num_dims)).astype(np.float32)
+    self.logger.info("emb_in %s, emb_out %s initialized",
+                     self.emb_in.shape, self.emb_out.shape)

     # push it to gpu
-    self.obj.load_model(self.alpha, self.beta, self.grad_alpha, self.new_beta)
+    self.obj.load_model(self.emb_in, self.emb_out)

   def train_model(self):
     self.preprocess_data()
     self.init_model()
+    if not self.opt.neg:
+      self.obj.build_huffman_tree(self.word_count)
     h5f = h5py.File(pjoin(self.opt.processed_data_dir, "token.h5"), "r")
     for epoch in range(1, self.opt.epochs + 1):
       self.logger.info("Epoch %d / %d", epoch, self.opt.epochs)
-      self._train_e_step(h5f)
-      self._train_m_step()
+      self._train_epoch(h5f)
+      self.pull()
     h5f.close()

-  def _train_e_step(self, h5f):
+  def _train_epoch(self, h5f):
     offset, size = 0, h5f["cols"].shape[0]
-    pbar = aux.Progbar(size, stateful_metrics=["train_loss", "vali_loss"])
-    train_loss_nume, train_loss_deno = 0, 0
-    vali_loss_nume, vali_loss_deno = 0, 0
+    pbar = aux.Progbar(size, stateful_metrics=["loss"])
+    loss_nume, loss_deno = 0, 0
     while True:
       target = h5f["indptr"][offset] + self.opt.batch_size
       if target < size:
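This loop and the hunk below stream batches out of a CSR-style layout: `indptr` holds cumulative document boundaries into the flat `cols` token stream, a batch is cut at the document boundary nearest `batch_size` tokens, and the batch's `indptr` slice is rebased to zero before it is handed to the kernel. The boundary search itself is elided from this diff, so the sketch below reconstructs the pattern with a hypothetical `searchsorted`-based search and toy arrays:

```python
import numpy as np

indptr = np.array([0, 3, 5, 9, 10])  # 4 documents over 10 tokens
cols = np.arange(10)                 # flat token ids
batch_size = 4

offset, size = 0, cols.shape[0]
while True:
  target = indptr[offset] + batch_size
  # First boundary past target, or the last boundary (assumed search).
  next_offset = int(np.searchsorted(indptr, target)) \
      if target < size else len(indptr) - 1
  batch_indptr = indptr[offset:next_offset + 1].copy()
  beg, end = batch_indptr[0], batch_indptr[-1]
  batch_indptr -= beg                # rebase to zero, as in the hunk below
  batch_cols = cols[beg:end]
  print(batch_indptr, batch_cols)
  offset = next_offset
  if end == size:
    break
```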
@@ -111,57 +109,21 @@ def _train_e_step(self, h5f):
       beg, end = indptr[0], indptr[-1]
       indptr -= beg
       cols = h5f["cols"][beg:end]
-      vali = (h5f["vali"][beg:end] < self.opt.vali_p).astype(np.bool)
       offset = next_offset

       # call cuda kernel
-      train_loss, vali_loss = \
-        self.obj.feed_data(cols, indptr, vali, self.opt.num_iters_in_e_step)
+      if self.opt.neg:
+        self.obj.build_random_table( \
+          self.word_count, self.opt.random_size, self.opt.num_threads)
+      _loss_nume, _loss_deno = \
+        self.obj.feed_data(cols, indptr)

       # accumulate loss
-      train_loss_nume -= train_loss
-      vali_loss_nume -= vali_loss
-      vali_cnt = np.count_nonzero(vali)
-      train_cnt = len(vali) - vali_cnt
-      train_loss_deno += train_cnt
-      vali_loss_deno += vali_cnt
-      train_loss = train_loss_nume / (train_loss_deno + EPS)
-      vali_loss = vali_loss_nume / (vali_loss_deno + EPS)
+      loss_nume += _loss_nume
+      loss_deno += _loss_deno
+      loss = loss_nume / (loss_deno + EPS)

       # update progress bar
-      pbar.update(end, values=[("train_loss", train_loss),
-                               ("vali_loss", vali_loss)])
+      pbar.update(end, values=[("loss", loss)])
       if end == size:
         break
-
-  def _train_m_step(self):
-    self.obj.pull()
-
-    # update beta
-    self.new_beta[:, :] = np.maximum(self.new_beta, EPS)
-    self.beta[:, :] = self.new_beta / np.sum(self.new_beta, axis=0)[None, :]
-    self.new_beta[:, :] = 0
-
-    # update alpha
-    alpha_sum = np.sum(self.alpha)
-    gvec = np.sum(self.grad_alpha, axis=0)
-    gvec += self.num_docs * (pg(0, alpha_sum) - pg(0, self.alpha))
-    hvec = self.num_docs * pg(1, self.alpha)
-    z_0 = pg(1, alpha_sum)
-    c_nume = np.sum(gvec / hvec)
-    c_deno = 1 / z_0 + np.sum(1 / hvec)
-    c_0 = c_nume / c_deno
-    delta = (gvec - c_0) / hvec
-    self.alpha -= delta
-    self.alpha[:] = np.maximum(self.alpha, EPS)
-    self.grad_alpha[:,:] = 0
-
-    self.obj.push()
-
-  def save_model(self, model_path):
-    self.logger.info("save model path: %s", model_path)
-    h5f = h5py.File(model_path, "w")
-    h5f.create_dataset("alpha", data=self.alpha)
-    h5f.create_dataset("beta", data=self.beta)
-    h5f.create_dataset("keys", data=np.array(self.words))
-    h5f.close()
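A note on the two output layouts set up in `init_model`: with `neg > 0` the kernel samples negatives from the `count_power`-smoothed distribution via `build_random_table`, so `emb_out` has one row per word; with `neg == 0` it uses hierarchical softmax, so `emb_out` has `num_words - 1` rows, one per inner node of the Huffman tree that `train_model` builds over the word counts. The tree construction happens on the native side; the following is only a reference sketch in Python of the same classic algorithm, under the assumption that children are bit-coded 0/1 along the path from the root:

```python
import heapq

def build_huffman_codes(word_count):
  """Reference sketch: per-word (bits, inner-node path) from word counts."""
  heap = [(c, i, i) for i, c in enumerate(word_count)]  # leaves = word ids
  heapq.heapify(heap)
  parent, side = {}, {}
  next_inner = len(word_count)  # inner nodes numbered after the leaves
  while len(heap) > 1:
    (c0, _, n0), (c1, _, n1) = heapq.heappop(heap), heapq.heappop(heap)
    parent[n0], side[n0] = next_inner, 0
    parent[n1], side[n1] = next_inner, 1
    heapq.heappush(heap, (c0 + c1, next_inner, next_inner))
    next_inner += 1
  root = heap[0][2]
  codes = {}
  for word in range(len(word_count)):
    bits, nodes, node = [], [], word
    while node != root:
      bits.append(side[node])
      nodes.append(parent[node] - len(word_count))  # 0-based inner index
      node = parent[node]
    codes[word] = (bits[::-1], nodes[::-1])
  return codes

# n leaves always yield n - 1 inner nodes, matching the
# emb_out row count chosen in init_model for the neg == 0 case.
print(build_huffman_codes([5.0, 2.0, 1.0, 1.0]))
```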