Commit b805ed0

first draft

1 parent 882ce30 commit b805ed0

9 files changed: +106 -91 lines changed

cpp/include/utils/ioutils.hpp

Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@ class IoUtils {
   int LoadStreamFile(std::string filepath);
   std::pair<int, int> ReadStreamForVocab(int num_lines, int num_threads);
   std::pair<int, int> TokenizeStream(int num_lines, int num_threads);
-  void GetWordVocab(int min_count, std::string keys_path);
+  void GetWordVocab(int min_count, std::string keys_path, std::string count_path);
   void GetToken(int* rows, int* cols, int* indptr);
  private:
   void ParseLine(std::string line, std::vector<std::string>& line_vec);
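
At the Python level the extra count_path argument surfaces through the ioutils binding (see cusim/ioutils below). A minimal sketch of the extended call, with illustrative file paths:

  from cusim import IoUtils

  ioutil = IoUtils()
  # writes the surviving vocabulary to keys.txt and the matching raw
  # counts to count.txt; "data.txt" etc. are illustrative paths
  ioutil.load_stream_vocab("data.txt", min_count=5,
                           keys_path="keys.txt", count_path="count.txt")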

cpp/src/cuw2v/cuw2v.cu

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ bool CuW2V::Init(std::string opt_path) {
   lr_ = opt_["lr"].number_value();

   // if zero, we will use hierarchical softmax
-  neg_ = opt_["negative_sampling"].int_value();
+  neg_ = opt_["neg"].int_value();

   // random seed
   table_seed_ = opt_["table_seed"].int_value();
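
The config key is renamed from "negative_sampling" to "neg", matching the new CuW2VConfigProto field below; zero switches training to hierarchical softmax. A minimal sketch of the two modes ("data.txt" is an illustrative path):

  from cusim import CuW2V

  # negative sampling, 10 negative samples per positive pair (the default)
  w2v_ns = CuW2V({"data_path": "data.txt", "neg": 10})

  # neg == 0 selects hierarchical softmax over a Huffman tree instead
  w2v_hs = CuW2V({"data_path": "data.txt", "neg": 0})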

cpp/src/utils/ioutils.cc

Lines changed: 7 additions & 4 deletions

@@ -153,7 +153,7 @@ std::pair<int, int> IoUtils::ReadStreamForVocab(int num_lines, int num_threads)
   return {read_lines, word_count_.size()};
 }

-void IoUtils::GetWordVocab(int min_count, std::string keys_path) {
+void IoUtils::GetWordVocab(int min_count, std::string keys_path, std::string count_path) {
   INFO("number of raw words: {}", word_count_.size());
   word_idmap_.clear(); word_list_.clear();
   for (auto& it: word_count_) {
@@ -164,13 +164,16 @@ void IoUtils::GetWordVocab(int min_count, std::string keys_path) {
   }
   INFO("number of words after filtering: {}", word_list_.size());

-  // write keys to csv file
-  std::ofstream fout(keys_path.c_str());
+  // write keys and count to csv file
+  std::ofstream fout1(keys_path.c_str());
+  std::ofstream fout2(count_path.c_str());
   INFO("dump keys to {}", keys_path);
   int n = word_list_.size();
   for (int i = 0; i < n; ++i) {
     std::string line = word_list_[i] + "\n";
-    fout.write(line.c_str(), line.size());
+    fout1.write(line.c_str(), line.size());
+    line = std::to_string(word_count_[word_list_[i]]) + "\n";
+    fout2.write(line.c_str(), line.size());
   }
   fout.close();
 }
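
keys.txt and count.txt are written in lockstep, so line i of count.txt holds the raw frequency of the word on line i of keys.txt. A minimal sketch of reading the pair back into a dict (illustrative paths):

  with open("keys.txt") as fkeys, open("count.txt") as fcnts:
    # same row order in both files, one entry per line
    vocab_count = {word.strip(): float(cnt) for word, cnt in zip(fkeys, fcnts)}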

cusim/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -5,3 +5,4 @@
 # LICENSE file in the root directory of this source tree.
 from cusim.ioutils import IoUtils
 from cusim.culda import CuLDA
+from cusim.cuw2v import CuW2V

cusim/cuw2v/pycuw2v.py

Lines changed: 39 additions & 77 deletions

@@ -13,17 +13,16 @@

 import h5py
 import numpy as np
-from scipy.special import polygamma as pg

 from cusim import aux, IoUtils
-from cusim.culda.culda_bind import CuLDABind
-from cusim.config_pb2 import CuLDAConfigProto
+from cusim.cuw2v.cuw2v_bind import CuW2VBind
+from cusim.config_pb2 import CuW2VConfigProto

 EPS = 1e-10

-class CuLDA:
+class CuW2V:
   def __init__(self, opt=None):
-    self.opt = aux.get_opt_as_proto(opt or {}, CuLDAConfigProto)
+    self.opt = aux.get_opt_as_proto(opt or {}, CuW2VConfigProto)
     self.logger = aux.get_logger("culda", level=self.opt.py_log_level)

     tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
@@ -32,13 +31,13 @@ def __init__(self, opt=None):
     tmp.close()

     self.logger.info("opt: %s", opt_content)
-    self.obj = CuLDABind()
+    self.obj = CuW2VBind()
     assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}"
     os.remove(tmp.name)

-    self.words, self.num_words, self.num_docs = None, None, None
-    self.alpha, self.beta, self.grad_alpha, self.new_beta = \
+    self.words, self.word_count, self.num_words, self.num_docs = \
       None, None, None, None
+    self.emb_in, self.emb_out = None, None

   def preprocess_data(self):
     if self.opt.skip_preprocess:
@@ -52,9 +51,15 @@ def preprocess_data(self):
   def init_model(self):
     # load voca
     data_dir = self.opt.processed_data_dir
-    self.logger.info("load key from %s", pjoin(data_dir, "keys.txt"))
-    with open(pjoin(data_dir, "keys.txt"), "rb") as fin:
+    keys_path = pjoin(data_dir, "keys.txt")
+    count_path = pjoin(data_dir, "count.txt")
+    self.logger.info("load key, count from %s, %s", keys_path, count_path)
+    with open(keys_path, "rb") as fin:
       self.words = [line.strip() for line in fin]
+    with open(count_path, "rb") as fin:
+      self.word_count = np.array([float(line.strip()) for line in fin],
+                                 dtype=np.float32)
+    self.word_count = np.power(self.word_count, self.opt.count_power)
     self.num_words = len(self.words)

     # count number of docs
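
init_model now raises the raw counts to count_power = 0.75 before they reach the sampling table, flattening the unigram distribution as recommended in the word2vec paper. An illustrative sketch of the effect (made-up frequencies):

  import numpy as np

  counts = np.array([1000., 100., 10.], dtype=np.float32)  # made-up counts
  weights = np.power(counts, 0.75)      # same transform init_model applies
  print(counts / counts.sum())    # [0.901 0.090 0.009] -- raw unigram
  print(weights / weights.sum())  # [0.827 0.147 0.026] -- flattened for sampling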
@@ -67,40 +72,33 @@ def init_model(self):

     # random initialize alpha and beta
     np.random.seed(self.opt.seed)
-    self.alpha = np.random.uniform( \
-      size=(self.opt.num_topics,)).astype(np.float32)
-    self.beta = np.random.uniform( \
-      size=(self.num_words, self.opt.num_topics)).astype(np.float32)
-    self.beta /= np.sum(self.beta, axis=0)[None, :]
-    self.logger.info("alpha %s, beta %s initialized",
-                     self.alpha.shape, self.beta.shape)
-
-    # zero initialize grad alpha and new beta
-    block_cnt = self.obj.get_block_cnt()
-    self.grad_alpha = np.zeros(shape=(block_cnt, self.opt.num_topics),
-                               dtype=np.float32)
-    self.new_beta = np.zeros(shape=self.beta.shape, dtype=np.float32)
-    self.logger.info("grad alpha %s, new beta %s initialized",
-                     self.grad_alpha.shape, self.new_beta.shape)
+    self.emb_in = np.random.normal( \
+      size=(self.num_words, self.opt.num_dims)).astype(np.float32)
+    out_words = self.num_words if self.opt.neg else self.num_words - 1
+    self.emb_out = np.random.uniform( \
+      size=(out_words, self.opt.num_dims)).astype(np.float32)
+    self.logger.info("emb_in %s, emb_out %s initialized",
+                     self.emb_in.shape, self.emb_out.shape)

     # push it to gpu
-    self.obj.load_model(self.alpha, self.beta, self.grad_alpha, self.new_beta)
+    self.obj.load_model(self.emb_in, self.emb_out)

   def train_model(self):
     self.preprocess_data()
     self.init_model()
+    if not self.opt.neg:
+      self.obj.build_huffman_tree(self.word_count)
     h5f = h5py.File(pjoin(self.opt.processed_data_dir, "token.h5"), "r")
     for epoch in range(1, self.opt.epochs + 1):
       self.logger.info("Epoch %d / %d", epoch, self.opt.epochs)
-      self._train_e_step(h5f)
-      self._train_m_step()
+      self._train_epoch(h5f)
+      self.pull()
     h5f.close()

-  def _train_e_step(self, h5f):
+  def _train_epoch(self, h5f):
     offset, size = 0, h5f["cols"].shape[0]
-    pbar = aux.Progbar(size, stateful_metrics=["train_loss", "vali_loss"])
-    train_loss_nume, train_loss_deno = 0, 0
-    vali_loss_nume, vali_loss_deno = 0, 0
+    pbar = aux.Progbar(size, stateful_metrics=["loss"])
+    loss_nume, loss_deno = 0, 0
     while True:
       target = h5f["indptr"][offset] + self.opt.batch_size
       if target < size:
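
When neg is zero, train_model hands the (already powered) word counts to build_huffman_tree; a binary tree over num_words leaves has num_words - 1 internal nodes, which is why emb_out is allocated with one fewer row in that mode. A rough pure-Python sketch of the classic construction — the binding's actual node layout is not shown in this commit, so this is illustrative only:

  import heapq
  import itertools

  def build_huffman_tree(counts):
    # repeatedly merge the two rarest nodes until one root remains;
    # ids >= len(counts) are internal, giving len(counts) - 1 of them
    next_id = itertools.count(len(counts))
    heap = [(cnt, i) for i, cnt in enumerate(counts)]
    heapq.heapify(heap)
    children = {}
    while len(heap) > 1:
      (c1, n1), (c2, n2) = heapq.heappop(heap), heapq.heappop(heap)
      parent = next(next_id)
      children[parent] = (n1, n2)
      heapq.heappush(heap, (c1 + c2, parent))
    return children  # maps internal node id -> (left child, right child)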
@@ -111,57 +109,21 @@ def _train_e_step(self, h5f):
       beg, end = indptr[0], indptr[-1]
       indptr -= beg
       cols = h5f["cols"][beg:end]
-      vali = (h5f["vali"][beg:end] < self.opt.vali_p).astype(np.bool)
       offset = next_offset

       # call cuda kernel
-      train_loss, vali_loss = \
-        self.obj.feed_data(cols, indptr, vali, self.opt.num_iters_in_e_step)
+      if self.opt.neg:
+        self.obj.build_random_table( \
+          self.word_count, self.opt.random_size, self.opt.num_threads)
+      _loss_nume, _loss_deno = \
+        self.obj.feed_data(cols, indptr)

       # accumulate loss
-      train_loss_nume -= train_loss
-      vali_loss_nume -= vali_loss
-      vali_cnt = np.count_nonzero(vali)
-      train_cnt = len(vali) - vali_cnt
-      train_loss_deno += train_cnt
-      vali_loss_deno += vali_cnt
-      train_loss = train_loss_nume / (train_loss_deno + EPS)
-      vali_loss = vali_loss_nume / (vali_loss_deno + EPS)
+      loss_nume += _loss_nume
+      loss_deno += _loss_deno
+      loss = loss_nume / (loss_deno + EPS)

       # update progress bar
-      pbar.update(end, values=[("train_loss", train_loss),
-                               ("vali_loss", vali_loss)])
+      pbar.update(end, values=[("loss", loss)])
       if end == size:
         break
-
-  def _train_m_step(self):
-    self.obj.pull()
-
-    # update beta
-    self.new_beta[:, :] = np.maximum(self.new_beta, EPS)
-    self.beta[:, :] = self.new_beta / np.sum(self.new_beta, axis=0)[None, :]
-    self.new_beta[:, :] = 0
-
-    # update alpha
-    alpha_sum = np.sum(self.alpha)
-    gvec = np.sum(self.grad_alpha, axis=0)
-    gvec += self.num_docs * (pg(0, alpha_sum) - pg(0, self.alpha))
-    hvec = self.num_docs * pg(1, self.alpha)
-    z_0 = pg(1, alpha_sum)
-    c_nume = np.sum(gvec / hvec)
-    c_deno = 1 / z_0 + np.sum(1 / hvec)
-    c_0 = c_nume / c_deno
-    delta = (gvec - c_0) / hvec
-    self.alpha -= delta
-    self.alpha[:] = np.maximum(self.alpha, EPS)
-    self.grad_alpha[:,:] = 0
-
-    self.obj.push()
-
-  def save_model(self, model_path):
-    self.logger.info("save model path: %s", model_path)
-    h5f = h5py.File(model_path, "w")
-    h5f.create_dataset("alpha", data=self.alpha)
-    h5f.create_dataset("beta", data=self.beta)
-    h5f.create_dataset("keys", data=np.array(self.words))
-    h5f.close()
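
With the LDA-specific M-step and save_model gone, the public surface of this draft is preprocess_data, init_model, and train_model. A minimal end-to-end sketch, assuming an illustrative corpus path and output directory:

  from cusim import CuW2V

  opt = {
    "data_path": "data.txt",            # corpus, one line per document (illustrative)
    "processed_data_dir": "processed",  # keys.txt, count.txt, token.h5 land here
    "neg": 10,                          # 0 would train with hierarchical softmax
    "epochs": 10,
  }
  w2v = CuW2V(opt)
  w2v.train_model()  # preprocess, init embeddings, then one pass per epoch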

cusim/ioutils/bindings.cc

Lines changed: 3 additions & 3 deletions

@@ -35,8 +35,8 @@ class IoUtilsBind {
     return obj_.TokenizeStream(num_lines, num_threads);
   }

-  void GetWordVocab(int min_count, std::string keys_path) {
-    obj_.GetWordVocab(min_count, keys_path);
+  void GetWordVocab(int min_count, std::string keys_path, std::string count_path) {
+    obj_.GetWordVocab(min_count, keys_path, count_path);
   }

   void GetToken(py::object& rows, py::object& cols, py::object& indptr) {
@@ -62,7 +62,7 @@ PYBIND11_PLUGIN(ioutils_bind) {
   .def("tokenize_stream", &IoUtilsBind::TokenizeStream,
       py::arg("num_lines"), py::arg("num_threads"))
   .def("get_word_vocab", &IoUtilsBind::GetWordVocab,
-      py::arg("min_count"), py::arg("keys_path"))
+      py::arg("min_count"), py::arg("keys_path"), py::arg("count_path"))
   .def("get_token", &IoUtilsBind::GetToken,
       py::arg("indices"), py::arg("indptr"), py::arg("offset"))
   .def("__repr__",

cusim/ioutils/pyioutils.py

Lines changed: 7 additions & 5 deletions

@@ -33,7 +33,8 @@ def __init__(self, opt=None):
     assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}"
     os.remove(tmp.name)

-  def load_stream_vocab(self, filepath, min_count, keys_path):
+  def load_stream_vocab(self, filepath, min_count,
+                        keys_path, count_path):
     full_num_lines = self.obj.load_stream_file(filepath)
     pbar = aux.Progbar(full_num_lines, unit_name="line",
                        stateful_metrics=["word_count"])
@@ -46,17 +47,18 @@ def load_stream_vocab(self, filepath, min_count, keys_path):
       pbar.update(processed, values=[("word_count", word_count)])
       if processed == full_num_lines:
         break
-    self.obj.get_word_vocab(min_count, keys_path)
+    self.obj.get_word_vocab(min_count, keys_path, count_path)

   def convert_stream_to_h5(self, filepath, min_count, out_dir,
                            chunk_indices=10000, seed=777):
     np.random.seed(seed)
     os.makedirs(out_dir, exist_ok=True)
     keys_path = pjoin(out_dir, "keys.txt")
+    count_path = pjoin(out_dir, "count.txt")
     token_path = pjoin(out_dir, "token.h5")
-    self.logger.info("save key and token to %s, %s",
-                     keys_path, token_path)
-    self.load_stream_vocab(filepath, min_count, keys_path)
+    self.logger.info("save key, count, token to %s, %s, %s",
+                     keys_path, count_path, token_path)
+    self.load_stream_vocab(filepath, min_count, keys_path, count_path)
     full_num_lines = self.obj.load_stream_file(filepath)
     pbar = aux.Progbar(full_num_lines, unit_name="line")
     processed = 0
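
convert_stream_to_h5 now leaves three artifacts per corpus. A sketch of the call and the files it produces (illustrative paths):

  from cusim import IoUtils

  ioutil = IoUtils()
  ioutil.convert_stream_to_h5("data.txt", min_count=5, out_dir="processed")
  # processed/keys.txt  -- one surviving word per line
  # processed/count.txt -- matching raw counts, same row order
  # processed/token.h5  -- tokenized stream as cols/indptr arrays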

cusim/proto/config.proto

Lines changed: 32 additions & 0 deletions

@@ -31,3 +31,35 @@ message CuLDAConfigProto {
   optional double vali_p = 13 [default = 0.2];
   optional int32 seed = 14 [default = 777];
 }
+
+message CuW2VConfigProto {
+  required string data_path = 7;
+
+  optional int32 py_log_level = 1 [default = 2];
+  optional int32 c_log_level = 2 [default = 2];
+
+  optional int32 num_dims = 3 [default = 50];
+  optional int32 block_dim = 4 [default = 32];
+  optional int32 hyper_threads = 5 [default = 10];
+  optional string processed_data_dir = 6;
+  optional bool skip_preprocess = 8;
+  optional int32 word_min_count = 9 [default = 5];
+  optional int32 batch_size = 10 [default = 100000];
+  optional int32 epochs = 11 [default = 10];
+
+  // seed fields
+  optional int32 seed = 14 [default = 777];
+  optional int32 table_seed = 15 [default = 777];
+  optional int32 cuda_seed = 16 [default = 777];
+  optional int32 random_size = 13 [default = 1000000];
+
+  optional int32 neg = 17 [default = 10];
+  // as recommended in w2v paper
+  optional double count_power = 18 [default = 0.75];
+  optional bool skip_gram = 19 [default = true];
+  optional bool use_mean = 20 [default = true];
+  optional double lr = 21 [default = 0.001];
+  optional int32 window_size = 22 [default = 5];
+}
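
These are the fields the Python wrapper serializes and hands to CuW2V::Init above. A hedged sketch of overriding a few of them (values illustrative; unlisted fields keep their proto defaults):

  from cusim import CuW2V

  opt = {
    "data_path": "data.txt",  # the only required field
    "num_dims": 100,          # embedding width, default 50
    "window_size": 5,         # context window size
    "skip_gram": False,       # CBOW; use_mean presumably averages context vectors
  }
  w2v = CuW2V(opt)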

setup.py

Lines changed: 15 additions & 0 deletions

@@ -98,6 +98,21 @@ def __init__(self, name):
               "cpp/include/", np.get_include(), pybind11.get_include(),
               pybind11.get_include(True), CUDA['include'],
               "3rd/json11", "3rd/spdlog/include"]),
+    Extension("cusim.cuw2v.cuw2v_bind",
+              sources=util_srcs + [ \
+                "cpp/src/cuw2v/cuw2v.cu",
+                "cusim/cuw2v/bindings.cc",
+                "3rd/json11/json11.cpp"],
+              language="c++",
+              extra_compile_args=extra_compile_args,
+              extra_link_args=["-fopenmp"],
+              library_dirs=[CUDA['lib64']],
+              libraries=['cudart', 'cublas', 'curand'],
+              extra_objects=[],
+              include_dirs=[ \
+                "cpp/include/", np.get_include(), pybind11.get_include(),
+                pybind11.get_include(True), CUDA['include'],
+                "3rd/json11", "3rd/spdlog/include"]),
 ]