Commit b805ed0

first draft

1 parent 882ce30 commit b805ed0

9 files changed: +106 -91 lines changed

cpp/include/utils/ioutils.hpp

Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@ class IoUtils {
   int LoadStreamFile(std::string filepath);
   std::pair<int, int> ReadStreamForVocab(int num_lines, int num_threads);
   std::pair<int, int> TokenizeStream(int num_lines, int num_threads);
-  void GetWordVocab(int min_count, std::string keys_path);
+  void GetWordVocab(int min_count, std::string keys_path, std::string count_path);
   void GetToken(int* rows, int* cols, int* indptr);
  private:
   void ParseLine(std::string line, std::vector<std::string>& line_vec);
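
At the Python level the extra count_path argument surfaces through the ioutils binding (see cusim/ioutils below). A minimal sketch of the extended call, with illustrative file paths:

  from cusim import IoUtils

  ioutil = IoUtils()
  # writes the surviving vocabulary to keys.txt and the matching raw
  # counts to count.txt; "data.txt" etc. are illustrative paths
  ioutil.load_stream_vocab("data.txt", min_count=5,
                           keys_path="keys.txt", count_path="count.txt")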

cpp/src/cuw2v/cuw2v.cu

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ bool CuW2V::Init(std::string opt_path) {
   lr_ = opt_["lr"].number_value();

   // if zero, we will use hierarchical softmax
-  neg_ = opt_["negative_sampling"].int_value();
+  neg_ = opt_["neg"].int_value();

   // random seed
   table_seed_ = opt_["table_seed"].int_value();
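
The config key is renamed from "negative_sampling" to "neg", matching the new CuW2VConfigProto field below; zero switches training to hierarchical softmax. A minimal sketch of the two modes ("data.txt" is an illustrative path):

  from cusim import CuW2V

  # negative sampling, 10 negative samples per positive pair (the default)
  w2v_ns = CuW2V({"data_path": "data.txt", "neg": 10})

  # neg == 0 selects hierarchical softmax over a Huffman tree instead
  w2v_hs = CuW2V({"data_path": "data.txt", "neg": 0})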

cpp/src/utils/ioutils.cc

Lines changed: 7 additions & 4 deletions

@@ -153,7 +153,7 @@ std::pair<int, int> IoUtils::ReadStreamForVocab(int num_lines, int num_threads)
   return {read_lines, word_count_.size()};
 }

-void IoUtils::GetWordVocab(int min_count, std::string keys_path) {
+void IoUtils::GetWordVocab(int min_count, std::string keys_path, std::string count_path) {
   INFO("number of raw words: {}", word_count_.size());
   word_idmap_.clear(); word_list_.clear();
   for (auto& it: word_count_) {
@@ -164,13 +164,16 @@ void IoUtils::GetWordVocab(int min_count, std::string keys_path) {
   }
   INFO("number of words after filtering: {}", word_list_.size());

-  // write keys to csv file
-  std::ofstream fout(keys_path.c_str());
+  // write keys and count to csv file
+  std::ofstream fout1(keys_path.c_str());
+  std::ofstream fout2(count_path.c_str());
   INFO("dump keys to {}", keys_path);
   int n = word_list_.size();
   for (int i = 0; i < n; ++i) {
     std::string line = word_list_[i] + "\n";
-    fout.write(line.c_str(), line.size());
+    fout1.write(line.c_str(), line.size());
+    line = std::to_string(word_count_[word_list_[i]]) + "\n";
+    fout2.write(line.c_str(), line.size());
   }
   fout.close();
 }
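
keys.txt and count.txt are written in lockstep, so line i of count.txt holds the raw frequency of the word on line i of keys.txt. A minimal sketch of reading the pair back into a dict (illustrative paths):

  with open("keys.txt") as fkeys, open("count.txt") as fcnts:
    # same row order in both files, one entry per line
    vocab_count = {word.strip(): float(cnt) for word, cnt in zip(fkeys, fcnts)}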

cusim/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -5,3 +5,4 @@
 # LICENSE file in the root directory of this source tree.
 from cusim.ioutils import IoUtils
 from cusim.culda import CuLDA
+from cusim.cuw2v import CuW2V

cusim/cuw2v/pycuw2v.py

Lines changed: 39 additions & 77 deletions

@@ -13,17 +13,16 @@

 import h5py
 import numpy as np
-from scipy.special import polygamma as pg

 from cusim import aux, IoUtils
-from cusim.culda.culda_bind import CuLDABind
-from cusim.config_pb2 import CuLDAConfigProto
+from cusim.cuw2v.cuw2v_bind import CuW2VBind
+from cusim.config_pb2 import CuW2VConfigProto

 EPS = 1e-10

-class CuLDA:
+class CuW2V:
   def __init__(self, opt=None):
-    self.opt = aux.get_opt_as_proto(opt or {}, CuLDAConfigProto)
+    self.opt = aux.get_opt_as_proto(opt or {}, CuW2VConfigProto)
     self.logger = aux.get_logger("culda", level=self.opt.py_log_level)

     tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
@@ -32,13 +31,13 @@ def __init__(self, opt=None):
     tmp.close()

     self.logger.info("opt: %s", opt_content)
-    self.obj = CuLDABind()
+    self.obj = CuW2VBind()
     assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}"
     os.remove(tmp.name)

-    self.words, self.num_words, self.num_docs = None, None, None
-    self.alpha, self.beta, self.grad_alpha, self.new_beta = \
+    self.words, self.word_count, self.num_words, self.num_docs = \
       None, None, None, None
+    self.emb_in, self.emb_out = None, None

   def preprocess_data(self):
     if self.opt.skip_preprocess:
@@ -52,9 +51,15 @@ def preprocess_data(self):
   def init_model(self):
     # load voca
     data_dir = self.opt.processed_data_dir
-    self.logger.info("load key from %s", pjoin(data_dir, "keys.txt"))
-    with open(pjoin(data_dir, "keys.txt"), "rb") as fin:
+    keys_path = pjoin(data_dir, "keys.txt")
+    count_path = pjoin(data_dir, "count.txt")
+    self.logger.info("load key, count from %s, %s", keys_path, count_path)
+    with open(keys_path, "rb") as fin:
       self.words = [line.strip() for line in fin]
+    with open(count_path, "rb") as fin:
+      self.word_count = np.array([float(line.strip()) for line in fin],
+                                 dtype=np.float32)
+    self.word_count = np.power(self.word_count, self.opt.count_power)
     self.num_words = len(self.words)

     # count number of docs
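
init_model now raises the raw counts to count_power = 0.75 before they reach the sampling table, flattening the unigram distribution as recommended in the word2vec paper. An illustrative sketch of the effect (made-up frequencies):

  import numpy as np

  counts = np.array([1000., 100., 10.], dtype=np.float32)  # made-up counts
  weights = np.power(counts, 0.75)      # same transform init_model applies
  print(counts / counts.sum())    # [0.901 0.090 0.009] -- raw unigram
  print(weights / weights.sum())  # [0.827 0.147 0.026] -- flattened for sampling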
@@ -67,40 +72,33 @@ def init_model(self):

     # random initialize alpha and beta
     np.random.seed(self.opt.seed)
-    self.alpha = np.random.uniform( \
-      size=(self.opt.num_topics,)).astype(np.float32)
-    self.beta = np.random.uniform( \
-      size=(self.num_words, self.opt.num_topics)).astype(np.float32)
-    self.beta /= np.sum(self.beta, axis=0)[None, :]
-    self.logger.info("alpha %s, beta %s initialized",
-                     self.alpha.shape, self.beta.shape)
-
-    # zero initialize grad alpha and new beta
-    block_cnt = self.obj.get_block_cnt()
-    self.grad_alpha = np.zeros(shape=(block_cnt, self.opt.num_topics),
-                               dtype=np.float32)
-    self.new_beta = np.zeros(shape=self.beta.shape, dtype=np.float32)
-    self.logger.info("grad alpha %s, new beta %s initialized",
-                     self.grad_alpha.shape, self.new_beta.shape)
+    self.emb_in = np.random.normal( \
+      size=(self.num_words, self.opt.num_dims)).astype(np.float32)
+    out_words = self.num_words if self.opt.neg else self.num_words - 1
+    self.emb_out = np.random.uniform( \
+      size=(out_words, self.opt.num_dims)).astype(np.float32)
+    self.logger.info("emb_in %s, emb_out %s initialized",
+                     self.emb_in.shape, self.emb_out.shape)

     # push it to gpu
-    self.obj.load_model(self.alpha, self.beta, self.grad_alpha, self.new_beta)
+    self.obj.load_model(self.emb_in, self.emb_out)

   def train_model(self):
     self.preprocess_data()
     self.init_model()
+    if not self.opt.neg:
+      self.obj.build_huffman_tree(self.word_count)
     h5f = h5py.File(pjoin(self.opt.processed_data_dir, "token.h5"), "r")
     for epoch in range(1, self.opt.epochs + 1):
       self.logger.info("Epoch %d / %d", epoch, self.opt.epochs)
-      self._train_e_step(h5f)
-      self._train_m_step()
+      self._train_epoch(h5f)
+      self.pull()
     h5f.close()

-  def _train_e_step(self, h5f):
+  def _train_epoch(self, h5f):
     offset, size = 0, h5f["cols"].shape[0]
-    pbar = aux.Progbar(size, stateful_metrics=["train_loss", "vali_loss"])
-    train_loss_nume, train_loss_deno = 0, 0
-    vali_loss_nume, vali_loss_deno = 0, 0
+    pbar = aux.Progbar(size, stateful_metrics=["loss"])
+    loss_nume, loss_deno = 0, 0
     while True:
       target = h5f["indptr"][offset] + self.opt.batch_size
       if target < size:
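
When neg is zero, train_model hands the (already powered) word counts to build_huffman_tree; a binary tree over num_words leaves has num_words - 1 internal nodes, which is why emb_out is allocated with one fewer row in that mode. A rough pure-Python sketch of the classic construction — the binding's actual node layout is not shown in this commit, so this is illustrative only:

  import heapq
  import itertools

  def build_huffman_tree(counts):
    # repeatedly merge the two rarest nodes until one root remains;
    # ids >= len(counts) are internal, giving len(counts) - 1 of them
    next_id = itertools.count(len(counts))
    heap = [(cnt, i) for i, cnt in enumerate(counts)]
    heapq.heapify(heap)
    children = {}
    while len(heap) > 1:
      (c1, n1), (c2, n2) = heapq.heappop(heap), heapq.heappop(heap)
      parent = next(next_id)
      children[parent] = (n1, n2)
      heapq.heappush(heap, (c1 + c2, parent))
    return children  # maps internal node id -> (left child, right child)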
@@ -111,57 +109,21 @@ def _train_e_step(self, h5f):
       beg, end = indptr[0], indptr[-1]
       indptr -= beg
       cols = h5f["cols"][beg:end]
-      vali = (h5f["vali"][beg:end] < self.opt.vali_p).astype(np.bool)
       offset = next_offset

       # call cuda kernel
-      train_loss, vali_loss = \
-        self.obj.feed_data(cols, indptr, vali, self.opt.num_iters_in_e_step)
+      if self.opt.neg:
+        self.obj.build_random_table( \
+          self.word_count, self.opt.random_size, self.opt.num_threads)
+      _loss_nume, _loss_deno = \
+        self.obj.feed_data(cols, indptr)

       # accumulate loss
-      train_loss_nume -= train_loss
-      vali_loss_nume -= vali_loss
-      vali_cnt = np.count_nonzero(vali)
-      train_cnt = len(vali) - vali_cnt
-      train_loss_deno += train_cnt
-      vali_loss_deno += vali_cnt
-      train_loss = train_loss_nume / (train_loss_deno + EPS)
-      vali_loss = vali_loss_nume / (vali_loss_deno + EPS)
+      loss_nume += _loss_nume
+      loss_deno += _loss_deno
+      loss = loss_nume / (loss_deno + EPS)

       # update progress bar
-      pbar.update(end, values=[("train_loss", train_loss),
-                               ("vali_loss", vali_loss)])
+      pbar.update(end, values=[("loss", loss)])
       if end == size:
         break
-
-  def _train_m_step(self):
-    self.obj.pull()
-
-    # update beta
-    self.new_beta[:, :] = np.maximum(self.new_beta, EPS)
-    self.beta[:, :] = self.new_beta / np.sum(self.new_beta, axis=0)[None, :]
-    self.new_beta[:, :] = 0
-
-    # update alpha
-    alpha_sum = np.sum(self.alpha)
-    gvec = np.sum(self.grad_alpha, axis=0)
-    gvec += self.num_docs * (pg(0, alpha_sum) - pg(0, self.alpha))
-    hvec = self.num_docs * pg(1, self.alpha)
-    z_0 = pg(1, alpha_sum)
-    c_nume = np.sum(gvec / hvec)
-    c_deno = 1 / z_0 + np.sum(1 / hvec)
-    c_0 = c_nume / c_deno
-    delta = (gvec - c_0) / hvec
-    self.alpha -= delta
-    self.alpha[:] = np.maximum(self.alpha, EPS)
-    self.grad_alpha[:,:] = 0
-
-    self.obj.push()
-
-  def save_model(self, model_path):
-    self.logger.info("save model path: %s", model_path)
-    h5f = h5py.File(model_path, "w")
-    h5f.create_dataset("alpha", data=self.alpha)
-    h5f.create_dataset("beta", data=self.beta)
-    h5f.create_dataset("keys", data=np.array(self.words))
-    h5f.close()
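
With the LDA-specific M-step and save_model gone, the public surface of this draft is preprocess_data, init_model, and train_model. A minimal end-to-end sketch, assuming an illustrative corpus path and output directory:

  from cusim import CuW2V

  opt = {
    "data_path": "data.txt",            # corpus, one line per document (illustrative)
    "processed_data_dir": "processed",  # keys.txt, count.txt, token.h5 land here
    "neg": 10,                          # 0 would train with hierarchical softmax
    "epochs": 10,
  }
  w2v = CuW2V(opt)
  w2v.train_model()  # preprocess, init embeddings, then one pass per epoch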

cusim/ioutils/bindings.cc

Lines changed: 3 additions & 3 deletions

@@ -35,8 +35,8 @@ class IoUtilsBind {
     return obj_.TokenizeStream(num_lines, num_threads);
   }

-  void GetWordVocab(int min_count, std::string keys_path) {
-    obj_.GetWordVocab(min_count, keys_path);
+  void GetWordVocab(int min_count, std::string keys_path, std::string count_path) {
+    obj_.GetWordVocab(min_count, keys_path, count_path);
   }

   void GetToken(py::object& rows, py::object& cols, py::object& indptr) {
@@ -62,7 +62,7 @@ PYBIND11_PLUGIN(ioutils_bind) {
   .def("tokenize_stream", &IoUtilsBind::TokenizeStream,
       py::arg("num_lines"), py::arg("num_threads"))
   .def("get_word_vocab", &IoUtilsBind::GetWordVocab,
-      py::arg("min_count"), py::arg("keys_path"))
+      py::arg("min_count"), py::arg("keys_path"), py::arg("count_path"))
   .def("get_token", &IoUtilsBind::GetToken,
       py::arg("indices"), py::arg("indptr"), py::arg("offset"))
   .def("__repr__",

cusim/ioutils/pyioutils.py

Lines changed: 7 additions & 5 deletions

@@ -33,7 +33,8 @@ def __init__(self, opt=None):
     assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}"
     os.remove(tmp.name)

-  def load_stream_vocab(self, filepath, min_count, keys_path):
+  def load_stream_vocab(self, filepath, min_count,
+                        keys_path, count_path):
     full_num_lines = self.obj.load_stream_file(filepath)
     pbar = aux.Progbar(full_num_lines, unit_name="line",
                        stateful_metrics=["word_count"])
@@ -46,17 +47,18 @@ def load_stream_vocab(self, filepath, min_count, keys_path):
       pbar.update(processed, values=[("word_count", word_count)])
       if processed == full_num_lines:
         break
-    self.obj.get_word_vocab(min_count, keys_path)
+    self.obj.get_word_vocab(min_count, keys_path, count_path)

   def convert_stream_to_h5(self, filepath, min_count, out_dir,
                            chunk_indices=10000, seed=777):
     np.random.seed(seed)
     os.makedirs(out_dir, exist_ok=True)
     keys_path = pjoin(out_dir, "keys.txt")
+    count_path = pjoin(out_dir, "count.txt")
     token_path = pjoin(out_dir, "token.h5")
-    self.logger.info("save key and token to %s, %s",
-                     keys_path, token_path)
-    self.load_stream_vocab(filepath, min_count, keys_path)
+    self.logger.info("save key, count, token to %s, %s, %s",
+                     keys_path, count_path, token_path)
+    self.load_stream_vocab(filepath, min_count, keys_path, count_path)
     full_num_lines = self.obj.load_stream_file(filepath)
     pbar = aux.Progbar(full_num_lines, unit_name="line")
     processed = 0
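
convert_stream_to_h5 now leaves three artifacts per corpus. A sketch of the call and the files it produces (illustrative paths):

  from cusim import IoUtils

  ioutil = IoUtils()
  ioutil.convert_stream_to_h5("data.txt", min_count=5, out_dir="processed")
  # processed/keys.txt  -- one surviving word per line
  # processed/count.txt -- matching raw counts, same row order
  # processed/token.h5  -- tokenized stream as cols/indptr arrays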

cusim/proto/config.proto

Lines changed: 32 additions & 0 deletions

@@ -31,3 +31,35 @@ message CuLDAConfigProto {
   optional double vali_p = 13 [default = 0.2];
   optional int32 seed = 14 [default = 777];
 }
+
+message CuW2VConfigProto {
+  required string data_path = 7;
+
+  optional int32 py_log_level = 1 [default = 2];
+  optional int32 c_log_level = 2 [default = 2];
+
+  optional int32 num_dims = 3 [default = 50];
+  optional int32 block_dim = 4 [default = 32];
+  optional int32 hyper_threads = 5 [default = 10];
+  optional string processed_data_dir = 6;
+  optional bool skip_preprocess = 8;
+  optional int32 word_min_count = 9 [default = 5];
+  optional int32 batch_size = 10 [default = 100000];
+  optional int32 epochs = 11 [default = 10];
+
+  // seed fields
+  optional int32 seed = 14 [default = 777];
+  optional int32 table_seed = 15 [default = 777];
+  optional int32 cuda_seed = 16 [default = 777];
+  optional int32 random_size = 13 [default = 1000000];
+
+  optional int32 neg = 17 [default = 10];
+  // as recommended in w2v paper
+  optional double count_power = 18 [default = 0.75];
+  optional bool skip_gram = 19 [default = true];
+  optional bool use_mean = 20 [default = true];
+  optional double lr = 21 [default = 0.001];
+  optional int32 window_size = 22 [default = 5];
+}
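
These are the fields the Python wrapper serializes and hands to CuW2V::Init above. A hedged sketch of overriding a few of them (values illustrative; unlisted fields keep their proto defaults):

  from cusim import CuW2V

  opt = {
    "data_path": "data.txt",  # the only required field
    "num_dims": 100,          # embedding width, default 50
    "window_size": 5,         # context window size
    "skip_gram": False,       # CBOW; use_mean presumably averages context vectors
  }
  w2v = CuW2V(opt)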

setup.py

Lines changed: 15 additions & 0 deletions

@@ -98,6 +98,21 @@ def __init__(self, name):
               "cpp/include/", np.get_include(), pybind11.get_include(),
               pybind11.get_include(True), CUDA['include'],
               "3rd/json11", "3rd/spdlog/include"]),
+    Extension("cusim.cuw2v.cuw2v_bind",
+              sources=util_srcs + [ \
+                "cpp/src/cuw2v/cuw2v.cu",
+                "cusim/cuw2v/bindings.cc",
+                "3rd/json11/json11.cpp"],
+              language="c++",
+              extra_compile_args=extra_compile_args,
+              extra_link_args=["-fopenmp"],
+              library_dirs=[CUDA['lib64']],
+              libraries=['cudart', 'cublas', 'curand'],
+              extra_objects=[],
+              include_dirs=[ \
+                "cpp/include/", np.get_include(), pybind11.get_include(),
+                pybind11.get_include(True), CUDA['include'],
+                "3rd/json11", "3rd/spdlog/include"]),
 ]