dump keys

js1010 · js1010 · commit 6eed2f8ded9f · 2021-02-07T10:25:00.000+09:00
diff --git a/cpp/include/ioutils.hpp b/cpp/include/ioutils.hpp
@@ -33,10 +33,15 @@ class IoUtils {
   bool Init(std::string opt_path);
   int LoadStreamFile(std::string filepath);
   std::pair<int, int> ReadStreamForVocab(int num_lines, int num_threads);
-  void GetWordVocab(int min_count);
+  std::pair<int, int> TokenizeStream(int num_lines, int num_threads);
+  void GetWordVocab(int min_count, std::string keys_path);
+  void GetToken(int* indices, int* indptr, int offset);
  private:
   void ParseLine(std::string line, std::vector<std::string>& line_vec);
+  void ParseLineImpl(std::string line, std::vector<std::string>& line_vec);
 
+  std::vector<std::vector<int>> indices_;
+  std::vector<int> indptr_;
   std::mutex global_lock_;
   std::ifstream stream_fin_;
   json11::Json opt_;
diff --git a/cpp/src/ioutils.cc b/cpp/src/ioutils.cc
@@ -28,14 +28,19 @@ bool IoUtils::Init(std::string opt_path) {
 }
 
 void IoUtils::ParseLine(std::string line, std::vector<std::string>& ret) {
+  ParseLineImpl(line, ret);
+}
+
+
+void IoUtils::ParseLineImpl(std::string line, std::vector<std::string>& ret) {
   ret.clear();
   int n = line.size();
   std::string element;
   for (int i = 0; i < n; ++i) {
-    if (line[i] == ' ') {
+    if (line[i] == ' ' or line[i] == ',') {
       ret.push_back(element);
       element.clear();
-    } else {
+    } else if (line[i] != '"') {
       element += line[i];
     }
   }
@@ -61,6 +66,56 @@ int IoUtils::LoadStreamFile(std::string filepath) {
   return count;
 }
 
+// Tokenizes up to `num_lines` of the remaining stream lines using
+// `num_threads` OpenMP threads.
+//
+// Every parsed word that is present in `word_count_` contributes the value
+// stored for it to `indices_[i]`; words that are absent are skipped.
+// Afterwards `indptr_[i]` holds the cumulative token count through row i.
+//
+// Returns {lines consumed, total tokens stored}, or {0, 0} when there is
+// nothing (left) to read.
+std::pair<int, int> IoUtils::TokenizeStream(int num_lines, int num_threads) {
+  int read_lines = std::min(num_lines, remain_lines_);
+  // also rejects a negative request, which would otherwise reach resize()
+  if (read_lines <= 0) return {0, 0};
+  remain_lines_ -= read_lines;
+  indices_.clear();
+  indices_.resize(read_lines);
+  indptr_.resize(read_lines);
+  std::fill(indptr_.begin(), indptr_.end(), 0);
+  #pragma omp parallel num_threads(num_threads)
+  {
+    std::string line;
+    std::vector<std::string> line_vec;
+    #pragma omp for schedule(dynamic, 4)
+    for (int i = 0; i < read_lines; ++i) {
+      // get line thread-safely
+      // NOTE(review): lines are handed out in lock-acquisition order, so
+      // row i is not necessarily the i-th line of this chunk -- confirm that
+      // callers do not rely on row order.
+      {
+        std::unique_lock<std::mutex> lock(global_lock_);
+        getline(stream_fin_, line);
+      }
+
+      // seems to be bottle-neck
+      ParseLine(line, line_vec);
+
+      // tokenize: keep known words only. find() is used instead of
+      // count() + operator[] so the shared map is looked up once and is
+      // never mutated inside the parallel region (operator[] inserts
+      // missing keys, which would be a data race here).
+      for (auto& word: line_vec) {
+        auto it = word_count_.find(word);
+        if (it == word_count_.end()) continue;
+        // NOTE(review): this stores the value mapped in word_count_; if a
+        // dedicated word -> id map exists, it is presumably what belongs
+        // here -- confirm.
+        indices_[i].push_back(it->second);
+      }
+    }
+  }
+  // turn per-row sizes into inclusive prefix sums
+  int cumsum = 0;
+  for (int i = 0; i < read_lines; ++i) {
+    cumsum += indices_[i].size();
+    indptr_[i] = cumsum;
+  }
+  return {read_lines, cumsum};
+}
+
+// Copies the rows held in `indices_` into the caller-provided buffers.
+//
+// `indices` receives all rows back to back (it must have room for
+// `indptr_.back()` ints); `indptr[i]` receives `offset + indptr_[i]` for each
+// of the `indices_.size()` rows.
+void IoUtils::GetToken(int* indices, int* indptr, int offset) {
+  const int num_rows = indices_.size();
+  int row_begin = 0;  // where the current row starts in the flat output
+  for (int row = 0; row < num_rows; ++row) {
+    const int row_end = indptr_[row];
+    const std::vector<int>& tokens = indices_[row];
+    for (int k = 0; row_begin + k < row_end; ++k) {
+      indices[row_begin + k] = tokens[k];
+    }
+    indptr[row] = offset + row_end;
+    row_begin = row_end;
+  }
+}
+
 std::pair<int, int> IoUtils::ReadStreamForVocab(int num_lines, int num_threads) {
   int read_lines = std::min(num_lines, remain_lines_);
   remain_lines_ -= read_lines;
@@ -77,7 +132,7 @@ std::pair<int, int> IoUtils::ReadStreamForVocab(int num_lines, int num_threads)
         getline(stream_fin_, line);
       }
 
-      // seems to bottle-neck
+      // seems to be bottle-neck
       ParseLine(line, line_vec);
 
       // update private word count
@@ -94,10 +149,10 @@ std::pair<int, int> IoUtils::ReadStreamForVocab(int num_lines, int num_threads)
       }
     }
   }
-  return {read_lines, remain_lines_};
+  return {read_lines, word_count_.size()};
 }
 
-void IoUtils::GetWordVocab(int min_count) {
+void IoUtils::GetWordVocab(int min_count, std::string keys_path) {
   INFO("number of raw words: {}", word_count_.size());
   for (auto& it: word_count_) {
     if (it.second >= min_count) {
@@ -106,6 +161,18 @@ void IoUtils::GetWordVocab(int min_count) {
     }
   }
   INFO("number of words after filtering: {}", word_list_.size());
+
+  // write keys to csv file
+  std::ofstream fout(keys_path.c_str());
+  INFO("dump keys to {}", keys_path);
+  std::string header = "index,key\n";
+  fout.write(header.c_str(), header.size());
+  int n = word_list_.size();
+  for (int i = 0; i < n; ++i) {
+    std::string line = std::to_string(i) + ",\"" + word_list_[i] + "\"\n";
+    fout.write(line.c_str(), line.size());
+  }
+  fout.close();
 }
 
 }  // namespace cusim
diff --git a/cusim/ioutils/bindings.cc b/cusim/ioutils/bindings.cc
@@ -31,8 +31,18 @@ class IoUtilsBind {
     return obj_.ReadStreamForVocab(num_lines, num_threads);
   }
 
-  void GetWordVocab(int min_count) {
-    return obj_.GetWordVocab(min_count);
+  // Forwards to obj_.TokenizeStream and hands its pair result back unchanged.
+  std::pair<int, int> TokenizeStream(int num_lines, int num_threads) {
+    auto result = obj_.TokenizeStream(num_lines, num_threads);
+    return result;
+  }
+
+  // Forwards the minimum count and the keys output path to obj_.GetWordVocab.
+  void GetWordVocab(int min_count, std::string keys_path) {
+    this->obj_.GetWordVocab(min_count, keys_path);
+  }
+
+  // Wraps both Python objects as int_array and lets obj_.GetToken write into
+  // their buffers, starting at element 0.
+  // NOTE(review): if int_array permits implicit casting/copying, the writes
+  // land in a temporary and never reach the caller's arrays -- confirm the
+  // Python side always passes matching, writable arrays.
+  void GetToken(py::object& indices, py::object& indptr, int offset) {
+    int_array indices_arr(indices);
+    int_array indptr_arr(indptr);
+    auto* indices_out = indices_arr.mutable_data(0);
+    auto* indptr_out = indptr_arr.mutable_data(0);
+    obj_.GetToken(indices_out, indptr_out, offset);
   }
 
  private:
@@ -48,7 +58,12 @@ PYBIND11_PLUGIN(ioutils_bind) {
   .def("load_stream_file", &IoUtilsBind::LoadStreamFile, py::arg("filepath"))
   .def("read_stream_for_vocab", &IoUtilsBind::ReadStreamForVocab,
       py::arg("num_lines"), py::arg("num_threads"))
-  .def("get_word_vocab", &IoUtilsBind::GetWordVocab, py::arg("min_count"))
+  .def("tokenize_stream", &IoUtilsBind::TokenizeStream,
+      py::arg("num_lines"), py::arg("num_threads"))
+  .def("get_word_vocab", &IoUtilsBind::GetWordVocab,
+      py::arg("min_count"), py::arg("keys_path"))
+  .def("get_token", &IoUtilsBind::GetToken,
+      py::arg("indices"), py::arg("indptr"), py::arg("offset"))
   .def("__repr__",
   [](const IoUtilsBind &a) {
     return "<IoUtilsBind>";
diff --git a/cusim/ioutils/pyioutils.py b/cusim/ioutils/pyioutils.py
@@ -6,10 +6,11 @@
 
 # pylint: disable=no-name-in-module,too-few-public-methods,no-member
 import os
+from os.path import join as pjoin
+
 import json
 import tempfile
 import tqdm
-
 from cusim import aux
 from cusim.ioutils.ioutils_bind import IoUtilsBind
 from cusim.config_pb2 import IoUtilsConfigProto
@@ -29,15 +30,24 @@ def __init__(self, opt=None):
     assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}"
     os.remove(tmp.name)
 
-  def load_stream_vocab(self, filepath, min_count,
-                        chunk_lines=100000, num_threads=4):
+  def load_stream_vocab(self, filepath, min_count, keys_path):
     full_num_lines = self.obj.load_stream_file(filepath)
-    pbar = tqdm.trange(full_num_lines)
+    pbar = tqdm.trange(full_num_lines, unit="line",
+                       postfix={"word_count": 0})
+    processed = 0
     while True:
-      read_lines, remain_lines = \
-        self.obj.read_stream_for_vocab(chunk_lines, num_threads)
+      read_lines, word_count = \
+        self.obj.read_stream_for_vocab(
+          self.opt.chunk_lines, self.opt.num_threads)
+      processed += read_lines
+      pbar.set_postfix({"word_count": word_count}, refresh=False)
       pbar.update(read_lines)
-      if not remain_lines:
+      if processed == full_num_lines:
         break
     pbar.close()
-    self.obj.get_word_vocab(min_count)
+    self.obj.get_word_vocab(min_count, keys_path)
+
+  def convert_stream_to_h5(self, filepath, min_count, out_dir):
+    """Create `out_dir` if needed and run load_stream_vocab on `filepath`,
+    dumping the vocabulary keys to `<out_dir>/keys.csv`."""
+    os.makedirs(out_dir, exist_ok=True)
+    self.load_stream_vocab(filepath, min_count, pjoin(out_dir, "keys.csv"))
diff --git a/cusim/proto/config.proto b/cusim/proto/config.proto
@@ -9,4 +9,6 @@ syntax = "proto2";
 message IoUtilsConfigProto {
   optional int32 py_log_level = 1 [default = 2];
   optional int32 c_log_level = 2 [default = 2];
+  // number of lines requested per read_stream_for_vocab call
+  optional int32 chunk_lines = 3 [default = 100000];
+  // thread count passed to read_stream_for_vocab
+  optional int32 num_threads = 4 [default = 4];
 }
diff --git a/examples/example1.py b/examples/example1.py
@@ -17,6 +17,7 @@
 # DATASET = "wiki-english-20171001"
 DATASET = "fake-news"
 DATA_PATH = f"./res/{DATASET}.stream.txt"
+DATA_PATH2 = f"./res/{DATASET}-converted"
 MIN_COUNT = 5
 
 def download():
@@ -33,8 +34,8 @@ def download():
 
 def run():
   download()
-  iou = IoUtils()
-  iou.load_stream_vocab(DATA_PATH, 5, 100000, 8)
+  iou = IoUtils(opt={"chunk_lines": 10000, "num_threads": 8})
+  iou.convert_stream_to_h5(DATA_PATH, 5, DATA_PATH2)
 
 
 if __name__ == "__main__":
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
+tqdm
 jsmin
 numpy
 pybind11

Original file line number	Diff line number	Diff line change
`@@ -9,4 +9,6 @@ syntax = "proto2";`
`9`	`9`	`message IoUtilsConfigProto {`
`10`	`10`	`optional int32 py_log_level = 1 [default = 2];`
`11`	`11`	`optional int32 c_log_level = 2 [default = 2];`
	`12`	`+ optional int32 chunk_lines = 3 [default = 100000];`
	`13`	`+ optional int32 num_threads = 4 [default = 4];`
`12`	`14`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+tqdm`
`1`	`2`	`jsmin`
`2`	`3`	`numpy`
`3`	`4`	`pybind11`