Skip to content

Commit 38c83db

Browse files
committed
change proto field name
1 parent c7df6e4 commit 38c83db

File tree

3 files changed

+22
-19
lines changed

3 files changed

+22
-19
lines changed

cusim/culda/pyculda.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -44,20 +44,21 @@ def preprocess_data(self):
4444
if self.opt.skip_preprocess:
4545
return
4646
iou = IoUtils()
47-
if not self.opt.data_dir:
48-
self.opt.data_dir = tempfile.TemporaryDirectory().name
47+
if not self.opt.processed_data_dir:
48+
self.opt.processed_data_dir = tempfile.TemporaryDirectory().name
4949
iou.convert_stream_to_h5(self.opt.data_path, self.opt.word_min_count,
50-
self.opt.data_dir)
50+
self.opt.processed_data_dir)
5151

5252
def init_model(self):
5353
# load voca
54-
self.logger.info("load key from %s", pjoin(self.opt.data_dir, "keys.txt"))
55-
with open(pjoin(self.opt.data_dir, "keys.txt"), "rb") as fin:
54+
data_dir = self.opt.processed_data_dir
55+
self.logger.info("load key from %s", pjoin(data_dir, "keys.txt"))
56+
with open(pjoin(data_dir, "keys.txt"), "rb") as fin:
5657
self.words = [line.strip() for line in fin]
5758
self.num_words = len(self.words)
5859

5960
# count number of docs
60-
h5f = h5py.File(pjoin(self.opt.data_dir, "token.h5"), "r")
61+
h5f = h5py.File(pjoin(data_dir, "token.h5"), "r")
6162
self.num_docs = h5f["indptr"].shape[0] - 1
6263
h5f.close()
6364

@@ -88,7 +89,7 @@ def init_model(self):
8889
def train_model(self):
8990
self.preprocess_data()
9091
self.init_model()
91-
h5f = h5py.File(pjoin(self.opt.data_dir, "token.h5"), "r")
92+
h5f = h5py.File(pjoin(self.opt.processed_data_dir, "token.h5"), "r")
9293
for epoch in range(1, self.opt.epochs + 1):
9394
self.logger.info("Epoch %d / %d", epoch, self.opt.epochs)
9495
self._train_e_step(h5f)

cusim/proto/config.proto

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ message CuLDAConfigProto {
2222
optional int32 num_topics = 3 [default = 10];
2323
optional int32 block_dim = 4 [default = 32];
2424
optional int32 hyper_threads = 5 [default = 10];
25-
optional string data_dir = 6;
25+
optional string processed_data_dir = 6;
2626
optional bool skip_preprocess = 8;
2727
optional int32 word_min_count = 9 [default = 5];
2828
optional int32 batch_size = 10 [default = 100000];

examples/example1.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,12 @@
1717
LOGGER = aux.get_logger()
1818
DOWNLOAD_PATH = "./res"
1919
# DATASET = "wiki-english-20171001"
20-
DATASET = "fake-news"
20+
DATASET = "quora-duplicate-questions"
2121
DATA_PATH = f"./res/{DATASET}.stream.txt"
22-
DATA_PATH2 = f"./res/{DATASET}-converted"
22+
LDA_PATH = f"./res/{DATASET}-lda.h5"
23+
PROCESSED_DATA_DIR = f"./res/{DATASET}-converted"
2324
MIN_COUNT = 5
25+
TOPK = 10
2426

2527
def download():
2628
if os.path.exists(DATA_PATH):
@@ -37,28 +39,28 @@ def download():
3739
def run_io():
3840
download()
3941
iou = IoUtils(opt={"chunk_lines": 10000, "num_threads": 8})
40-
iou.convert_stream_to_h5(DATA_PATH, 5, DATA_PATH2)
42+
iou.convert_stream_to_h5(DATA_PATH, 5, PROCESSED_DATA_DIR)
4143

4244

4345
def run_lda():
4446
opt = {
4547
"data_path": DATA_PATH,
46-
"data_dir": DATA_PATH2,
47-
# "skip_preprocess": True,
48-
# "c_log_level": 3,
48+
"processed_data_dir": PROCESSED_DATA_DIR,
4949
}
5050
lda = CuLDA(opt)
5151
lda.train_model()
52-
lda.save_model("res/lda.h5")
53-
h5f = h5py.File("res/lda.h5", "r")
52+
lda.save_model(LDA_PATH)
53+
h5f = h5py.File(LDA_PATH, "r")
5454
beta = h5f["beta"][:]
55-
for i in range(lda.opt.num_topics):
55+
word_list = h5f["keys"][:]
56+
num_topics = h5f["alpha"].shape[0]
57+
for i in range(num_topics):
5658
print("=" * 50)
5759
print(f"topic {i + 1}")
5860
words = np.argsort(-beta.T[i])[:10]
5961
print("-" * 50)
60-
for j in range(10):
61-
word = lda.words[words[j]].decode("utf8")
62+
for j in range(TOPK):
63+
word = word_list[words[j]].decode("utf8")
6264
prob = beta[words[j], i]
6365
print(f"rank {j + 1}. word: {word}, prob: {prob}")
6466
h5f.close()

0 commit comments

Comments
 (0)