
Commit 30af2c7

add files for pypi
1 parent 1db5a46 commit 30af2c7


5 files changed: +72 −20 lines


MANIFEST.in

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+include cuda_setup.py
+include requirements.txt
+include pyproject.toml
+recursive-include cpp/src/cuw2v/ *.cu
+recursive-include cpp/src/culda/ *.cu
+recursive-include cpp/src/ioutils/ *.cc
+recursive-include cpp/include/cuw2v/ *.cuh
+recursive-include cpp/include/cuw2v/ *.hpp
+recursive-include cpp/include/culda/ *.cuh
+recursive-include cpp/include/culda/ *.hpp
+recursive-include cpp/include/ioutils/ *.cuh
+recursive-include cpp/include/ioutils/ *.hpp
+recursive-include 3rd/json11/ *
+recursive-include 3rd/spdlog/ *
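The manifest pulls the CUDA/C++ sources and the vendored json11/spdlog trees into the source distribution so the extension can be compiled at install time. A quick way to confirm the declared files actually land in the sdist (a minimal sketch; the archive name is hypothetical and assumes `python setup.py sdist` has already been run):

# Minimal sketch: list an sdist's contents to confirm MANIFEST.in took
# effect. Assumes the sdist was built into dist/; the archive name below
# is hypothetical.
import tarfile

with tarfile.open("dist/cusim-0.0.1.tar.gz", "r:gz") as sdist:
    names = sdist.getnames()

# Spot-check a few entries declared in MANIFEST.in.
for expected in ("cuda_setup.py", "requirements.txt", "pyproject.toml"):
    assert any(n.endswith(expected) for n in names), f"missing {expected}"

print("\n".join(sorted(names)))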

README.md

Lines changed: 10 additions & 4 deletions
@@ -29,10 +29,10 @@ python setup.py install
 
 ### Performance
 
-- [AWS P3 2xlarge instance](https://aws.amazon.com/ec2/instance-types/p3/) is used to the experiment. (One Tesla V100 GPU with 8 vcpus)
-- results can be reproduced by running `examples/example_w2v.py` and `examples/example_lda.py`
-- To evaluate w2v model, we used `evaluate_word_pairs` function ([ref link](https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#evaluating)) in gensim, note that better performance on WS-353 test set does not mean that the model will workbetter in application as desribed on the link. However, it is good to be measured quantitively and fast training time will be at least very objective measure of performaance.
-- I trained W2V model on quora-duplicat-questions dataset from gensim downloader api with cusim and the performance with gensim.
+- An [AWS g4dn 2xlarge instance](https://aws.amazon.com/ec2/instance-types/g4/) is used for the experiments (one NVIDIA T4 GPU, 8 vCPUs, Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz).
+- Results can be reproduced by simply running `examples/example_w2v.py` and `examples/example_lda.py`.
+- To evaluate the w2v model, we used the `evaluate_word_pairs` function ([ref link](https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#evaluating)) in gensim. Note that better performance on the WS-353 test set does not necessarily mean the model will work better in applications, as described at the link. However, it is a useful quantitative measure, and fast training time is at least a very objective measure of performance.
+- I trained a W2V model on the `quora-duplicate-questions` dataset from the gensim downloader API on GPU with cusim and compared the performance (both speed and model quality) with gensim.
 - To evaluate the LDA model, I think there is no good way to measure the quality of training results quantitatively. But we can check the model by looking at the top words of each topic. Also, we can compare the training time here.
 - W2V (CBOW, negative sampling)
 
@@ -42,6 +42,12 @@ python setup.py install
 | pearson | 0.203882 | 0.207705 | 0.221758 | 0.198408 | **0.331749** |
 | spearman | 0.25208 | 0.254706 | 0.275231 | 0.238611 | **0.295346** |
 
+
+- LDA (`nytimes` dataset from https://archive.ics.uci.edu/ml/datasets/bag+of+words)
+  - I found that setting the `workers` variable in gensim's LdaMulticore does not work properly (it uses all cores on the instance anyway), so I just compared the speed of cusim on a single GPU against gensim on 8 vCPUs.
+  - One can compare the quality of the modeling by looking at `examples/cusim.topics.txt` and `examples/gensim.topics.txt`.
+
+
 ### Future tasks
 
 - support half precision
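For context, the `evaluate_word_pairs` call described in the bullets above is gensim's standard word-similarity benchmark. A minimal sketch of how it is used (the model path is hypothetical; `wordsim353.tsv` ships with gensim's test data):

# Minimal sketch of the WS-353 evaluation described above.
from gensim.models import Word2Vec
from gensim.test.utils import datapath

model = Word2Vec.load("w2v.model")  # hypothetical model path
pearson, spearman, oov_ratio = model.wv.evaluate_word_pairs(
    datapath("wordsim353.tsv"))
# pearson/spearman are (correlation, p-value) pairs; the README tables
# report the correlations, i.e. pearson[0] and spearman[0].
print(pearson[0], spearman[0], oov_ratio)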

examples/example_lda.py

Lines changed: 17 additions & 6 deletions
@@ -17,6 +17,7 @@
 import wget
 import h5py
 import numpy as np
+import pandas as pd
 
 # import gensim
 from gensim.models.ldamulticore import LdaMulticore
@@ -77,11 +78,12 @@ def run_cusim():
   start = time.time()
   lda = CuLDA(opt)
   lda.train_model()
-  LOGGER.info("elapsed for training LDA using cusim: %.4e sec",
-              time.time() - start)
+  el0 = time.time() - start
+  LOGGER.info("elapsed for training LDA using cusim: %.4e sec", el0)
   h5_model_path = pjoin(DIR_PATH, "cusim.lda.model.h5")
   lda.save_h5_model(h5_model_path)
   show_cusim_topics(h5_model_path)
+  return el0
 
 def show_cusim_topics(h5_model_path, topk=10):
   h5f = h5py.File(h5_model_path, "r")
@@ -129,15 +131,15 @@ def run_gensim():
     id2word[idx] = line.strip()
 
   start = time.time()
-  # 3 = real cores - 1
   lda = LdaMulticore(docs, num_topics=50, workers=None,
-                    id2word=id2word, iterations=10)
-  LOGGER.info("elapsed for training lda using gensim: %.4e sec",
-              time.time() - start)
+                     id2word=id2word, iterations=10)
+  el0 = time.time() - start
+  LOGGER.info("elapsed for training lda using gensim: %.4e sec", el0)
   model_path = pjoin(DIR_PATH, "gensim.lda.model")
   LOGGER.info("save gensim lda model to %s", model_path)
   lda.save(model_path)
   show_gensim_topics(model_path)
+  return el0
 
 def show_gensim_topics(model_path=None, topk=10):
   # load beta
@@ -174,5 +176,14 @@ def show_topics(beta, keys, topk, result_path):
   fout.close()
 
 
+def run_experiments():
+  training_time = {"attr": "training time (sec)"}
+  training_time["gensim (8 vcpus)"] = run_gensim()
+  training_time["cusim"] = run_cusim()
+  df0 = pd.DataFrame([training_time])
+  df0.set_index("attr", inplace=True)
+  print(df0.to_markdown())
+
+
 if __name__ == "__main__":
   fire.Fire()
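Note that `DataFrame.to_markdown()` delegates to the `tabulate` package, which must be installed separately. A minimal sketch of the table the new `run_experiments` prints (the numbers here are illustrative, not measured results):

# Illustrative only: shape of the markdown table run_experiments emits.
import pandas as pd  # to_markdown() additionally requires `tabulate`

training_time = {"attr": "training time (sec)",
                 "gensim (8 vcpus)": 447.4,  # illustrative numbers,
                 "cusim": 76.2}              # not measured results
df0 = pd.DataFrame([training_time])
df0.set_index("attr", inplace=True)
print(df0.to_markdown())  # a GitHub-flavored table, ready to paste into README.md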

examples/example_w2v.py

Lines changed: 25 additions & 10 deletions
@@ -126,23 +126,38 @@ def evaluate_w2v_model(model=GENSIM_MODEL):
   LOGGER.info("evaluation results: %s", results)
   return results
 
-def run_experiments(sg0=False, hs0=False):
-  training_time = {"attr": "training_time"}
+# the gpu_model argument is only used for display in the markdown table;
+# please pass the actual GPU model name
+def run_experiments(skip_gram=False, hierarchical_softmax=False,
+                    gpu_model="NVIDIA T4"):
+  training_time = {"attr": "training time (sec)"}
   pearson = {"attr": "pearson"}
   spearman = {"attr": "spearman"}
   for i in [1, 2, 4, 8]:
-    elapsed, evals = run_gensim(sg0, hs0, i)
-    training_time[f"{i} workers"] = elapsed
-    pearson[f"{i} workers"] = evals[0][0]
-    spearman[f"{i} workers"] = evals[1][0]
-  elapsed, evals = run_cusim(sg0, hs0)
-  training_time["GPU"] = elapsed
-  pearson["GPU"] = evals[0][0]
-  spearman["GPU"] = evals[1][0]
+    elapsed, evals = run_gensim(skip_gram, hierarchical_softmax, i)
+    training_time[f"{i} workers (gensim)"] = elapsed
+    pearson[f"{i} workers (gensim)"] = evals[0][0]
+    spearman[f"{i} workers (gensim)"] = evals[1][0]
+  elapsed, evals = run_cusim(skip_gram, hierarchical_softmax)
+  gpu_title = f"{gpu_model} (cusim)"
+  training_time[gpu_title] = elapsed
+  pearson[gpu_title] = evals[0][0]
+  spearman[gpu_title] = evals[1][0]
   df0 = pd.DataFrame([training_time, pearson, spearman])
   df0.set_index("attr", inplace=True)
   print(df0.to_markdown())
 
+# the gpu_model argument is only used for display in the markdown table;
+# please pass the actual GPU model name
+def run_various_experiments(gpu_model="NVIDIA T4"):
+  for sg0 in [True, False]:
+    for hs0 in [True, False]:
+      print("=" * 100)
+      LOGGER.info("setting: %s, %s",
+                  "skip gram" if sg0 else "cbow",
+                  "hierarchical softmax" if hs0 else "negative sampling")
+      run_experiments(sg0, hs0, gpu_model)
+
 
 if __name__ == "__main__":
   fire.Fire()
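Both example scripts hand control to python-fire, which turns every top-level function into a CLI subcommand. A self-contained sketch of that dispatch pattern (the toy function is hypothetical; the commented invocations mirror how the real experiment functions are meant to be run):

# Self-contained sketch of the python-fire dispatch both examples use.
import fire

def greet(name="world", shout=False):
    """Toy stand-in for run_experiments()/run_various_experiments()."""
    msg = f"hello, {name}"
    return msg.upper() if shout else msg

if __name__ == "__main__":
    # python this_script.py greet --name=cusim --shout
    # analogously, for the real script:
    #   python examples/example_w2v.py run_various_experiments --gpu_model="NVIDIA T4"
    fire.Fire()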

pyproject.toml

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+[build-system]
+requires = [
+    "setuptools>=1.3.2",
+    "numpy",
+    "pybind11"
+]
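These build requirements exist because pip (per PEP 518) installs everything in `[build-system].requires` into an isolated environment before running the build; numpy and pybind11 are listed because their headers are needed to compile the C++/CUDA extension. A minimal sketch of how such headers are typically consumed (not this repo's actual setup.py; the extension and source names are hypothetical):

# Minimal sketch (hypothetical, not this repo's setup.py): why numpy and
# pybind11 appear in [build-system].requires; their include directories
# are needed when compiling the extension.
import numpy as np
import pybind11
from setuptools import Extension

ext = Extension(
    "cusim_backend",                     # hypothetical extension name
    sources=["cpp/src/bindings.cc"],     # hypothetical source file
    include_dirs=[np.get_include(), pybind11.get_include()],
    language="c++",
)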
