bug-fix in tokenization: look up token ids in word_idmap_ instead of word_count_

js1010 · js1010 · commit b97c1489721f · 2021-02-10T23:33:09.000+09:00
diff --git a/cpp/include/culda/cuda_lda_kernels.cuh b/cpp/include/culda/cuda_lda_kernels.cuh
@@ -28,7 +28,7 @@ float Digamma(float x) {
 __global__ void EstepKernel(
   const int* cols, const int* indptr, const bool* vali,
   const int num_cols, const int num_indptr,
-  const int num_words, const int num_topics, const int num_iters,
+  const int num_topics, const int num_iters,
   float* gamma, float* new_gamma, float* phi,
   float* alpha, float* beta,
   float* grad_alpha, float* new_beta, float* train_losses, float* vali_losses) {
@@ -39,10 +39,8 @@ __global__ void EstepKernel(
   float* _phi = phi + num_topics * blockIdx.x;
   float* _grad_alpha = grad_alpha + num_topics * blockIdx.x;
 
-
   for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
     int beg = indptr[i], end = indptr[i + 1];
-    
     // initialize gamma
     for (int j = threadIdx.x; j < num_topics; j += blockDim.x)
       _gamma[j] = alpha[j] + (end - beg) / num_topics;
@@ -59,7 +57,6 @@ __global__ void EstepKernel(
       for (int k = beg; k < end; ++k) {
         const int w = cols[k];
         const bool _vali = vali[k];
-
         // compute phi
         if (not _vali or j + 1 == num_iters) {
           for (int l = threadIdx.x; l < num_topics; l += blockDim.x)
diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu
@@ -39,7 +39,7 @@ bool CuLDA::Init(std::string opt_path) {
 void CuLDA::LoadModel(float* alpha, float* beta, 
     float* grad_alpha, float* new_beta, int num_words) {
   num_words_ = num_words;
-  DEBUG("copy model({} x {})", num_topics_, num_words_);
+  DEBUG("copy model({} x {})", num_words_, num_topics_);
   dev_alpha_.resize(num_topics_);
   dev_beta_.resize(num_topics_ * num_words_);
   thrust::copy(alpha, alpha + num_topics_, dev_alpha_.begin());
@@ -49,7 +49,7 @@ void CuLDA::LoadModel(float* alpha, float* beta,
   // resize device vector
   grad_alpha_ = grad_alpha;
   new_beta_ = new_beta;
-  dev_grad_alpha_.resize(block_cnt_ * num_topics_);
+  dev_grad_alpha_.resize(num_topics_ * block_cnt_);
   dev_new_beta_.resize(num_topics_ * num_words_);
 
   // copy to device
@@ -74,15 +74,15 @@ std::pair<float, float> CuLDA::FeedData(
   thrust::copy(cols, cols + num_cols, dev_cols.begin());
   thrust::copy(indptr, indptr + num_indptr + 1, dev_indptr.begin());
   thrust::copy(vali, vali + num_cols, dev_vali.begin());
-  
   CHECK_CUDA(cudaDeviceSynchronize());
+  DEBUG0("copy feed data to GPU memory");
 
   // run E step in GPU
   EstepKernel<<<block_cnt_, block_dim_>>>(
     thrust::raw_pointer_cast(dev_cols.data()),
     thrust::raw_pointer_cast(dev_indptr.data()),
     thrust::raw_pointer_cast(dev_vali.data()),
-    num_cols, num_indptr, num_words_, num_topics_, num_iters,
+    num_cols, num_indptr, num_topics_, num_iters,
     thrust::raw_pointer_cast(dev_gamma_.data()),
     thrust::raw_pointer_cast(dev_new_gamma_.data()),
     thrust::raw_pointer_cast(dev_phi_.data()),
@@ -92,14 +92,15 @@ std::pair<float, float> CuLDA::FeedData(
     thrust::raw_pointer_cast(dev_new_beta_.data()),
     thrust::raw_pointer_cast(dev_train_losses.data()),
     thrust::raw_pointer_cast(dev_vali_losses.data()));
-  
   CHECK_CUDA(cudaDeviceSynchronize());
+  DEBUG0("run E step in GPU");
 
   // pull loss
   std::vector<float> train_losses(block_cnt_), vali_losses(block_cnt_);
   thrust::copy(dev_train_losses.begin(), dev_train_losses.end(), train_losses.begin());
   thrust::copy(dev_vali_losses.begin(), dev_vali_losses.end(), vali_losses.begin());
   CHECK_CUDA(cudaDeviceSynchronize());
+  DEBUG0("pull loss values");
 
   // accumulate
   float train_loss = std::accumulate(train_losses.begin(), train_losses.end(), 0.0f);
diff --git a/cpp/src/utils/ioutils.cc b/cpp/src/utils/ioutils.cc
@@ -90,8 +90,8 @@ std::pair<int, int> IoUtils::TokenizeStream(int num_lines, int num_threads) {
 
       // tokenize
       for (auto& word: line_vec) {
-        if (not word_count_.count(word)) continue;
-        cols_[i].push_back(word_count_[word]);
+        if (not word_idmap_.count(word)) continue;
+        cols_[i].push_back(word_idmap_[word]);
       }
     }
   }
@@ -155,6 +155,7 @@ std::pair<int, int> IoUtils::ReadStreamForVocab(int num_lines, int num_threads)
 
 void IoUtils::GetWordVocab(int min_count, std::string keys_path) {
   INFO("number of raw words: {}", word_count_.size());
+  word_idmap_.clear(); word_list_.clear();
   for (auto& it: word_count_) {
     if (it.second >= min_count) {
       word_idmap_[it.first] = word_idmap_.size();
diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc
@@ -47,7 +47,7 @@ class CuLDABind {
       throw std::runtime_error("invalid grad_alpha or new_beta");
     }
 
-    int num_words = beta_buffer.shape[1];
+    int num_words = beta_buffer.shape[0];
 
     return obj_.LoadModel(_alpha.mutable_data(0),
         _beta.mutable_data(0),
@@ -67,7 +67,7 @@ class CuLDABind {
       throw std::runtime_error("invalid cols or indptr");
     }
     int num_cols = cols_buffer.shape[0];
-    int num_indptr = indptr_buffer.shape[0];
+    int num_indptr = indptr_buffer.shape[0] - 1;
     return obj_.FeedData(_cols.data(0), _indptr.data(0), _vali.data(0),
         num_cols, num_indptr, num_iters);
   }
diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py
@@ -19,6 +19,8 @@
 from cusim.culda.culda_bind import CuLDABind
 from cusim.config_pb2 import CuLDAConfigProto
 
+EPS = 1e-10
+
 class CuLDA:
   def __init__(self, opt=None):
     self.opt = aux.get_opt_as_proto(opt or {}, CuLDAConfigProto)
@@ -77,6 +79,8 @@ def init_model(self):
     self.grad_alpha = np.zeros(shape=(block_cnt, self.opt.num_topics),
                                dtype=np.float32)
     self.new_beta = np.zeros(shape=self.beta.shape, dtype=np.float32)
+    self.logger.info("grad alpha %s, new beta %s initialized",
+                     self.grad_alpha.shape, self.new_beta.shape)
 
     # push it to gpu
     self.obj.load_model(self.alpha, self.beta, self.grad_alpha, self.new_beta)
@@ -118,10 +122,10 @@ def _train_e_step(self, h5f):
       vali_loss_nume -= vali_loss
       vali_cnt = np.count_nonzero(vali)
       train_cnt = len(vali) - vali_cnt
-      train_loss_nume += train_cnt
-      vali_loss_nume += train_cnt
-      train_loss = train_loss_nume / train_loss_deno
-      vali_loss = vali_loss_nume / vali_loss_deno
+      train_loss_deno += train_cnt
+      vali_loss_deno += vali_cnt
+      train_loss = train_loss_nume / (train_loss_deno + EPS)
+      vali_loss = vali_loss_nume / (vali_loss_deno + EPS)
 
       # update progress bar
       pbar.update(end, values=[("train_loss", train_loss),
diff --git a/examples/example1.py b/examples/example1.py
@@ -42,6 +42,8 @@ def run_lda():
   opt = {
     "data_path": DATA_PATH,
     "data_dir": DATA_PATH2,
+    # "skip_preprocess": True,
+    "c_log_level": 3,
   }
   lda = CuLDA(opt)
   lda.train_model()

Original file line number	Diff line number	Diff line change
`@@ -90,8 +90,8 @@ std::pair<int, int> IoUtils::TokenizeStream(int num_lines, int num_threads) {`
`90`	`90`
`91`	`91`	`// tokenize`
`92`	`92`	`for (auto& word: line_vec) {`
`93`		`- if (not word_count_.count(word)) continue;`
`94`		`- cols_[i].push_back(word_count_[word]);`
	`93`	`+ if (not word_idmap_.count(word)) continue;`
	`94`	`+ cols_[i].push_back(word_idmap_[word]);`
`95`	`95`	`}`
`96`	`96`	`}`
`97`	`97`	`}`
`@@ -155,6 +155,7 @@ std::pair<int, int> IoUtils::ReadStreamForVocab(int num_lines, int num_threads)`
`155`	`155`
`156`	`156`	`void IoUtils::GetWordVocab(int min_count, std::string keys_path) {`
`157`	`157`	`INFO("number of raw words: {}", word_count_.size());`
	`158`	`+ word_idmap_.clear(); word_list_.clear();`
`158`	`159`	`for (auto& it: word_count_) {`
`159`	`160`	`if (it.second >= min_count) {`
`160`	`161`	`word_idmap_[it.first] = word_idmap_.size();`
Original file line number	Diff line number	Diff line change
`@@ -47,7 +47,7 @@ class CuLDABind {`
`47`	`47`	`throw std::runtime_error("invalid grad_alpha or new_beta");`
`48`	`48`	`}`
`49`	`49`
`50`		`- int num_words = beta_buffer.shape[1];`
	`50`	`+ int num_words = beta_buffer.shape[0];`
`51`	`51`
`52`	`52`	`return obj_.LoadModel(_alpha.mutable_data(0),`
`53`	`53`	`_beta.mutable_data(0),`
`@@ -67,7 +67,7 @@ class CuLDABind {`
`67`	`67`	`throw std::runtime_error("invalid cols or indptr");`
`68`	`68`	`}`
`69`	`69`	`int num_cols = cols_buffer.shape[0];`
`70`		`- int num_indptr = indptr_buffer.shape[0];`
	`70`	`+ int num_indptr = indptr_buffer.shape[0] - 1;`
`71`	`71`	`return obj_.FeedData(_cols.data(0), _indptr.data(0), _vali.data(0),`
`72`	`72`	`num_cols, num_indptr, num_iters);`
`73`	`73`	`}`
Original file line number	Diff line number	Diff line change
`@@ -42,6 +42,8 @@ def run_lda():`
`42`	`42`	`opt = {`
`43`	`43`	`"data_path": DATA_PATH,`
`44`	`44`	`"data_dir": DATA_PATH2,`
	`45`	`+ # "skip_preprocess": True,`
	`46`	`+ "c_log_level": 3,`
`45`	`47`	`}`
`46`	`48`	`lda = CuLDA(opt)`
`47`	`49`	`lda.train_model()`