
Commit c064f69

use mutex

1 parent: 38c83db

4 files changed (+33 -5 lines)


cpp/include/culda/cuda_lda_kernels.cuh

Lines changed: 21 additions & 3 deletions
@@ -31,7 +31,8 @@ __global__ void EstepKernel(
   const int num_topics, const int num_iters,
   float* gamma, float* new_gamma, float* phi,
   const float* alpha, const float* beta,
-  float* grad_alpha, float* new_beta, float* train_losses, float* vali_losses) {
+  float* grad_alpha, float* new_beta,
+  float* train_losses, float* vali_losses, int* mutex) {
 
   // storage for block
   float* _gamma = gamma + num_topics * blockIdx.x;
@@ -57,6 +58,7 @@ __global__ void EstepKernel(
       for (int k = beg; k < end; ++k) {
         const int w = cols[k];
         const bool _vali = vali[k];
+
         // compute phi
         if (not _vali or j + 1 == num_iters) {
           for (int l = threadIdx.x; l < num_topics; l += blockDim.x)
@@ -65,17 +67,33 @@ __global__ void EstepKernel(
 
           // normalize phi and add it to new gamma and new beta
           float phi_sum = ReduceSum(_phi, num_topics);
+
           for (int l = threadIdx.x; l < num_topics; l += blockDim.x) {
             _phi[l] /= phi_sum;
             if (not _vali) _new_gamma[l] += _phi[l];
+          }
+          __syncthreads();
+        }
+
+        if (j + 1 == num_iters) {
+          // write access of w th vector of new_beta
+          if (threadIdx.x == 0) {
+            while (atomicCAS(&mutex[w], 0, 1)) {}
+          }
+
+          __syncthreads();
+          for (int l = threadIdx.x; l < num_topics; l += blockDim.x) {
             if (j + 1 == num_iters) {
               if (not _vali) new_beta[w * num_topics + l] += _phi[l];
               _phi[l] *= beta[w * num_topics + l];
             }
           }
           __syncthreads();
-        }
-        if (j + 1 == num_iters) {
+
+          // release lock
+          if (threadIdx.x == 0) mutex[w] = 0;
+          __syncthreads();
+
           float p = fmaxf(EPS, ReduceSum(_phi, num_topics));
           if (threadIdx.x == 0) {
             if (_vali)
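
The change wraps the final-iteration update of new_beta in a per-word spin lock: thread 0 of each block claims mutex[w] via atomicCAS(0 -> 1), __syncthreads() holds the other threads at the gate, and after the row update thread 0 stores 0 to release. Having a single thread per block spin avoids intra-warp divergence deadlocks, while the barriers extend the exclusion to the whole block. A minimal self-contained sketch of the same pattern (AddRowKernel and its parameters are illustrative names, not part of this repo):

#include <cuda_runtime.h>

// Each block accumulates its private vector into row rows[blockIdx.x]
// of `out`; mutex[w] == 0 means row w is free, 1 means some block holds it.
__global__ void AddRowKernel(const float* src, float* out, int* mutex,
                             const int* rows, int num_cols) {
  const int w = rows[blockIdx.x];

  // acquire: atomicCAS returns the previous value, so the loop exits
  // exactly when the 0 -> 1 transition succeeds
  if (threadIdx.x == 0) {
    while (atomicCAS(&mutex[w], 0, 1)) {}
  }
  __syncthreads();

  // critical section: the whole block updates row w exclusively
  for (int l = threadIdx.x; l < num_cols; l += blockDim.x)
    out[w * num_cols + l] += src[blockIdx.x * num_cols + l];
  __syncthreads();

  // release: a plain store of 0 reopens the row
  if (threadIdx.x == 0) mutex[w] = 0;
}

One caveat on this pattern: without a __threadfence() between the row update and the releasing store, another block may in principle observe the unlock before the updated values; per-element atomicAdd on new_beta would sidestep the lock entirely at the cost of one atomic per write.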

cpp/include/culda/culda.hpp

Lines changed: 3 additions & 0 deletions
@@ -70,13 +70,16 @@ class CuLDA {
   void Pull();
   void Push();
   int GetBlockCnt();
+
  private:
   DeviceInfo dev_info_;
   json11::Json opt_;
   std::shared_ptr<spdlog::logger> logger_;
   thrust::device_vector<float> dev_alpha_, dev_beta_;
   thrust::device_vector<float> dev_grad_alpha_, dev_new_beta_;
   thrust::device_vector<float> dev_gamma_, dev_new_gamma_, dev_phi_;
+  thrust::device_vector<int> dev_mutex_;
+
   float *alpha_, *beta_, *grad_alpha_, *new_beta_;
   int block_cnt_, block_dim_;
   int num_topics_, num_words_;

cpp/src/culda/culda.cu

Lines changed: 8 additions & 2 deletions
@@ -51,13 +51,18 @@ void CuLDA::LoadModel(float* alpha, float* beta,
   new_beta_ = new_beta;
   dev_grad_alpha_.resize(num_topics_ * block_cnt_);
   dev_new_beta_.resize(num_topics_ * num_words_);
-
   // copy to device
   thrust::copy(grad_alpha_, grad_alpha_ + block_cnt_ * num_topics_, dev_grad_alpha_.begin());
   thrust::copy(new_beta_, new_beta_ + num_words_ * num_topics_, dev_new_beta_.begin());
   dev_gamma_.resize(num_topics_ * block_cnt_);
   dev_new_gamma_.resize(num_topics_ * block_cnt_);
   dev_phi_.resize(num_topics_ * block_cnt_);
+
+  // set mutex
+  dev_mutex_.resize(num_words_);
+  std::vector<int> host_mutex(num_words_, 0);
+  thrust::copy(host_mutex.begin(), host_mutex.end(), dev_mutex_.begin());
+
   CHECK_CUDA(cudaDeviceSynchronize());
 }
 
@@ -91,7 +96,8 @@ std::pair<float, float> CuLDA::FeedData(
     thrust::raw_pointer_cast(dev_grad_alpha_.data()),
     thrust::raw_pointer_cast(dev_new_beta_.data()),
     thrust::raw_pointer_cast(dev_train_losses.data()),
-    thrust::raw_pointer_cast(dev_vali_losses.data()));
+    thrust::raw_pointer_cast(dev_vali_losses.data()),
+    thrust::raw_pointer_cast(dev_mutex_.data()));
   CHECK_CUDA(cudaDeviceSynchronize());
   DEBUG0("run E step in GPU");
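
A side note on the zero initialization above: thrust::device_vector value-initializes new elements on resize, so the lock words could also be cleared on the device without the intermediate host vector. A sketch under that assumption (ResetMutex is a hypothetical helper, not from this commit):

#include <thrust/device_vector.h>
#include <thrust/fill.h>

// hypothetical helper: (re)allocate one lock word per vocabulary term,
// leaving every word in the unlocked (0) state
void ResetMutex(thrust::device_vector<int>& dev_mutex, int num_words) {
  dev_mutex.resize(num_words);  // new elements are value-initialized to 0
  // explicit device-side fill, in case the vector is reused across loads
  thrust::fill(dev_mutex.begin(), dev_mutex.end(), 0);
}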

examples/example1.py

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ def run_lda():
   opt = {
     "data_path": DATA_PATH,
     "processed_data_dir": PROCESSED_DATA_DIR,
+    "skip_preprocess":True,
   }
   lda = CuLDA(opt)
   lda.train_model()
