implement InitRngsKernel

js1010 · js1010 · commit 084c4cb27c2a · 2021-02-12T23:21:01.000+09:00
diff --git a/cpp/include/cuw2v/cuw2v.hpp b/cpp/include/cuw2v/cuw2v.hpp
@@ -80,8 +80,7 @@ class CuW2V {
 
   // variables to construct random table
   thrust::device_vector<int> dev_random_table_;
-  int random_size_, table_seed_;
-  std::mt19937 table_rng_;
+  int random_size_, table_seed_, cuda_seed_;
   thrust::device_vector<default_random_engine> dev_rngs_;
 };
 
diff --git a/cpp/include/utils/cuda_utils_kernels.cuh b/cpp/include/utils/cuda_utils_kernels.cuh
@@ -208,4 +208,8 @@ float ReduceSum(const float* vec, const int length) {
   return shared[0];
 }
 
+__global__ void InitRngsKernel(default_random_engine* rngs, int rand_seed) {
+  rngs[blockIdx.x].seed(blockIdx.x + rand_seed);
+}
+
 } // namespace cusim
diff --git a/cpp/src/cuw2v/cuw2v.cu b/cpp/src/cuw2v/cuw2v.cu
@@ -46,11 +46,13 @@ bool CuW2V::Init(std::string opt_path) {
   // if zero, we will use hierarchical softmax
   neg_ = opt_["negative_sampling"].int_value(); 
   
-  // set seed for constructing random table of negative sampling
+  // random seed 
   table_seed_ = opt_["table_seed"].int_value();
-  const unsigned int table_seed = table_seed_;
-  table_rng_.seed(table_seed);
-  
+  cuda_seed_ = opt_["cuda_seed"].int_value();
+  dev_rngs_.resize(block_cnt_);
+  InitRngsKernel<<<block_cnt_, 1>>>(
+    thrust::raw_pointer_cast(dev_rngs_.data()), cuda_seed_);
+
   INFO("num_dims: {}, block_dim: {}, block_cnt: {}, objective type: {}, neg: {}", 
       num_dims_, block_dim_, block_cnt_, sg_? "skip gram": "cbow", neg_);
   return true;
@@ -63,22 +65,25 @@ void CuW2V::BuildRandomTable(const float* word_count, const int num_words,
   std::vector<float> acc;
   float cumsum = 0;
   for (int i = 0; i < num_words; ++i) {
-    cumsum += word_count[i];
     acc.push_back(cumsum);
+    cumsum += word_count[i];
   }
 
-  std::uniform_real_distribution<float> dist(0.0f, cumsum);
   dev_random_table_.resize(random_size_);
   std::vector<int> host_random_table(table_size);
   #pragma omp parallel num_threads(num_threads)
   {
+    const unsigned int table_seed = table_seed_ + omp_get_thread_num();
+    std::mt19937 rng(table_seed);
+    std::uniform_real_distribution<float> dist(0.0f, cumsum);
     #pragma omp for schedule(static)
     for (int i = 0; i < random_size_; ++i) {
-      float r = dist(table_rng_);
+      float r = dist(rng);
       int pos = std::lower_bound(acc.begin(), acc.end(), r) - acc.begin();
       host_random_table[i] = pos;
     }
   }
+  table_seed_ += num_threads;
 
   thrust::copy(host_random_table.begin(), host_random_table.end(), dev_random_table_.begin());
   CHECK_CUDA(cudaDeviceSynchronize());
@@ -148,6 +153,8 @@ void CuW2V::BuildHuffmanTree(const float* word_count, const int num_words) {
   thrust::copy(host_points.begin(), host_points.end(), dev_points_.begin());
   thrust::copy(host_hs_indptr.begin(), host_hs_indptr.end(), dev_hs_indptr_.begin());
   CHECK_CUDA(cudaDeviceSynchronize());
+  
+  huffman_nodes.clear();
 }
 
 void CuW2V::LoadModel(float* emb_in, float* emb_out) {

Original file line number	Diff line number	Diff line change
`@@ -208,4 +208,8 @@ float ReduceSum(const float* vec, const int length) {`
`208`	`208`	`return shared[0];`
`209`	`209`	`}`
`210`	`210`
	`211`	`+__global__ void InitRngsKernel(default_random_engine* rngs, int rand_seed) {`
	`212`	`+ rngs[blockIdx.x].seed(blockIdx.x + rand_seed);`
	`213`	`+}`
	`214`	`+`
`211`	`215`	`} // namespace cusim`