Commit bab04b0

train only train samples
1 parent 36d92ca commit bab04b0

8 files changed: +98 −24 lines changed

cpp/include/culda/cuda_lda_kernels.cuh

Lines changed: 33 additions & 14 deletions

@@ -26,17 +26,19 @@ float Digamma(float x) {
 }
 
 __global__ void EstepKernel(
-    const int* cols, const int* indptr,
+    const int* cols, const int* indptr, const bool* vali,
     const int num_cols, const int num_indptr,
     const int num_words, const int num_topics, const int num_iters,
     float* gamma, float* new_gamma, float* phi,
     float* alpha, float* beta,
-    float* grad_alpha, float* new_beta) {
+    float* grad_alpha, float* new_beta, float* train_losses, float* vali_losses) {
 
   // storage for block
   float* _gamma = gamma + num_topics * blockIdx.x;
   float* _new_gamma = new_gamma + num_topics * blockIdx.x;
   float* _phi = phi + num_topics * blockIdx.x;
+  float* _grad_alpha = grad_alpha + num_topics * blockIdx.x;
+
 
   for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
     int beg = indptr[i], end = indptr[i + 1];
@@ -56,18 +58,34 @@ __global__ void EstepKernel(
       // compute phi from gamma
       for (int k = beg; k < end; ++k) {
         const int w = cols[k];
+        const bool _vali = vali[k];
+
         // compute phi
-        for (int l = threadIdx.x; l < num_topics; l += blockDim.x)
-          _phi[l] = beta[w * num_topics + l] * expf(Digamma(_gamma[l]));
-        __syncthreads();
-
-        // normalize phi and add it to new gamma and new beta
-        float phi_sum = ReduceSum(_phi, num_topics);
-        for (int l = threadIdx.x; l < num_topics; l += blockDim.x) {
-          _phi[l] /= phi_sum;
-          _new_gamma[l] += _phi[l];
-          if (j + 1 == num_iters)
-            new_beta[w * num_topics + l] += phi[l];
+        if (not _vali or j + 1 == num_iters) {
+          for (int l = threadIdx.x; l < num_topics; l += blockDim.x)
+            _phi[l] = beta[w * num_topics + l] * expf(Digamma(_gamma[l]));
+          __syncthreads();
+
+          // normalize phi and add it to new gamma and new beta
+          float phi_sum = ReduceSum(_phi, num_topics);
+          for (int l = threadIdx.x; l < num_topics; l += blockDim.x) {
+            _phi[l] /= phi_sum;
+            if (not _vali) _new_gamma[l] += _phi[l];
+            if (j + 1 == num_iters) {
+              if (not _vali) new_beta[w * num_topics + l] += _phi[l];
+              _phi[l] *= beta[w * num_topics + l];
+            }
+          }
+          __syncthreads();
+        }
+        if (j + 1 == num_iters) {
+          float p = ReduceSum(_phi, num_topics);
+          if (threadIdx.x == 0) {
+            if (_vali)
+              vali_losses[blockIdx.x] += logf(p + EPS);
+            else
+              train_losses[blockIdx.x] += logf(p + EPS);
+          }
         }
         __syncthreads();
       }
@@ -79,7 +97,8 @@ __global__ void EstepKernel(
     }
     float gamma_sum = ReduceSum(_gamma, num_topics);
     for (int j = threadIdx.x; j < num_topics; j += blockDim.x)
-      grad_alpha[j] += (Digamma(_gamma[j]) - Digamma(gamma_sum));
+      _grad_alpha[j] += (Digamma(_gamma[j]) - Digamma(gamma_sum));
+
     __syncthreads();
   }
 }
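
The change above gates the variational updates on the new vali mask: validation tokens never contribute to new_gamma, new_beta, or grad_alpha, and on the last iteration the kernel accumulates per-token log likelihoods into separate per-block train/validation loss buffers. Below is a minimal single-document NumPy/SciPy sketch of that logic, not the kernel itself; the function name, the explicit alpha argument, and the dense-array layout are illustrative assumptions.

import numpy as np
from scipy.special import digamma

EPS = 1e-10

def e_step_doc(words, vali_mask, alpha, gamma, beta, new_beta, num_iters):
    # words: token ids of one document; vali_mask[k] is True for held-out tokens
    train_loss, vali_loss = 0.0, 0.0
    for it in range(num_iters):
        last = (it + 1 == num_iters)
        new_gamma = np.zeros_like(gamma)
        for w, is_vali in zip(words, vali_mask):
            phi = None
            if not is_vali or last:
                phi = beta[w] * np.exp(digamma(gamma))
                phi /= phi.sum()
                if not is_vali:
                    new_gamma += phi        # only train tokens update gamma
                if last:
                    if not is_vali:
                        new_beta[w] += phi  # only train tokens update beta
                    phi = phi * beta[w]     # per-topic contribution to p(w)
            if last:                        # phi is always set on the last iteration
                p = float(phi.sum())
                if is_vali:
                    vali_loss += np.log(p + EPS)
                else:
                    train_loss += np.log(p + EPS)
        gamma[:] = alpha + new_gamma        # standard variational gamma update (outside the shown diff)
    return train_loss, vali_loss

Because the log likelihoods are non-positive, the returned sums are negative; the Python layer negates them before averaging (see pyculda.py below).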

cpp/include/culda/culda.hpp

Lines changed: 2 additions & 1 deletion

@@ -64,7 +64,8 @@ class CuLDA {
   bool Init(std::string opt_path);
   void LoadModel(float* alpha, float* beta,
       float* grad_alpha, float* new_beta, const int num_words);
-  void FeedData(const int* indices, const int* indptr,
+  std::pair<float, float> FeedData(
+      const int* indices, const int* indptr, const bool* vali,
       const int num_indices, const int num_indptr, const int num_iters);
   void Pull();
   void Push();

cpp/include/utils/types.hpp

Lines changed: 1 addition & 0 deletions

@@ -11,3 +11,4 @@ struct DeviceInfo {
 };
 
 #define WARP_SIZE 32
+#define EPS 1e-10f

cpp/src/culda/culda.cu

Lines changed: 25 additions & 2 deletions

@@ -61,27 +61,50 @@ void CuLDA::LoadModel(float* alpha, float* beta,
   CHECK_CUDA(cudaDeviceSynchronize());
 }
 
-void CuLDA::FeedData(const int* cols, const int* indptr,
+std::pair<float, float> CuLDA::FeedData(
+    const int* cols, const int* indptr, const bool* vali,
     const int num_cols, const int num_indptr, const int num_iters) {
+
+  // copy feed data to GPU memory
   thrust::device_vector<int> dev_cols(num_cols);
   thrust::device_vector<int> dev_indptr(num_indptr + 1);
+  thrust::device_vector<bool> dev_vali(num_cols);
+  thrust::device_vector<float> dev_train_losses(block_cnt_, 0.0f);
+  thrust::device_vector<float> dev_vali_losses(block_cnt_, 0.0f);
   thrust::copy(cols, cols + num_cols, dev_cols.begin());
   thrust::copy(indptr, indptr + num_indptr + 1, dev_indptr.begin());
+  thrust::copy(vali, vali + num_cols, dev_vali.begin());
+
   CHECK_CUDA(cudaDeviceSynchronize());
 
+  // run E step in GPU
   EstepKernel<<<block_cnt_, block_dim_>>>(
     thrust::raw_pointer_cast(dev_cols.data()),
     thrust::raw_pointer_cast(dev_indptr.data()),
+    thrust::raw_pointer_cast(dev_vali.data()),
     num_cols, num_indptr, num_words_, num_topics_, num_iters,
     thrust::raw_pointer_cast(dev_gamma_.data()),
     thrust::raw_pointer_cast(dev_new_gamma_.data()),
     thrust::raw_pointer_cast(dev_phi_.data()),
     thrust::raw_pointer_cast(dev_alpha_.data()),
     thrust::raw_pointer_cast(dev_beta_.data()),
     thrust::raw_pointer_cast(dev_grad_alpha_.data()),
-    thrust::raw_pointer_cast(dev_new_beta_.data()));
+    thrust::raw_pointer_cast(dev_new_beta_.data()),
+    thrust::raw_pointer_cast(dev_train_losses.data()),
+    thrust::raw_pointer_cast(dev_vali_losses.data()));
 
   CHECK_CUDA(cudaDeviceSynchronize());
+
+  // pull loss
+  std::vector<float> train_losses(block_cnt_), vali_losses(block_cnt_);
+  thrust::copy(dev_train_losses.begin(), dev_train_losses.end(), train_losses.begin());
+  thrust::copy(dev_vali_losses.begin(), dev_vali_losses.end(), vali_losses.begin());
+  CHECK_CUDA(cudaDeviceSynchronize());
+
+  // accumulate
+  float train_loss = std::accumulate(train_losses.begin(), train_losses.end(), 0.0f);
+  float vali_loss = std::accumulate(vali_losses.begin(), vali_losses.end(), 0.0f);
+  return {train_loss, vali_loss};
 }
 
 void CuLDA::Pull() {

cusim/culda/bindings.cc

Lines changed: 9 additions & 4 deletions

@@ -14,6 +14,7 @@ namespace py = pybind11;
 
 typedef py::array_t<float, py::array::c_style | py::array::forcecast> float_array;
 typedef py::array_t<int, py::array::c_style | py::array::forcecast> int_array;
+typedef py::array_t<bool, py::array::c_style | py::array::forcecast> bool_array;
 
 class CuLDABind {
  public:
@@ -54,17 +55,21 @@ class CuLDABind {
         _new_beta.mutable_data(0), num_words);
   }
 
-  void FeedData(py::object& cols, py::object indptr, const int num_iters) {
+  std::pair<float, float> FeedData(py::object& cols, py::object& indptr, py::object& vali, const int num_iters) {
     int_array _cols(cols);
     int_array _indptr(indptr);
+    bool_array _vali(vali);
     auto cols_buffer = _cols.request();
     auto indptr_buffer = _indptr.request();
-    if (cols_buffer.ndim != 1 or indptr_buffer.ndim != 1) {
+    auto vali_buffer = _vali.request();
+    if (cols_buffer.ndim != 1 or indptr_buffer.ndim != 1 or vali_buffer.ndim != 1
+        or cols_buffer.shape[0] != vali_buffer.shape[0]) {
       throw std::runtime_error("invalid cols or indptr");
     }
     int num_cols = cols_buffer.shape[0];
    int num_indptr = indptr_buffer.shape[0];
-    obj_.FeedData(_cols.data(0), _indptr.data(0), num_cols, num_indptr, num_iters);
+    return obj_.FeedData(_cols.data(0), _indptr.data(0), _vali.data(0),
+                         num_cols, num_indptr, num_iters);
   }
 
   void Pull() {
@@ -93,7 +98,7 @@ PYBIND11_PLUGIN(culda_bind) {
       py::arg("alpha"), py::arg("beta"),
      py::arg("grad_alpha"), py::arg("new_beta"))
  .def("feed_data", &CuLDABind::FeedData,
-      py::arg("cols"), py::arg("indptr"), py::arg("num_iters"))
+      py::arg("cols"), py::arg("indptr"), py::arg("vali"), py::arg("num_iters"))
  .def("pull", &CuLDABind::Pull)
  .def("push", &CuLDABind::Push)
  .def("get_block_cnt", &CuLDABind::GetBlockCnt)

cusim/culda/pyculda.py

Lines changed: 21 additions & 3 deletions

@@ -92,7 +92,9 @@ def train_model(self):
 
   def _train_e_step(self, h5f):
     offset, size = 0, h5f["cols"].shape[0]
-    pbar = aux.Progbar(size)
+    pbar = aux.Progbar(size, stateful_metrics=["train_loss", "vali_loss"])
+    train_loss_nume, train_loss_deno = 0, 0
+    vali_loss_nume, vali_loss_deno = 0, 0
     while True:
       target = h5f["indptr"][offset] + self.opt.batch_size
       if target < size:
@@ -103,10 +105,26 @@ def _train_e_step(self, h5f):
       beg, end = indptr[0], indptr[-1]
       indptr -= beg
       cols = h5f["cols"][beg:end]
+      vali = (h5f["vali"][beg:end] < self.opt.vali_p).astype(np.bool)
       offset = next_offset
 
-      self.obj.FeedData(cols, indptr, self.opt.num_iters_in_e_step)
-      pbar.update(end)
+      # call cuda kernel
+      train_loss, vali_loss = \
+        self.obj.FeedData(cols, indptr, vali, self.opt.num_iters_in_e_step)
+
+      # accumulate loss
+      train_loss_nume -= train_loss
+      vali_loss_nume -= vali_loss
+      vali_cnt = np.count_nonzero(vali)
+      train_cnt = len(vali) - vali_cnt
+      train_loss_deno += train_cnt
+      vali_loss_deno += vali_cnt
+      train_loss = train_loss_nume / train_loss_deno
+      vali_loss = vali_loss_nume / vali_loss_deno
+
+      # update progress bar
+      pbar.update(end, values=[("train_loss", train_loss),
+                               ("vali_loss", vali_loss)])
       if end == size:
         break
cusim/ioutils/pyioutils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ def convert_stream_to_h5(self, filepath, min_count, out_dir,
6666
cols = h5f.create_dataset("cols", shape=(chunk_indices,),
6767
maxshape=(None,), dtype=np.int32,
6868
chunks=(chunk_indices,))
69+
vali = h5f.create_dataset("vali", shape=(chunk_indices,),
70+
maxshape=(None,), dtype=np.float32,
71+
chunks=(chunk_indices,))
6972
indptr = h5f.create_dataset("indptr", shape=(full_num_lines + 1,),
7073
dtype=np.int32, chunks=True)
7174
processed, offset = 1, 0
@@ -81,6 +84,9 @@ def convert_stream_to_h5(self, filepath, min_count, out_dir,
8184
rows[offset:offset + data_size] = _rows + (processed - 1)
8285
cols.resize((offset + data_size,))
8386
cols[offset:offset + data_size] = _cols
87+
vali.resize((offset + data_size,))
88+
vali[offset:offset + data_size] = \
89+
np.uniform(size=(data_size,)).astype(np.float32)
8490
indptr[processed:processed + read_lines] = _indptr + offset
8591
offset += data_size
8692
processed += read_lines
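
The converter now stores one uniform draw in [0, 1) per token occurrence in the "vali" dataset; at training time pyculda.py thresholds those draws by vali_p, so roughly a vali_p fraction of tokens is held out for validation. A short illustrative snippet (variable names are not from the code base):

import numpy as np

vali_draws = np.random.uniform(size=8).astype(np.float32)  # as stored in the h5 "vali" dataset
vali_p = 0.2                                               # CuLDAConfigProto.vali_p default
vali_mask = vali_draws < vali_p                            # True -> held-out (validation) token
train_mask = ~vali_mask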

cusim/proto/config.proto

Lines changed: 1 addition & 0 deletions

@@ -28,4 +28,5 @@ message CuLDAConfigProto {
   optional int32 batch_size = 10 [default = 100000];
   optional int32 epochs = 11 [default = 10];
   optional int32 num_iters_in_e_step = 12 [default = 5];
+  optional double vali_p = 13 [default = 0.2];
 }
