
Commit 281e93b

Remove 'top 1' from CPU and GPU kernel

1. Remove 'top 1' (argmax) from the CPU and GPU kernels.
2. Add a new test case.
3. Refine the doc.
1 parent 579f684 commit 281e93b

File tree

5 files changed (+118, -155 lines)


paddle/operators/ctc_greedy_decode_op.cc renamed to paddle/operators/ctc_decode_op.cc

Lines changed: 25 additions & 19 deletions
@@ -29,14 +29,8 @@ class CTCGreedyDecodeOp : public framework::OperatorWithKernel {
 
     auto input_dims = ctx->GetInputDim("Input");
 
-    int sequence_width =
-        static_cast<int>(framework::product(input_dims) / input_dims[0]);
-    int blank = ctx->Attrs().Get<int>("blank");
-    PADDLE_ENFORCE((blank >= 0) && (blank < sequence_width),
-                   "The value of Attr(blank) should be in interval [0, %d).",
-                   sequence_width);
     // TODO(wanghaoshuang): it is tricky to set the wrong dimension here.
-    ctx->SetOutputDim("Output", {input_dims[0], 1});
+    ctx->SetOutputDim("Output", input_dims);
   }
 
  protected:
@@ -53,25 +47,37 @@ class CTCGreedyDecodeOpMaker : public framework::OpProtoAndCheckerMaker {
   CTCGreedyDecodeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Input",
-             "(LodTensor, default: LoDTensor<float>), the unscaled "
-             "probabilities of variable-length sequences, which is a 2-D "
-             "Tensor with LoD information. It's shape is "
-             "[Lp, num_classes + 1], where Lp is the sum of all input "
-             "sequences' length and num_classes is the true number of classes "
-             "(not including the blank label).");
-    AddOutput("Output", "(Tensor, default: Tensor<int>), the decode result ");
+             "(LodTensor, default: LoDTensor<int>), Its shape is "
+             "[Lp, 1], where Lp is the sum of all input sequences' length.");
+    AddOutput("Output", "(Tensor, default: Tensor<int>), The decode result.");
     AddAttr<int>("blank",
                  "(int, default: 0), the blank label setted in Connectionist "
-                 "Temporal Classification (CTC) op, and it is in the "
-                 "half-opened interval [0, num_classes + 1).")
+                 "Temporal Classification (CTC) op.")
         .SetDefault(0);
     AddAttr<bool>("merge_repeated",
                   "(bool, default: true), whether to "
                   "merge repeated elements between two blanks. ")
         .SetDefault(true);
     AddComment(R"DOC(
-CTCGreedyDecoder is an implementation of the simple best path decoding
-algorithm, selecting at each timestep the most likely class at each timestep.
+CTCDecoder is used to merge repeated elements between two blanks
+and then delete all blanks in sequence.
+
+Given:
+    Input.data = [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6,
+                  6, 0, 0, 7, 7, 7, 0]
+    Input.dims = {18, 1}
+    Input.LoD = [[0, 11, 18]]
+
+And:
+    blank = 0
+    merge_repeated = True
+
+Then:
+    Output.data = [1, 2, 4, 4, 5, 6,
+                   6, 7]
+    Output.dims = {8, 1}
+    Output.LoD = [[0, 6, 8]]
+
 )DOC");
   }
 };
@@ -85,4 +91,4 @@ REGISTER_OPERATOR(ctc_greedy_decode, ops::CTCGreedyDecodeOp,
                   paddle::framework::EmptyGradOpMaker);
 REGISTER_OP_CPU_KERNEL(
     ctc_greedy_decode,
-    ops::CTCGreedyDecodeKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::CTCGreedyDecodeKernel<paddle::platform::CPUDeviceContext, int>);
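The Given/Then example in the refined DOC comment can be checked with a small host-side sketch. This is illustrative NumPy only, not part of the operator; the helper name ctc_decode_ref is made up here.

# Minimal sketch of the decode rule documented above (assumes NumPy only);
# `ctc_decode_ref` is a hypothetical helper, not part of the Paddle API.
import numpy as np

def ctc_decode_ref(tokens, lod0, blank=0, merge_repeated=True):
    out, out_lod0 = [], [0]
    for i in range(len(lod0) - 1):
        prev = None
        for t in tokens[lod0[i]:lod0[i + 1]]:
            if t != blank and not (merge_repeated and t == prev):
                out.append(t)
            prev = t
        out_lod0.append(len(out))        # absolute end offset per sequence
    return np.array(out).reshape(-1, 1), [out_lod0]

data = [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0]
out, lod = ctc_decode_ref(data, [0, 11, 18])
assert out.flatten().tolist() == [1, 2, 4, 4, 5, 6, 6, 7]   # Output.data
assert lod == [[0, 6, 8]]                                   # Output.LoD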

paddle/operators/ctc_greedy_decode_op.cu renamed to paddle/operators/ctc_decode_op.cu

Lines changed: 15 additions & 62 deletions
@@ -16,62 +16,20 @@ limitations under the License. */
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #include "paddle/operators/ctc_greedy_decode_op.h"
-#include "paddle/platform/cuda_helper.h"
-#include "paddle/platform/gpu_info.h"
 
 namespace paddle {
 namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-__device__ static float atomicMaxF(float* address, float val) {
-  int* address_as_i = (int*)address;
-  int old = *address_as_i, assumed;
-  do {
-    assumed = old;
-    old = ::atomicCAS(address_as_i, assumed,
-                      __float_as_int(::fmaxf(val, __int_as_float(assumed))));
-  } while (assumed != old);
-  return __int_as_float(old);
-}
-
-template <typename T, int BlockSize>
-__global__ void ArgmaxCudaKernel(const size_t seq_width, const T* logits,
-                                 int* output) {
-  T local_max_value = 0;
-  int local_max_index = 0;
-  __shared__ T max_value;
-  if (threadIdx.x == 0) {
-    max_value = 0;
-  }
-  __syncthreads();
-
-  for (int i = threadIdx.x; i < seq_width; i += BlockSize) {
-    T value = logits[blockIdx.x * seq_width + i];
-    if (value > local_max_value) {
-      local_max_value = value;
-      local_max_index = i;
-    }
-  }
-
-  atomicMaxF(&max_value, local_max_value);
-
-  __syncthreads();
-
-  if (local_max_value == max_value) {
-    output[blockIdx.x] = local_max_index;
-  }
-}
 
 template <typename T>
-__global__ void MergeAndDelCudaKernel(const int64_t num_token, int* tokens,
+__global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens,
                                       const size_t num_seq, size_t* lod0,
                                       const int blank, const int merge_repeated,
-                                      size_t* out_lod0, int* output) {
+                                      size_t* out_lod0, T* output) {
   int ouput_idx = 0;
   out_lod0[0] = 0;
 
   for (int i = 0; i < num_seq; ++i) {
-    int pre_token = -1;
+    T pre_token = -1;
     for (int j = lod0[i]; j < lod0[i + 1]; ++j) {
       if (tokens[j] != blank && !(merge_repeated && tokens[j] == pre_token)) {
         output[ouput_idx] = tokens[j];
@@ -89,44 +47,39 @@ class CTCGreedyDecodeOpCUDAKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use CUDAPlace.");
+    const size_t level = 0;
     auto* input = ctx.Input<LoDTensor>("Input");
     auto* output = ctx.Output<LoDTensor>("Output");
+    auto input_lod = framework::ToAbsOffset(input->lod());
 
+    const T* tokens = input->data<T>();
     const int64_t num_tokens = input->dims()[0];
-    const size_t seq_width = input->numel() / num_tokens;
-    const T* logits = input->data<T>();
-    Tensor tmp;
-    int* tokens = tmp.mutable_data<int>({num_tokens, 1}, ctx.GetPlace());
-    // get argmax
-    // platform::GpuMemsetAsync(args, 0, sizeof(float), stream);
-
-    auto stream = ctx.cuda_device_context().stream();
-    ArgmaxCudaKernel<T, PADDLE_CUDA_NUM_THREADS><<<
-        num_tokens, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(seq_width, logits,
-                                                          tokens);
-
-    const size_t level = 0;
-    auto input_lod = framework::ToAbsOffset(input->lod());
     const size_t num_seq = input_lod[level].size() - 1;
+
     const int blank = ctx.Attr<int>("blank");
     const int merge_repeated =
        static_cast<int>(ctx.Attr<bool>("merge_repeated"));
 
+    // prepare a lod to record lod information while merging elements
     thrust::device_vector<size_t> dev_out_lod0(input_lod[level].size());
     size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data());
 
-    int* output_data =
-        output->mutable_data<int>({num_tokens, 1}, ctx.GetPlace());
+    // merge elements and delete blank
+    T* output_data = output->mutable_data<T>({num_tokens, 1}, ctx.GetPlace());
+
+    auto stream = ctx.cuda_device_context().stream();
     MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
        num_tokens, tokens, num_seq, input_lod[level].data(), blank,
        merge_repeated, dev_out_lod0_ptr, output_data);
 
+    // set output lod
     thrust::host_vector<size_t> host_out_lod0(dev_out_lod0.begin(),
                                               dev_out_lod0.end());
     framework::LoD out_lod;
     out_lod.push_back(host_out_lod0);
     output->set_lod(out_lod);
 
+    // resize output dims
     output->Resize({static_cast<int64_t>(host_out_lod0.back()), 1});
   }
 };
@@ -135,4 +88,4 @@ class CTCGreedyDecodeOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 REGISTER_OP_CUDA_KERNEL(ctc_greedy_decode,
-                        paddle::operators::CTCGreedyDecodeOpCUDAKernel<float>);
+                        paddle::operators::CTCGreedyDecodeOpCUDAKernel<int>);
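MergeAndDelCudaKernel is launched on a single thread and records a running end offset per sequence into out_lod0; the host then copies those offsets back, installs them as the output LoD, and shrinks the buffer that was pre-allocated with num_tokens rows down to the last offset. A rough NumPy stand-in for that allocate-then-resize bookkeeping (the kept-token counts below are taken from the DOC example; this is a sketch, not the CUDA code):

# Sketch of the host-side bookkeeping around the kernel launch (NumPy only).
import numpy as np

num_tokens = 18                                  # rows of the input LoDTensor
kept_per_sequence = [6, 2]                       # survivors of merge + blank removal
out_lod0 = np.cumsum([0] + kept_per_sequence)    # -> [0, 6, 8], the new LoD level

output = np.empty((num_tokens, 1), dtype=np.int32)  # mutable_data({num_tokens, 1})
# ... the kernel writes the kept tokens into output[0:out_lod0[-1]] ...
output = output[:out_lod0[-1]]                   # output->Resize({lod.back(), 1})
print(out_lod0.tolist(), output.shape)           # [0, 6, 8] (8, 1)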

paddle/operators/ctc_greedy_decode_op.h renamed to paddle/operators/ctc_decode_op.h

Lines changed: 16 additions & 18 deletions
@@ -16,7 +16,6 @@ limitations under the License. */
 
 #include <string.h>
 #include "paddle/framework/op_registry.h"
-#include "unsupported/Eigen/CXX11/Tensor"
 namespace paddle {
 namespace operators {
 
@@ -30,47 +29,46 @@ class CTCGreedyDecodeKernel : public framework::OpKernel<T> {
     auto* input = ctx.Input<LoDTensor>("Input");
     auto* output = ctx.Output<LoDTensor>("Output");
     const size_t level = 0;
-
     auto input_lod = framework::ToAbsOffset(input->lod());
+
+    // check input dims and lod
     auto input_dims = input->dims();
     PADDLE_ENFORCE_EQ(input_dims[0],
                       static_cast<int64_t>(input_lod[level].back()),
                       "The first dimension of Input(Input) should be equal to "
                       "the sum of all sequences' lengths.");
 
     const size_t num_sequences = input_lod[level].size() - 1;
-    const size_t sequence_width = input->numel() / input_dims[0];
     size_t blank = static_cast<size_t>(ctx.Attr<int>("blank"));
     bool merge_repeated = ctx.Attr<bool>("merge_repeated");
+
+    // merge repeated tokens and delete blank
     std::vector<std::vector<int>> pathes(num_sequences);
     std::vector<size_t> output_lod0(1, 0);
-
     const T* input_data = input->data<T>();
-    Eigen::Map<
-        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
-        input_mat(const_cast<T*>(input_data), input->numel() / sequence_width,
-                  sequence_width);
-
-    size_t max_class_idx;
-    size_t prev_class_idx = -1;
     for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) {
+      T prev_token = -1;
       for (size_t i = input_lod[level][seq_idx];
            i < input_lod[level][seq_idx + 1]; ++i) {
-        input_mat.row(i).maxCoeff(&max_class_idx);
-        if (max_class_idx != blank &&
-            !(merge_repeated && max_class_idx == prev_class_idx)) {
-          pathes[seq_idx].push_back(max_class_idx);
+        if (input_data[i] != blank &&
+            !(merge_repeated && input_data[i] == prev_token)) {
+          pathes[seq_idx].push_back(input_data[i]);
         }
-        prev_class_idx = max_class_idx;
+        prev_token = input_data[i];
       }
       output_lod0.push_back(output_lod0.back() + pathes[seq_idx].size());
     }
+
+    // set output lod
     framework::LoD output_lod;
     output_lod.push_back(output_lod0);
     output->set_lod(output_lod);
-    int64_t num_step = static_cast<int64_t>(output_lod0.back());
-    int* output_data = output->mutable_data<int>({num_step, 1}, ctx.GetPlace());
 
+    // resize output dims
+    T* output_data = output->mutable_data<T>(
+        {static_cast<int64_t>(output_lod0.back()), 1}, ctx.GetPlace());
+
+    // copy result to output
     for (int i = 0; i < num_sequences; ++i) {
       memcpy(output_data + output_lod0[i], pathes[i].data(),
              sizeof(int) * pathes[i].size());
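Note that the rewritten loop re-initializes prev_token to -1 at the start of each sequence, so merge_repeated never collapses an identical token across a sequence boundary. A toy sketch (made-up data, not from the commit) of why that reset matters:

# Toy illustration: without the per-sequence reset, the first token of the
# second sequence would be merged into the last token of the first one.
def decode(tokens, lod0, blank=0, merge_repeated=True, reset_per_seq=True):
    out, prev = [], -1
    for i in range(len(lod0) - 1):
        if reset_per_seq:
            prev = -1                    # mirrors `T prev_token = -1;` per sequence
        for t in tokens[lod0[i]:lod0[i + 1]]:
            if t != blank and not (merge_repeated and t == prev):
                out.append(t)
            prev = t
    return out

tokens = [0, 2, 2, 2, 3]                 # two sequences: [0, 2, 2] and [2, 3]
lod0 = [0, 3, 5]
print(decode(tokens, lod0))                       # [2, 2, 3]: second 2 is kept
print(decode(tokens, lod0, reset_per_seq=False))  # [2, 3]: 2 merged across sequences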
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+import sys
+import unittest
+import numpy as np
+from op_test import OpTest
+from test_softmax_op import stable_softmax
+
+
+def CTCDecode(input, lod, blank, merge_repeated):
+    lod0 = lod[0]
+    result = []
+    for i in range(len(lod0) - 1):
+        prev_token = -1
+        for j in range(lod0[i], lod0[i + 1]):
+            token = input[j][0]
+            if (token != blank) and not (merge_repeated and
+                                         token == prev_token):
+                result.append(token)
+            prev_token = token
+    result = np.array(result).reshape([len(result), 1]).astype("int32")
+    return result
+
+
+class TestCTCDecodeOp(OpTest):
+    def config(self):
+        self.op_type = "ctc_greedy_decode"
+        self.input_lod = [[0, 11, 18]]
+        self.blank = 0
+        self.merge_repeated = False
+        self.input = np.array(
+            [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0]).reshape(
+                [18, 1]).astype("int32")
+
+    def setUp(self):
+        self.config()
+        output = CTCDecode(self.input, self.input_lod, self.blank,
+                           self.merge_repeated)
+
+        self.inputs = {"Input": (self.input, self.input_lod), }
+        self.outputs = {"Output": output}
+        self.attrs = {
+            "blank": self.blank,
+            "merge_repeated": self.merge_repeated
+        }
+
+    def test_check_output(self):
+        self.check_output()
+        pass
+
+
+class TestCTCDecodeOpCase1(TestCTCDecodeOp):
+    def config(self):
+        self.op_type = "ctc_greedy_decode"
+        self.input_lod = [[0, 11, 18]]
+        self.blank = 0
+        self.merge_repeated = True
+        self.input = np.array(
+            [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0]).reshape(
+                [18, 1]).astype("int32")
+
+
+if __name__ == "__main__":
+    unittest.main()
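For contrast with the DOC example, the base test case sets merge_repeated=False, so only blanks are stripped and repeated tokens survive. A standalone restatement of what the CTCDecode reference helper returns for that setting (illustrative only, not extra test code from the commit):

# Expected decode for the merge_repeated=False case: drop blanks only.
import numpy as np

data = np.array([0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6,
                 6, 0, 0, 7, 7, 7, 0]).reshape([18, 1]).astype("int32")
expected = [int(t) for t in data.flatten() if t != 0]
print(expected)   # [1, 2, 2, 4, 4, 5, 6, 6, 7, 7, 7]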

python/paddle/v2/fluid/tests/test_ctc_greedy_decode.py

Lines changed: 0 additions & 56 deletions
This file was deleted.
