Address review comments: fix typos and clarify comments and operator docs.

lcy-seso · lcy-seso · commit 2ac9a3d8dcc6 · 2017-10-31T20:05:00.000+08:00
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
@@ -235,7 +235,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
   PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
   PADDLE_ENFORCE_LT(
       begin_idx, end_idx,
-      "The start row index must be smaller than the end row index.");
+      "The start row index must be lesser than the end row index.");
 
   if (dims_[0] == 1) {
     return *this;
diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc
@@ -26,17 +26,16 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
         "Emission",
         "(LoDTensor, default: LoDTensor<float>). "
         "The unscaled emission weight matrix for the linear chain CRF. "
-        "This input is a LoDTensor with shape [N x D] where N is the total "
-        "element number of all input squences in a mini-batch, "
-        "and D is the total tag number.");
+        "This input is a LoDTensor with shape [N x D] where N is the size of "
+        "the mini-batch and D is the total tag number.");
     AddInput(
         "Transition",
         "(Tensor, default: Tensor<float>). A Tensor with shape [(D + 2) x D]. "
         "The learnable parameter for the linear_chain_crf operator. "
         "See more details in the operator's comments.");
     AddInput(
         "Label",
-        "(LoDTensor, default: LoDTensor<int>). The groundtruth which is a 2-D "
+        "(LoDTensor, default: LoDTensor<int>). The ground truth which is a 2-D "
         "LoDTensor with shape [N x 1], where N is the total element number in "
         "a mini-batch.");
     AddOutput(
@@ -77,12 +76,13 @@ variables. CRF learns the conditional probability \f$P(Y|X)\f$, where
 
 Linear chain CRF is a special case of CRF that is useful for sequence labeling
 task. Sequence labeling tasks do not assume a lot of conditional
-independences among inputs. They only concern about the input and the output
-being linear sequences. Thus, the graph model of such a CRF is a simple chain
-or a line, which results in the linear chain CRF.
+independences among inputs. The only constraint they impose is that the input
+and output must be linear sequences. Thus, the graph of such a CRF is a simple
+chain or a line, which results in the linear chain CRF.
 
 This operator implements the Forward-Backward algorithm for the linear chain
-CRF. Please see http://www.cs.columbia.edu/~mcollins/fb.pdf for reference.
+CRF. Please see http://www.cs.columbia.edu/~mcollins/fb.pdf and
+http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for reference.
 
 Equation:
 
@@ -111,7 +111,7 @@ likelihood of each training sample in a mini-batch.
 transition features. The emission feature weights are NOT computed in
 this operator. They MUST be computed first before this operator is called.
 
-2. Because this operator performs globally normaliztion over all possible
+2. Because this operator performs global normalization over all possible
 sequences internally, it expects UNSCALED emission feature weights.
 Please do not call this op with the emission feature being output of any
 nonlinear activation.
@@ -171,9 +171,10 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("Alpha", emission_dims);
     ctx->SetOutputDim("EmissionExps", emission_dims);
     ctx->SetOutputDim("TransitionExps", transition_dims);
-    // (TODO caoying) This is tricky. The 1st dimension of Output(LogLikelihood)
+    // TODO(caoying) This is tricky. The 1st dimension of Output(LogLikelihood)
     // is the sequence number in a mini-batch. The dimension set here should be
-    // resized to its correct size in the function Compute.
+    // resized to its correct size in the function Compute. Fix this once we can
+    // get LoD information in the InferShape interface.
     ctx->SetOutputDim("LogLikelihood", {emission_dims[0], 1});
   }
 
@@ -236,7 +237,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
 
  protected:
   // Explicitly set that the data type of output of the linear_chain_crf_grad
-  // operator is determined by its input: graidents of LogLikelihood.
+  // operator is determined by its input: gradients of LogLikelihood.
   framework::DataType IndicateDataType(
       const framework::ExecutionContext& ctx) const override {
     return framework::ToDataType(
diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h
@@ -188,7 +188,6 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
                             const LoDTensor& src, LoDTensor* dst) {
       dst->mutable_data<T>(src.dims(), platform::CPUPlace());
       dst->CopyFrom(src, platform::CPUPlace(), ctx);
-
     };
 
     copyLoDTensor(ctx, emission_weights_src, emission_weights_dst);
@@ -248,7 +247,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
       for (size_t i = 0; i < tag_num; ++i) {
         T sum = 0.;
         for (size_t j = 0; j < tag_num; ++j) {
-          sum += alpha_value[(k - 1) * tag_num + j] *
+          sum += alpha_value[(k - 1) * tag_num + j] *  // (*)
                  w_exps[(j + state_trans_base_idx) * tag_num + i];
         }
         alpha_value[k * tag_num + i] = x_exps[k * tag_num + i] * sum;
@@ -291,7 +290,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
     // These local variables hold the inputs and outputs, garanteeing them on
     // CPU memory, to provide a consistent reference.
     // TODO(caoying) Fix this by moving all these local variables into the
-    // class's data members once we can profile the training process.
+    // class's data members once we can profile the training process, or
+    // implementing a real GPU kernel for CRF.
     Tensor* label = nullptr;
     Tensor label_tensor;
     Tensor* emission_exps = nullptr;
@@ -344,6 +344,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
       transition_grad =
           ctx.Output<Tensor>(framework::GradVarName("Transition"));
     }
+
+    // TODO(caoying) Fix this constraint. When the Input(Emission) is from the
+    // data reader operator, it can have no gradients.
     PADDLE_ENFORCE(emission_grad, "Output(Emission@Grad) should not be null.");
     emission_grad->mutable_data<T>(platform::CPUPlace());
     math::SetConstant<platform::CPUPlace, T>()(ctx.device_context(),
@@ -458,7 +461,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
       for (size_t i = 0; i < tag_num; ++i) {
         T sum = 0.;
         for (size_t j = 0; j < tag_num; ++j) {
-          sum += w_exps[(i + state_trans_base_idx) * tag_num + j] *
+          sum += w_exps[(i + state_trans_base_idx) * tag_num + j] *  // (**)
                  x_exps[(k + 1) * tag_num + j] *
                  beta_value[(k + 1) * tag_num + j];
         }
@@ -493,7 +496,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
 
       auto x_exps_mat = EigenMatrix<T>::From(emission_exps);
 
-      // TODO(caoying): Fix this to avoid using this local variable.
+      // TODO(caoying): Fix this to avoid using this local variable once we can
+      // profile the training process.
       Tensor tmp;
       tmp.mutable_data<T>(beta->dims(), platform::CPUPlace());
       auto tmp_mat = EigenMatrix<T>::From(tmp);
diff --git a/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py b/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
@@ -83,6 +83,9 @@ def crf_forward_compute(self):
 
 class TestLinearChainCrfOp(OpTest):
     def set_test_data(self):
+        # TODO(caoying) Extend this unit test with boundary cases where the
+        # sequence lengths are 1, 2, and 3.
+
         SEQ_NUM = 3
         TAG_NUM = 17
         MAX_SEQ_LEN = 5