Commit f2ae7e3

Author: Yibing Liu

Support seq len equal to 0 in sequence ops (#16964)

* Merge conflicts with 1.4
* Resolve conflicts
* Fix some comments test=release/1.4
* Fix py3 error test=release/1.4
1 parent 5ab96d3 commit f2ae7e3

28 files changed (+390, -92 lines)
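The whole commit revolves around one fact about LoDTensor: sequences are stored back to back in a single tensor, and the LoD records their boundaries, so a sequence of length 0 shows up either as a 0 entry in the length-based form used by the Python tests or as two equal adjacent offsets in the offset-based form the C++ kernels iterate over. A plain-Python sketch of that correspondence (illustrative only, no Paddle APIs):

# Length-based LoD, as written in the new tests, converted to the
# offset-based form that the kernels below actually walk over.
length_lod = [1, 0, 0, 4]
offset_lod = [0]
for n in length_lod:
    offset_lod.append(offset_lod[-1] + n)
print(offset_lod)   # [0, 1, 1, 1, 5]: equal adjacent offsets mark empty sequences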

paddle/fluid/framework/lod_tensor.cc

Lines changed: 2 additions & 2 deletions
@@ -158,7 +158,7 @@ bool CheckLoD(const LoD &in, int tensor_height) {
     if (level.size() < 2) return false;
     // check: the first offset(the begin offset) of each level should be 0.
     if (level.front() != 0) return false;
-    // check: all the offsets in a level should be ascending(allow same items)
+    // check: all the offsets in a level should be non-descending
     if (!std::is_sorted(level.begin(), level.end())) {
       return false;
     }
@@ -182,7 +182,7 @@ bool CheckAbsLoD(const LoD &in, int tensor_height) {
   if (in.empty()) return true;
   for (const auto &level : in) {
     // check: all the offsets in a level should be ascending(no same items
-    // allows).
+    // allowed).
     if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) {
       if (a < b) return true;
       return false;

paddle/fluid/framework/lod_tensor.h

Lines changed: 2 additions & 2 deletions
@@ -79,7 +79,7 @@ bool operator==(const LoD& a, const LoD& b);
  *
  * It will check two things:
  *
- * 1. all the offsets in a level should be ascending(no same items allows).
+ * 1. all the offsets in a level should be non-descending.
  * 2. there should be more than 2 offsets existing in each level.
  * 3. the higher level's last offset should equals the lower level's size-1.
  * 4. the first offset(the begin offset) of each level should be 0.
@@ -95,7 +95,7 @@ bool CheckLoD(const LoD& in, int tensor_height = -1);
  * - Empty lod is treated as valid.
  *
  * It will check two things:
- * 1. all the offsets in a level should be ascending(no same items allows)
+ * 1. all the offsets in a level should be ascending(no same items allowed).
  * 2. there should be more than 2 offsets existing in each level.
  * 3. the first offset of each level should be 0, and the last should be the
  *    same(the height of underlying tensor) or `tensor_height` if
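With empty sequences allowed, the CheckLoD contract only asks for non-descending offsets, while CheckAbsLoD keeps the strict ascending rule. A rough Python restatement of the relaxed per-level check (illustrative, not Paddle's API):

def check_lod_level(level):
    # At least two offsets, the first one 0, and offsets non-descending;
    # equal neighbours (zero-length sequences) are now acceptable.
    if len(level) < 2 or level[0] != 0:
        return False
    return all(a <= b for a, b in zip(level, level[1:]))

assert check_lod_level([0, 1, 1, 1, 5])       # empty sequences pass
assert not check_lod_level([0, 2, 1, 5])      # descending offsets fail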

paddle/fluid/operators/crf_decoding_op.h

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
     math::SetConstant<DeviceContext, int64_t>()(
         ctx.template device_context<DeviceContext>(), decoded_path, 0);
     for (size_t i = 0; i < seq_num; ++i) {
+      if (lod[level][i] == lod[level][i + 1]) continue;
       int start_pos = static_cast<int>(lod[level][i]);
       int end_pos = static_cast<int>(lod[level][i + 1]);
       Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos);
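This one-line guard is the pattern repeated in most of the kernels below: compare the adjacent LoD offsets before slicing, and skip the iteration when they are equal, because slicing an empty range is exactly what used to fail. Schematically, in plain Python rather than the actual kernel code:

def for_each_nonempty_sequence(lod0, fn):
    # lod0: offset-based level-0 LoD; fn receives [start, end) row ranges.
    for i in range(len(lod0) - 1):
        if lod0[i] == lod0[i + 1]:    # zero-length sequence: nothing to slice
            continue
        fn(lod0[i], lod0[i + 1])

for_each_nonempty_sequence([0, 1, 1, 1, 5], lambda s, e: print(s, e))
# prints "0 1" and "1 5"; the two empty sequences are skipped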

paddle/fluid/operators/math/context_project.h

Lines changed: 8 additions & 0 deletions
@@ -104,6 +104,8 @@ class ContextProjectFunctor {
     sequence_width = in.dims()[1];

     for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
+      if (lod_level_0[i] == lod_level_0[i + 1]) continue;
+
       input_row_begin = (context_start > 0)
                             ? static_cast<int>(lod_level_0[i]) + context_start
                             : static_cast<int>(lod_level_0[i]);
@@ -134,6 +136,8 @@ class ContextProjectFunctor {
     if (padding_trainable) {
       PADDLE_ENFORCE_NOT_NULL(padding_data);
       for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
+        if (lod_level_0[i] == lod_level_0[i + 1]) continue;
+
         Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
                                   static_cast<int>(lod_level_0[i + 1]));

@@ -216,6 +220,8 @@ class ContextProjectGradFunctor {

     if (input_grad) {
       for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
+        if (lod_level_0[i] == lod_level_0[i + 1]) continue;
+
         input_row_begin = (context_start > 0)
                               ? static_cast<int>(lod_level_0[i]) + context_start
                               : static_cast<int>(lod_level_0[i]);
@@ -248,6 +254,8 @@ class ContextProjectGradFunctor {
     if (pad_grad) {
       if (padding_trainable) {
         for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
+          if (lod_level_0[i] == lod_level_0[i + 1]) continue;
+
           Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
                                     static_cast<int>(lod_level_0[i + 1]));


paddle/fluid/operators/math/detail/lstm_gpu_kernel.h

Lines changed: 3 additions & 3 deletions
@@ -197,9 +197,9 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
     threads = dim3(frame_per_block, 1);
     grid = dim3(frame_blocks, 1);
   } else {
-    /* frame_per_block = 32 batch_per_block = 32 */
-    threads = dim3(32, 32);
-    grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
+    /* frame_per_block = 32 batch_per_block = 16 */
+    threads = dim3(32, 16);
+    grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16);
   }

   auto stream =
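The LSTM GPU change is independent of the empty-sequence work: it halves the threads per block from 32x32 (1024 threads) to 32x16 (512 threads) and keeps the usual ceiling-division grid sizing so every frame and batch element is still covered. For concreteness (example sizes are made up):

# Ceiling-division grid sizing matching the new dim3(32, 16) block shape.
frame_size, batch_size = 100, 20
grid_x = (frame_size + 32 - 1) // 32    # 4 blocks along the frame dimension
grid_y = (batch_size + 16 - 1) // 16    # 2 blocks along the batch dimension
print(grid_x, grid_y)                   # 4 2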

paddle/fluid/operators/sequence_ops/sequence_concat_op.h

Lines changed: 5 additions & 1 deletion
@@ -34,7 +34,9 @@ inline framework::LoD ConcatLoD(const Container &xs,
     for (size_t j = 0; j < xs.size(); ++j) {
       auto &x_lod = xs[j].get().lod()[0];
       const framework::Tensor &tensor = xs[j].get();
-      xs_in_order->emplace_back(tensor.Slice(x_lod[i - 1], x_lod[i]));
+      if (x_lod[i - 1] < x_lod[i]) {
+        xs_in_order->emplace_back(tensor.Slice(x_lod[i - 1], x_lod[i]));
+      }
       sum += x_lod[i];
     }
     result[i] = sum;
@@ -97,6 +99,8 @@ class SeqConcatGradKernel : public framework::OpKernel<T> {
       const framework::LoDTensor *x = xs[j];
       framework::LoDTensor *dx = dxs[j];
       auto &x_lod = x->lod()[0];
+      if (x_lod[i - 1] == x_lod[i]) continue;
+
       sliced_x.emplace_back(x->Slice(x_lod[i - 1], x_lod[i]));
       if (dx != nullptr) {
         sliced_dx.emplace_back(dx->Slice(x_lod[i - 1], x_lod[i]));
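ConcatLoD interleaves the i-th sequence of every input and accumulates output offsets; with the new guard, an input whose i-th sequence is empty contributes no slice but its offset still feeds the running sum, so the output LoD stays consistent. A small Python sketch of that bookkeeping (illustrative, not the operator's code):

def concat_lod(x_lods):
    # x_lods: offset-based level-0 LoD of each input, same sequence count.
    # Output offset i is the sum of every input's offset i, i.e. the rows
    # contributed by output sequences 0..i-1; empty pieces are not sliced.
    num = len(x_lods[0])
    result = [0] * num
    pieces = []               # (input index, start, end) of non-empty slices
    for i in range(1, num):
        total = 0
        for j, lod in enumerate(x_lods):
            if lod[i - 1] < lod[i]:
                pieces.append((j, lod[i - 1], lod[i]))
            total += lod[i]
        result[i] = total
    return result, pieces

print(concat_lod([[0, 2, 2, 5], [0, 0, 1, 3]]))
# ([0, 2, 3, 8], [(0, 0, 2), (1, 0, 1), (0, 2, 5), (1, 1, 3)])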

paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h

Lines changed: 2 additions & 0 deletions
@@ -47,8 +47,10 @@ class SequenceEnumerateKernel : public framework::OpKernel<T> {
     out->set_lod(in->lod());
     auto out_data = out->mutable_data<T>(context.GetPlace());
     for (size_t i = 0; i < lod0.size() - 1; ++i) {
+      if (lod0[i] == lod0[i + 1]) continue;
       int start = lod0[i];
       int end = lod0[i + 1];
+
       int copy_size = win_size < end - start + 1 ? win_size : end - start + 1;
       int mid = end + 1 - copy_size;
       int pad_num = win_size - copy_size;

paddle/fluid/operators/sequence_ops/sequence_expand_op.h

Lines changed: 1 addition & 0 deletions
@@ -160,6 +160,7 @@ struct SequenceExpandGradFunctor<platform::CPUDeviceContext, T> {
       int x_start = x_lod[i - 1];
       int x_end = x_lod[i];
       int x_seq_len = x_end - x_start;
+      if (x_seq_len == 0) continue;
       auto dx_sub = dx->Slice(x_start, x_end);
       dx_sub.Resize(flatten_to_1d(dx_sub.dims()));
       int dout_end = dout_offset + repeat_num * x_seq_len;

paddle/fluid/operators/sequence_ops/sequence_slice_op.h

Lines changed: 5 additions & 3 deletions
@@ -76,9 +76,9 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {

     for (size_t i = 0; i < n; ++i) {
       PADDLE_ENFORCE_LE(0, offset_data[i],
-                        "The offset[%d] must greater than zero.", i);
-      PADDLE_ENFORCE_LT(0, length_data[i],
-                        "The length[%d] must greater than zero.", i);
+                        "The offset[%d] must be nonnegative.", i);
+      PADDLE_ENFORCE_LE(0, length_data[i],
+                        "The length[%d] must be nonnegative.", i);
       PADDLE_ENFORCE_LE(lod[0][i] + offset_data[i] + length_data[i],
                         lod[0][i + 1], "The target tensor's length overflow.");
     }
@@ -95,6 +95,7 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {

     size_t out_offset = 0;
     for (size_t i = 0; i < n; ++i) {
+      if (length_data[i] == 0) continue;
       Tensor in_t = in->Slice(
           static_cast<int>(lod[0][i] + offset_data[i]),
           static_cast<int>(lod[0][i] + offset_data[i] + length_data[i]));
@@ -144,6 +145,7 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
                        static_cast<T>(0));

     for (size_t i = 0; i < out_lod[0].size() - 1; ++i) {
+      if (length_data[i] == 0) continue;
       Tensor out_grad_t =
           out_grad->Slice(static_cast<int>(out_lod[0][i]),
                           static_cast<int>(out_lod[0][i + 1]));

python/paddle/fluid/tests/unittests/test_crf_decoding_op.py

Lines changed: 17 additions & 4 deletions
@@ -128,12 +128,15 @@ class TestCRFDecodingOp2(OpTest):
     ground truth being given.
     """

+    def init_lod(self):
+        self.lod = [[1, 2, 3, 4]]
+
     def setUp(self):
         self.op_type = "crf_decoding"
         TAG_NUM = 5

-        lod = [[1, 2, 3, 4]]
-        total_len = sum(lod[-1])
+        self.init_lod()
+        total_len = sum(self.lod[-1])
         transition = np.repeat(
             np.arange(
                 TAG_NUM, dtype="float64").reshape(1, TAG_NUM),
@@ -152,9 +155,9 @@ def setUp(self):
         expected_output = (labels == predicted_labels).astype("int64")

         self.inputs = {
-            "Emission": (emission, lod),
+            "Emission": (emission, self.lod),
             "Transition": transition,
-            "Label": (labels, lod)
+            "Label": (labels, self.lod)
         }

         self.outputs = {"ViterbiPath": expected_output}
@@ -163,5 +166,15 @@ def test_check_output(self):
         self.check_output()


+class TestCRFDecodingOp3(TestCRFDecodingOp2):
+    def init_lod(self):
+        self.lod = [[1, 0, 0, 4]]
+
+
+class TestCRFDecodingOp4(TestCRFDecodingOp2):
+    def init_lod(self):
+        self.lod = [[0, 2, 3, 0]]
+
+
 if __name__ == "__main__":
     unittest.main()
