Rewrite sequence expand op

wanghaoshuang · wanghaoshuang · commit 296167446c35 · 2017-10-24T14:10:02.000+08:00
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
@@ -112,28 +112,5 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
   lod_ = new_lod;
 }
 
-Vector<size_t> expand_lod(Vector<size_t> level, Vector<size_t> indexes,
-                          Vector<size_t> scales, bool repeat) {
-  Vector<size_t> result;
-  result.push_back(level[0]);
-  size_t start = 0, end = 0;
-  if (!repeat) {
-    for (size_t i = 0; i < scales.size(); ++i) {
-      result.push_back(result.back() + scales[i] * (level[i + 1] - level[i]));
-    }
-  } else {
-    for (size_t i = 0; i < scales.size(); ++i) {
-      start = indexes[i];
-      end = indexes[i + 1];
-      for (size_t j = 0; j < scales[i]; ++j) {
-        for (size_t index = start; index < end - 1; ++index) {
-          result.push_back(result.back() + level[index + 1] - level[index]);
-        }
-      }
-    }
-  }
-  return result;
-}
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
@@ -136,8 +136,5 @@ class LoDTensor : public Tensor {
   LoD lod_;
 };
 
-Vector<size_t> expand_lod(Vector<size_t> level, Vector<size_t> indexes,
-                          Vector<size_t> scales, bool repeat);
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/seq_expand_op.cc
@@ -27,20 +27,14 @@ class SeqExpandOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of SeqExpandOp should not be null.");
-    int repeat = ctx->Attrs().Get<int>("repeat");
-    framework::DDim out_dim;
-    if (repeat == 0) {
-      PADDLE_ENFORCE(
-          ctx->HasInput("Y"),
-          "Input(Y) of SeqExpandOp should not be null while repeat == 0.");
-      out_dim = ctx->GetInputDim("Y");
-      ctx->ShareLoD("Y", "Out");
-    } else {
-      out_dim = ctx->GetInputDim("X");
-      out_dim[0] = out_dim[0] * repeat;
-    }
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of SeqExpandOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("Y"),
+        "Input(Y) of SeqExpandOp should not be null.");
+    framework::DDim out_dim;
+    out_dim = ctx->GetInputDim("Y");
+    ctx->ShareLoD("Y", "Out");
     ctx->SetOutputDim("Out", out_dim);
   }
 };
@@ -50,68 +44,63 @@ class SeqExpandOpMaker : public framework::OpProtoAndCheckerMaker {
   SeqExpandOpMaker(framework::OpProto* proto,
                    framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(
-        "X",
-        "The input('X') of seq_expand op. It can be LoDTensor or base Tensor.");
-    AddInput(
-        "Y",
-        "The reference input('Y') of seq_expand op."
-        "It must be a LoDTensor with k-level(k>0)."
-        "This reference input is essential if 'repeat' attribute is not "
-        "configured."
-        "Input(X) will be expanded by LoD of input(Y) while repeat ==  0.");
+    AddInput("X",
+             "(Tensor or LoDTensor) The input('X') of this operator can be a "
+             "LoDTensor or a base Tensor.");
+    AddInput("Y",
+             "(LoDTensor) The reference input('Y') of seq_expand op. "
+             "It must be a LoDTensor with k-level(k>0). "
+             "Input(X) will be expanded according to LoD of input(Y). "
+             "The number of sequences in the last LoD level of input('Y') "
+             "must be equal to dims[0] of input('X').");
     AddOutput("Out",
               "The output of seq_expand op."
-              "The output is a (k+1)-level LoDTensor"
-              "while input(X) being k-level LoDTensor."
-              "(Given base tensor is 0-level LoDTensor.)");
-    AddAttr<int>("repeat",
-                 "(type:int; default value: 0)"
-                 "Repeatting times of each element while expanding input(X)."
-                 "It works while input(Y) is not configured.")
-        .SetDefault(0);
+              "The lod of output will be as same as input(Y)'s lod.");
     AddComment(R"DOC(
-Expand k-level LoDTensor to (k+1)-level LoDTensor
-by lod of input(Y) or 'repeat' attribute.
+Expand input(X) according to LOD of input(Y).
 
 Case 1:
 
-Given a 2-level LoDTensor X:
-    X.data = [a, b , c, d]
-    X.lod = [[0, 3, 4], [0, 1, 3, 4]]
-and
-    repeat = 2
-then we get 3-level LoDTensor
-    Out.lod = [[0,                6,    8],
-               [0,       3,       6, 7, 8],
-               [0, 1,    3, 4,    6, 7, 8]]
-    Out.data = [a, b, c, a, b, c, d, d]
+Given a 2-level LoDTensor input(X)
+    X.lod = [[0,       2, 3],
+             [0, 1,    3, 4]]
+    X.data = [a, b, c, d]
+    X.dims = [4, 1]
+and input(Y)
+    Y.lod = [[0,    2,    4],
+             [0, 3, 6, 7, 8]]
+then we get 2-level LoDTensor
+    Out.lod = [[0,                2,    4],
+               [0,       3,       6, 7, 8]]
+    Out.data = [a, a, a, b, b, b, c, d]
+    Out.dims = [8, 1]
 
 Case 2:
 
-Given 2-level a LoDTensor X
-    X.data = [1, 2, 3, 4]
-    X.lod = [[0, 3, 4], [0, 1, 3, 4]]
-and
-    Y.lod = [[0, 6, 8],
-             [0, 3, 6, 7, 8],
-             [0,1,3,4,6,7,8]]
-then we get 3-level LoDTensor
-    Out.data = [1, 2, 3, 1, 2, 3, 4, 4]
-    Out.lod = [[0, 6, 8],
-               [0, 3, 6, 7, 8],
-               [0, 1, 3, 4, 6, 7, 8]]
+Given a 0-level LoDTensor input(X)
+    X.data = [a, b, c]
+    X.lod = NULL
+    X.dims = [3, 1]
+and input(Y)
+    Y.lod = [[0, 2, 3, 6]]
+then we get 1-level LoDTensor
+    Out.lod = [[0,    2, 3,      6]]
+    Out.data = [a, a, b, c, c, c]
+    Out.dims = [6, 1]
 
 Case 3:
 
-Given a 0-level LoDTensor X
-    X.data = [1, 2, 3, 4]
+Given a 0-level LoDTensor input(X)
+    X.data = [[a, b], [c, d], [e, f]]
     X.lod = NULL
-and
-    repeat = 2
+    X.dims = [3, 2]
+and input(Y)
+    Y.lod = [[0, 2, 3, 6]]
 then we get 1-level LoDTensor
-    Out.data = [1, 1, 2, 2, 3, 3, 4, 4]
-    Out.lod = [[0, 2, 4, 6, 8]]
+    Out.lod = [[0,           2,     3,                     6]]
+    Out.data = [[a,b], [a,b], [c,d], [e, f], [e, f], [e, f]]
+    Out.dims = [6, 2]
+
 
 )DOC");
   }
diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/seq_expand_op.h
@@ -31,93 +31,28 @@ class SeqExpandKernel : public framework::OpKernel<T> {
     auto* out = context.Output<LoDTensor>("Out");
     const T* x_data = x->data<T>();
     auto x_dims = x->dims();
-    auto x_lod = x->lod();
-
-    framework::Vector<size_t> level;
-    size_t num = (x_lod.size() == 0) ? (x->dims()[0] + 1) : x_lod[0].size();
-    for (int i = 0; i < num; ++i) {
-      level.push_back(i);
-    }
-    x_lod.push_back(level);
-
-    size_t repeat = static_cast<size_t>(context.Attr<int>("repeat"));
-    framework::Vector<size_t> scales;
-    if (repeat != 0) {
-      for (int i = 0; i < x_lod[0].size() - 1; ++i) {
-        scales.push_back(repeat);
-      }
-      std::vector<int64_t> dims = framework::vectorize(x->dims());
-      dims[0] = dims[0] * repeat;
-      auto out_dims = framework::make_ddim(dims);
-      out->Resize(out_dims);
-    } else {
-      auto* y = context.Input<LoDTensor>("Y");
-      auto y_lod = y->lod();
-      auto y_abs_lod = y_lod.ToAbsOffset();
-      auto x_abs_lod = x_lod.ToAbsOffset();
-      for (int i = 0; i < y_abs_lod[0].size() - 1; ++i) {
-        scales.push_back((y_abs_lod[0][i + 1] - y_abs_lod[0][i]) /
-                         (x_abs_lod[0][i + 1] - x_abs_lod[0][i]));
-      }
-      out->Resize(y->dims());
-    }
-
-    framework::Vector<size_t> indexes;
-    for (int size_t i = 0; i < x_lod[0]; ++i) {
-      indexes[i] = x_lod[0];
-    }
-    framework::LoD out_lod;
-    auto level0 = framework::expand_lod(indexes, x_lod[0], scales, false);
-    out_lod.push_back(level0);
-    for (int i = 1; i < x_lod.size(); ++i) {
-      for (int j = 0; j < indexes.size(); ++j) {
-        indexes[j] = x_lod[i - 1][indexes[j]];
-      }
-      out_lod.push_back(framework::expand_lod(x_lod[i], indexes, scales, true));
-    }
-
+    auto* y = context.Input<LoDTensor>("Y");
+    PADDLE_ENFORCE_EQ(x_dims[0], y->lod().back().size() - 1,
+                      "The size of last lod level in Input(Y)"
+                      "must be equal to dims[0] of Input(X).");
+    out->set_lod(y->lod());
+    out->Resize(y->dims());
+    auto place = context.GetEigenDevice<Place>();
     size_t element_len = framework::product(x_dims) / x_dims[0];
     T* out_data = out->mutable_data<T>(context.GetPlace());
-
-    // copy data
-    auto place = context.GetPlace();
-    size_t count = 0;
-    if (platform::is_cpu_place(place)) {
-      auto& cpu_place = boost::get<platform::CPUPlace>(place);
-      for (size_t i = 0; i < scales.size(); ++i) {
-        count = element_len * (x_abs_lod[0][i + 1] - x_abs_lod[0][i]);
-        for (size_t j = 0; j < scales[i]; ++j) {
-          memory::Copy(cpu_place, out_data, cpu_place, x_data,
-                       sizeof(T) * count);
-          out_data += count;
-        }
-        x_data += count;
-      }
-    } else {
-#ifdef PADDLE_WITH_CUDA
-      auto& gpu_place = boost::get<platform::GPUPlace>(place);
-      auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
-                        context.device_context())
-                        .stream();
-      for (size_t i = 0; i < scales.size(); ++i) {
-        count = element_len * (x_abs_lod[0][i + 1] - x_abs_lod[0][i]);
-        for (size_t j = 0; j < scales[i]; ++j) {
-          memory::Copy(gpu_place, out_data, gpu_place, x_data,
-                       sizeof(T) * count, stream);
-          out_data += count;
-        }
-        x_data += count;
-      }
-#else
-      PADDLE_THROW("Paddle is not compiled with GPU");
-#endif
-    }
-
-    out->set_lod(out_lod);
-    for (size_t i = 0; i < lod.size; i++) {
-      for (size_t j = 0; j < lod[i].size(); j++) {
-        LOG(INFO) << "lod[" << i << "][" << j "] = " << lod[i][j];
-      }
+    auto out_starts = out->lod().back();
+
+    for (size_t i = 0; i < out_starts.size() - 1; i++) {
+      int scale = out_starts[i + 1] - out_starts[i];
+      Eigen::TensorMap<
+          Eigen::Tensor<const T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
+          x_t(x_data, 1, element_len);
+      Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
+          out_t(out_data, scale, element_len);
+      Eigen::array<int, 2> cast({scale, 1});
+      out_t.device(place) = x_t.broadcast(cast);
+      x_data += element_len;
+      out_data += element_len * scale;
     }
   }
 };
@@ -130,25 +65,24 @@ class SeqExpandGradKernel : public framework::OpKernel<T> {
     auto* x = context.Input<LoDTensor>("X");
     auto* out = context.Input<LoDTensor>("Out");
     auto* d_x = context.Output<LoDTensor>(framework::GradVarName("X"));
-    auto out_lod = out->lod();
-    auto out_abs_lod = out_lod.ToAbsOffset();
+    auto out_last_level = out->lod().back();
     d_x->set_lod(x->lod());
     const T* d_out_data = d_out->data<T>();
     auto d_out_dims = d_out->dims();
     T* d_x_data = d_x->mutable_data<T>(context.GetPlace());
     size_t element_len = framework::product(d_out_dims) / d_out_dims[0];
-    for (size_t i = 0; i < out->NumElements(); ++i) {
-      size_t ele_count = out_abs_lod[0][i + 1] - out_abs_lod[0][i];
-      size_t repeat = out->NumElements(0, i);
-      Eigen::TensorMap<Eigen::Tensor<const T, 2>> d_out_t(
-          d_out_data, static_cast<int>(repeat),
-          static_cast<int>((ele_count * element_len) / repeat));
-      Eigen::TensorMap<Eigen::Tensor<T, 1>> d_x_t(
-          d_x_data, static_cast<int>((ele_count * element_len) / repeat));
+
+    for (size_t i = 0; i < out_last_level.size() - 1; ++i) {
+      size_t repeat = out_last_level[i + 1] - out_last_level[i];
+      Eigen::TensorMap<
+          Eigen::Tensor<const T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
+      d_out_t(d_out_data, static_cast<int>(repeat), element_len);
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>
+      d_x_t(d_x_data, static_cast<int>(element_len));
       auto place = context.GetEigenDevice<Place>();
       d_x_t.device(place) = d_out_t.sum(Eigen::array<int, 1>({{0}}));
-      d_out_data += (ele_count * element_len);
-      d_x_data += ((ele_count * element_len) / repeat);
+      d_out_data += (repeat * element_len);
+      d_x_data += element_len;
     }
   }
 };
diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py
@@ -246,8 +246,6 @@ def check_output_with_place(self, place, atol):
             else:
                 actual = np.array(self.scope.find_var(out_name).get_tensor())
                 expect = self.outputs[out_name]
-                print "actual= %s" % actual
-                print "expect = %s" % expect
                 self.assertTrue(
                     np.allclose(
                         actual, expect, atol=atol),
diff --git a/python/paddle/v2/framework/tests/test_seq_expand.py b/python/paddle/v2/framework/tests/test_seq_expand.py