PaddlePaddle
diff --git a/‎paddle/fluid/API.spec
Lines changed: 1 addition & 0 deletions b/‎paddle/fluid/API.spec
Lines changed: 1 addition & 0 deletions
diff --git a/‎paddle/fluid/operators/sequence_expand_as_op.cc
Lines changed: 168 additions & 0 deletions b/‎paddle/fluid/operators/sequence_expand_as_op.cc
Lines changed: 168 additions & 0 deletions
diff --git a/‎paddle/fluid/operators/sequence_expand_as_op.cu
Lines changed: 134 additions & 0 deletions b/‎paddle/fluid/operators/sequence_expand_as_op.cu
Lines changed: 134 additions & 0 deletions
@@ -116,6 +116,7 @@ paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size
 paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
 paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
 paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
+paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
 paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
 
@@ -0,0 +1,168 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/sequence_expand_as_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::LoDTensor;
+
+// Operator definition for sequence_expand_as: checks that X, Y and Out are
+// present and derives the shape and LoD of Output(Out).
+class SequenceExpandAsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  // Out copies the dims of X except for the first one. At runtime the first
+  // dim is derived from the single lod level of Y; at compile time it is left
+  // as -1. Out shares the LoD of Y.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceExpandAsOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of SequenceExpandAsOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceExpandAsOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto out_dims = x_dims;
+
+    PADDLE_ENFORCE_GE(x_dims.size(), 2,
+                      "Dimension number of Input(X) should be at least 2.");
+
+    if (!ctx->IsRuntime()) {
+      // The lod of Y is not inspected outside of runtime, so the number of
+      // output rows stays unknown.
+      out_dims[0] = -1;
+    } else {
+      framework::Variable* x_var =
+          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]);
+      framework::Variable* y_var =
+          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Y")[0]);
+
+      auto& x_dim = x_var->Get<LoDTensor>().dims();
+      auto& y_lod = y_var->Get<LoDTensor>().lod();
+
+      PADDLE_ENFORCE_EQ(y_lod.size(), 1,
+                        "Level number of Input(Y)'s lod should be 1.");
+
+      // Only valid to index level 0 after the size check above.
+      const auto& ref_level = y_lod[0];
+
+      PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dim[0]), ref_level.size() - 1,
+                        "The first dimension of Input(X) should be equal "
+                        "to the size of Input(Y)'s 0 level lod.");
+
+      int64_t out_first_dim = x_dims[0];
+      if (ref_level.size() > 1) {
+        // The sum of all consecutive offset differences telescopes to
+        // (last offset - first offset).
+        out_first_dim = ref_level[ref_level.size() - 1] - ref_level[0];
+      }
+      out_dims[0] = out_first_dim;
+    }
+
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->ShareLoD("Y", /*->*/ "Out");
+  }
+};
+
+// Declares the inputs, the output and the user-facing documentation of the
+// sequence_expand_as operator. The operator takes no attributes, so the
+// documentation must not mention a `ref_level` (that belongs to
+// sequence_expand).
+class SequenceExpandAsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor whose lod "
+             "level is at most 1.");
+    AddInput("Y",
+             "(LoDTensor, default LoDTensor<float>) Referred LoDTensor whose "
+             "zeroth level lod is referred by Input(X).");
+    AddOutput("Out",
+              "(LoDTensor, default LoDTensor<float>) Output LoDTensor which is "
+              "generated from Input(X) by referring lod of Input(Y).");
+    AddComment(R"DOC(
+Sequence Expand As Operator.
+
+This operator expands `X` according to the zeroth level lod of `Y`. Current
+implementation requires the level number of Input(Y)'s lod should be 1, and
+the first dimension of Input(X) should be equal to the size of Input(Y)'s zeroth
+level lod, and lod of Input(X) is not considered.
+
+Following are cases to better explain how this works:
+
+Case 1:
+
+Given a 1-level LoDTensor input(X)
+    X.data = [[a], [b], [c], [d]]
+    X.dims = [4, 1]
+and input(Y)
+    Y.lod = [[0, 3, 6, 7, 8]]
+then we get 1-level LoDTensor
+    Out.lod =  [[0,            3,              6,  7,  8]]
+    Out.data = [[a], [a], [a], [b], [b], [b], [c], [d]]
+    Out.dims = [8, 1]
+
+Case 2:
+
+Given a common Tensor input(X)
+    X.data = [[a, b], [c, d], [e, f]]
+    X.dims = [3, 2]
+and input(Y)
+    Y.lod = [[0, 2, 3, 6]]
+then we get a 1-level LoDTensor
+    Out.lod =  [[0,              2,      3,                      6]]
+    Out.data = [[a, b], [a, b], [c, d], [e, f], [e, f], [e, f]]
+    Out.dims = [6, 2]
+
+)DOC");
+  }
+};
+
+// Gradient operator definition for sequence_expand_as: the gradient of X has
+// the same dims and LoD as X itself.
+class SequenceExpandAsOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  // Requires X, Out and Out@GRAD as inputs. When X@GRAD is requested as an
+  // output, it takes the dims of X and shares the LoD of X.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+
+    auto x_shape = ctx->GetInputDim("X");
+    auto dx_name = framework::GradVarName("X");
+
+    // Nothing to infer when the gradient of X is not an output.
+    if (!ctx->HasOutput(dx_name)) {
+      return;
+    }
+    ctx->SetOutputDim(dx_name, x_shape);
+    ctx->ShareLoD("X", dx_name);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // short alias for the registrations below
+REGISTER_OPERATOR(sequence_expand_as, ops::SequenceExpandAsOp,  // forward op
+                  ops::SequenceExpandAsOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sequence_expand_as_grad, ops::SequenceExpandAsOpGrad);  // backward op
+REGISTER_OP_CPU_KERNEL(
+    sequence_expand_as,  // forward CPU kernels for float, double, int, int64_t
+    ops::SequenceExpandAsKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceExpandAsKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SequenceExpandAsKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SequenceExpandAsKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_expand_as_grad,  // backward CPU kernels for the same four types
+    ops::SequenceExpandAsGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceExpandAsGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SequenceExpandAsGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SequenceExpandAsGradKernel<paddle::platform::CPUDeviceContext,
+                                    int64_t>);
@@ -0,0 +1,134 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include "paddle/fluid/operators/sequence_expand_as_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+
+// Forward kernel: copies row `h_id` of `in_data` into output rows
+// [expand_offset[h_id], expand_offset[h_id + 1]).
+//
+// Blocks stride over the `src_hight` input rows by gridDim.x and the threads
+// of a block stride over the `src_widht` columns by blockDim.x.
+// `expand_offset` must hold at least `src_hight + 1` entries.
+//
+// All index arithmetic is done in size_t: the previous `int` offsets could
+// overflow once the output holds more than INT_MAX elements, and mixed
+// signed/unsigned comparisons against the size_t bounds.
+template <typename T>
+static __global__ void sequence_expand_as_kernel(const T *in_data,
+                                                 const size_t *expand_offset,
+                                                 const size_t src_hight,
+                                                 const size_t src_widht,
+                                                 T *out_data) {
+  for (size_t h_id = blockIdx.x; h_id < src_hight; h_id += gridDim.x) {
+    const size_t row_begin = expand_offset[h_id];
+    const size_t row_end = expand_offset[h_id + 1];
+    // Skip rows that are not expanded at all. `<=` keeps the old behaviour
+    // of ignoring a non-increasing offset pair instead of wrapping around.
+    if (row_end <= row_begin) continue;
+    const size_t span = row_end - row_begin;
+
+    const T *src = in_data + h_id * src_widht;
+    // The destination base does not depend on the column, so compute it once.
+    T *dst = out_data + row_begin * src_widht;
+    for (size_t w_id = threadIdx.x; w_id < src_widht; w_id += blockDim.x) {
+      const T ele = src[w_id];
+      for (size_t k = 0; k < span; ++k) {
+        dst[k * src_widht + w_id] = ele;
+      }
+    }
+  }
+}
+
+// Backward kernel: row `h_id` of `dx_data` receives the column-wise sum of
+// the `dout_data` rows in [expand_offset[h_id], expand_offset[h_id + 1]).
+// A row with an empty range is written as zeros.
+//
+// Blocks stride over the `dst_hight` rows of dX by gridDim.x and the threads
+// of a block stride over the `dst_width` columns by blockDim.x.
+// `expand_offset` must hold at least `dst_hight + 1` entries.
+//
+// All index arithmetic is done in size_t: the previous `int` offsets could
+// overflow once dOut holds more than INT_MAX elements.
+template <typename T>
+static __global__ void sequence_expand_as_grad_kernel(
+    const T *dout_data, const size_t *expand_offset, const size_t dst_hight,
+    const size_t dst_width, T *dx_data) {
+  for (size_t h_id = blockIdx.x; h_id < dst_hight; h_id += gridDim.x) {
+    T *dst = dx_data + h_id * dst_width;
+    const size_t row_begin = expand_offset[h_id];
+    const size_t row_end = expand_offset[h_id + 1];
+
+    for (size_t w_id = threadIdx.x; w_id < dst_width; w_id += blockDim.x) {
+      T result = 0;
+      // Runs zero times when row_end <= row_begin, leaving result at 0.
+      for (size_t row = row_begin; row < row_end; ++row) {
+        result += dout_data[row * dst_width + w_id];
+      }
+      dst[w_id] = result;
+    }
+  }
+}
+
+// CUDA specialization of the forward functor: launches
+// sequence_expand_as_kernel on the context's stream to expand the rows of
+// `x` into `out` according to the offsets in `ref_lod`.
+template <typename T>
+struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
+  void operator()(
+      const platform::CUDADeviceContext &context, const LoDTensor &x,
+      const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
+      LoDTensor *out) {
+    // Allocate the output first so it is backed by memory even when the
+    // early return below is taken.
+    T *out_data = out->mutable_data<T>(context.GetPlace());
+
+    const int hight = x.dims()[0];
+    // No rows to expand; also avoids dividing by zero when computing width.
+    if (hight == 0) return;
+    const int width = framework::product(x.dims()) / hight;
+    // A zero width would give thread_x == 0, i.e. a division by zero below
+    // and a zero-sized block in the launch configuration.
+    if (width == 0) return;
+
+    const int kThreadsPerBlock = 1024;
+    int thread_x = kThreadsPerBlock;
+    if (width < kThreadsPerBlock) {  // block_cols is aligned by 32.
+      thread_x = ((width + 31) >> 5) << 5;
+    }
+
+    // Use as many blocks as fit into the device's physical thread count,
+    // but at least one.
+    const int max_threads = context.GetMaxPhysicalThreadCount();
+    const int block_x = std::max(max_threads / thread_x, 1);
+
+    dim3 block_size(thread_x);
+    dim3 grid_size(block_x);
+    sequence_expand_as_kernel<<<grid_size, block_size, 0, context.stream()>>>(
+        x.data<T>(), ref_lod.CUDAData(context.GetPlace()), hight, width,
+        out_data);
+  }
+};
+
+// CUDA specialization of the backward functor: launches
+// sequence_expand_as_grad_kernel on the context's stream to reduce the rows
+// of `dout` into `dx` according to the offsets in `ref_lod`.
+template <typename T>
+struct SequenceExpandAsGradFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext &context,
+                  const LoDTensor &dout,
+                  const framework::Vector<size_t> &ref_lod, /*expand based lod*/
+                  LoDTensor *dx) {
+    // Allocate the gradient first so it is backed by memory even when the
+    // early return below is taken.
+    T *dx_data = dx->mutable_data<T>(context.GetPlace());
+
+    const int hight = dx->dims()[0];
+    // No rows to fill; also avoids dividing by zero when computing width.
+    if (hight == 0) return;
+    const int width = framework::product(dx->dims()) / hight;
+    // A zero width would give thread_x == 0, i.e. a division by zero below
+    // and a zero-sized block in the launch configuration.
+    if (width == 0) return;
+
+    const int kThreadsPerBlock = 1024;
+    int thread_x = kThreadsPerBlock;
+    if (width < kThreadsPerBlock) {  // block_cols is aligned by 32.
+      thread_x = ((width + 31) >> 5) << 5;
+    }
+
+    // Use as many blocks as fit into the device's physical thread count,
+    // but at least one.
+    const int max_threads = context.GetMaxPhysicalThreadCount();
+    const int block_x = std::max(max_threads / thread_x, 1);
+
+    dim3 block_size(thread_x);
+    dim3 grid_size(block_x);
+    sequence_expand_as_grad_kernel<<<grid_size, block_size, 0,
+                                     context.stream()>>>(
+        dout.data<T>(), ref_lod.CUDAData(context.GetPlace()), hight, width,
+        dx_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // short alias for the registrations below
+REGISTER_OP_CUDA_KERNEL(
+    sequence_expand_as,  // forward CUDA kernels for float, double, int, int64_t
+    ops::SequenceExpandAsKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceExpandAsKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SequenceExpandAsKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SequenceExpandAsKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    sequence_expand_as_grad,  // backward CUDA kernels for the same four types
+    ops::SequenceExpandAsGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceExpandAsGradKernel<paddle::platform::CUDADeviceContext,
+                                    double>,
+    ops::SequenceExpandAsGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SequenceExpandAsGradKernel<paddle::platform::CUDADeviceContext,
+                                    int64_t>);