// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <thrust/device_vector.h>
#include "paddle/fluid/framework/array.h"
#include "paddle/fluid/operators/stack_op.h"

namespace paddle {
namespace operators {

template <typename T, typename VecXType>
__global__ void StackCUDAKernel(VecXType x, T* y, int total_num, int n,
                                int post) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < total_num) {
    // Decompose the flat output index: idx = i * (n * post) + which_x * post +
    // (idx % post). Here i runs over the leading (pre) dimensions, which_x
    // picks the input tensor, and x_index is the flat offset into that input.
    int i = idx / (n * post);
    int which_x = idx / post - i * n;
    int x_index = i * post + idx % post;
    y[idx] = x[which_x][x_index];
  }
}
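
// Worked example of the index math above (illustrative values, not from the
// original file): stacking n = 3 tensors of shape [2, 4] along axis 1 gives
// pre = 2, post = 4, total_num = 24. For output index idx = 13:
//   i       = 13 / (3 * 4)   = 1
//   which_x = 13 / 4 - 1 * 3 = 0
//   x_index = 1 * 4 + 13 % 4 = 5
// so y[13] = x[0][5], i.e. output[1][0][1] = input0[1][1].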

template <typename T, typename VecDxType>
__global__ void StackGradCUDAKernel(VecDxType dx, const T* dy, int total_num,
                                    int n, int post) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < total_num) {
    // Inverse of StackCUDAKernel: scatter each upstream gradient element back
    // into the input-gradient tensor it came from, using the same index
    // decomposition.
    int i = idx / (n * post);
    int which_x = idx / post - i * n;
    int x_index = i * post + idx % post;
    dx[which_x][x_index] = dy[idx];
  }
}

struct GPUStackFunctor {
  template <typename DeviceContext, typename T>
  void operator()(const DeviceContext& ctx, const std::vector<const T*>& x,
                  T* y, int pre, int n, int post) const {
    int total_num = pre * post * n;
    int threads = 512;
    int grid = (total_num + threads - 1) / threads;

    constexpr int kMaxThreshold = 16;
    if (n <= kMaxThreshold) {
      // Small stacks: pass the input pointers by value in a fixed-size array,
      // so they travel with the kernel launch and need no extra device copy.
      framework::Array<const T*, kMaxThreshold> arr;
      for (int i = 0; i < n; ++i) arr[i] = x[i];
      StackCUDAKernel<<<grid, threads, 0, ctx.stream()>>>(arr, y, total_num, n,
                                                          post);
    } else {
      VLOG(10) << "Stacking more than " << kMaxThreshold
               << " tensors may be slow on GPU.";
      // Large stacks: stage the pointer table in device memory instead.
      thrust::device_vector<const T*> dev_x(x);
      StackCUDAKernel<<<grid, threads, 0, ctx.stream()>>>(dev_x.data().get(),
                                                          y, total_num, n,
                                                          post);
    }
  }
};
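
// Design note: the two paths trade off differently. The framework::Array path
// embeds the pointer table in the kernel's argument buffer at launch, while
// the thrust::device_vector fallback allocates device memory and copies the n
// pointers host-to-device on every call, which is why larger stacks are
// flagged as potentially slow. A rough sketch of how a caller might invoke
// the functor (hypothetical names, not from this file):
//
//   std::vector<const float*> x_ptrs = /* device pointers to the n inputs */;
//   GPUStackFunctor()(dev_ctx, x_ptrs, y_ptr, pre, n, post);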

struct GPUStackGradFunctor {
  template <typename DeviceContext, typename T>
  void operator()(const DeviceContext& ctx, std::vector<T*>& dx,  // NOLINT
                  const T* dy, int pre, int n, int post) const {
    int total_num = pre * post * n;
    int threads = 512;
    int grid = (total_num + threads - 1) / threads;

    constexpr int kMaxThreshold = 16;
    if (n <= kMaxThreshold) {
      framework::Array<T*, kMaxThreshold> arr;
      for (int i = 0; i < n; ++i) arr[i] = dx[i];
      StackGradCUDAKernel<<<grid, threads, 0, ctx.stream()>>>(
          arr, dy, total_num, n, post);
    } else {
      VLOG(10) << "Stacking more than " << kMaxThreshold
               << " tensors may be slow on GPU.";
      thrust::device_vector<T*> dev_dx(dx);
      StackGradCUDAKernel<<<grid, threads, 0, ctx.stream()>>>(
          dev_dx.data().get(), dy, total_num, n, post);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace plat = paddle::platform;
namespace ops = paddle::operators;

REGISTER_OP_CUDA_KERNEL(
    stack,
    ops::StackKernel<plat::CUDADeviceContext, float, ops::GPUStackFunctor>,
    ops::StackKernel<plat::CUDADeviceContext, double, ops::GPUStackFunctor>);

REGISTER_OP_CUDA_KERNEL(stack_grad,
                        ops::StackGradKernel<plat::CUDADeviceContext, float,
                                             ops::GPUStackGradFunctor>,
                        ops::StackGradKernel<plat::CUDADeviceContext, double,
                                             ops::GPUStackGradFunctor>);