PaddlePaddle
diff --git a/paddle/fluid/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc
Lines changed: 33 additions & 18 deletions
diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h
Lines changed: 59 additions & 40 deletions
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/reduce_op.h"
 
+#include <algorithm>
 #include <string>
 #include <vector>
 
@@ -34,11 +35,20 @@ class ReduceOp : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputDim("X");
     auto x_rank = x_dims.size();
     PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
-    int dim = ctx->Attrs().Get<int>("dim");
-    if (dim < 0) dim = x_rank + dim;
-    PADDLE_ENFORCE_LT(
-        dim, x_rank,
-        "The dim should be in the range [-rank(input), rank(input)).");
+    // Work on a copy of the attribute so entries can be normalized in place.
+    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
+    for (size_t i = 0; i < dims.size(); ++i) {
+      // Check the lower bound as well: an entry below -rank(input) would stay
+      // negative after normalization and index out of bounds further down.
+      PADDLE_ENFORCE_GE(
+          dims[i], -x_rank,
+          "The dim should be in the range [-rank(input), rank(input)).");
+      PADDLE_ENFORCE_LT(
+          dims[i], x_rank,
+          "The dim should be in the range [-rank(input), rank(input)).");
+      if (dims[i] < 0) dims[i] = x_rank + dims[i];
+    }
+    std::sort(dims.begin(), dims.end());
     bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
     bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
     if (reduce_all) {
@@ -49,14 +53,31 @@ class ReduceOp : public framework::OperatorWithKernel {
         ctx->SetOutputDim("Out", {1});
     } else {
+      // dims[0] is read below, so an empty list has to be rejected here.
+      PADDLE_ENFORCE(!dims.empty(),
+                     "The dim attribute should hold at least one dimension.");
       auto dims_vector = vectorize(x_dims);
-      if (keep_dim || x_rank == 1) {
-        dims_vector[dim] = 1;
+      if (keep_dim) {
+        for (size_t i = 0; i < dims.size(); ++i) {
+          dims_vector[dims[i]] = 1;
+        }
       } else {
-        dims_vector.erase(dims_vector.begin() + dim);
+        // Mark every reduced axis with a sentinel, then drop them in one pass.
+        const int kDelFlag = -2;
+        for (size_t i = 0; i < dims.size(); ++i) {
+          dims_vector[dims[i]] = kDelFlag;
+        }
+        dims_vector.erase(
+            std::remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
+            dims_vector.end());
+        // Reducing every axis (e.g. a rank-1 input) must still yield shape
+        // {1}, as the previous `x_rank == 1` case did, not an empty shape.
+        if (dims_vector.empty()) {
+          dims_vector.push_back(1);
+        }
       }
       auto out_dims = framework::make_ddim(dims_vector);
       ctx->SetOutputDim("Out", out_dims);
-      if (dim != 0) {
+      if (dims[0] != 0) {
         // Only pass LoD when not reducing on the first dim.
         ctx->ShareLoD("X", /*->*/ "Out");
       }
@@ -75,11 +87,20 @@ class ReduceGradOp : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputDim("X");
     auto x_rank = x_dims.size();
     PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
-    int dim = ctx->Attrs().Get<int>("dim");
-    if (dim < 0) dim = x_rank + dim;
-    PADDLE_ENFORCE_LT(
-        dim, x_rank,
-        "The dim should be in the range [-rank(input), rank(input)).");
+    // Work on a copy of the attribute so entries can be normalized in place.
+    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
+    for (size_t i = 0; i < dims.size(); ++i) {
+      // Check the lower bound as well: an entry below -rank(input) would
+      // otherwise pass validation and stay negative after normalization.
+      PADDLE_ENFORCE_GE(
+          dims[i], -x_rank,
+          "The dim should be in the range [-rank(input), rank(input)).");
+      PADDLE_ENFORCE_LT(
+          dims[i], x_rank,
+          "The dim should be in the range [-rank(input), rank(input)).");
+      if (dims[i] < 0) dims[i] = x_rank + dims[i];
+    }
+    std::sort(dims.begin(), dims.end());
     auto x_grad_name = framework::GradVarName("X");
     if (ctx->HasOutput(x_grad_name)) {
       ctx->SetOutputDim(x_grad_name, x_dims);
@@ -95,13 +110,13 @@ class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor) The input tensor. Tensors with rank at most 6 are "
              "supported.");
     AddOutput("Out", "(Tensor) The result tensor.");
-    AddAttr<int>(
+    AddAttr<std::vector<int>>(
         "dim",
-        "(int, default 0) The dimension to reduce. "
+        "(list<int>, default {0}) The dimensions to reduce. "
         "Must be in the range [-rank(input), rank(input)). "
-        "If `dim < 0`, the dim to reduce is `rank + dim`. "
+        "If `dim[i] < 0`, the dimension to reduce is `rank + dim[i]`. "
         "Note that reducing on the first dim will make the LoD info lost.")
-        .SetDefault(0);
+        .SetDefault({0});  // a one-element list: reduce over dimension 0
     AddAttr<bool>("keep_dim",
                   "(bool, default false) "
                   "If true, retain the reduced dimension with length 1.")
 
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <algorithm>
+#include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -109,6 +110,18 @@ struct ProdGradFunctor {
   }
 };
 
+// Calls ReduceCompute<NDIM, RDIM>(context) when the runtime values `ndim`
+// and `rdim` equal the given pair. The names `ndim`, `rdim` and `context`
+// must be in scope at the expansion site.
+// The body is wrapped in do/while(0) so that `HANDLE_DIM(a, b);` is a single
+// statement and cannot capture a following `else`.
+#define HANDLE_DIM(NDIM, RDIM)            \
+  do {                                    \
+    if (ndim == NDIM && rdim == RDIM) {   \
+      ReduceCompute<NDIM, RDIM>(context); \
+    }                                     \
+  } while (0)
+
 template <typename DeviceContext, typename T, typename Functor>
 class ReduceKernel : public framework::OpKernel<T> {
  public:
@@ -127,51 +133,66 @@ class ReduceKernel : public framework::OpKernel<T> {
       Functor functor;
       functor(place, &x, &out, reduce_dim);
     } else {
-      int rank = context.Input<Tensor>("X")->dims().size();
-      switch (rank) {
-        case 1:
-          ReduceCompute<1>(context);
-          break;
-        case 2:
-          ReduceCompute<2>(context);
-          break;
-        case 3:
-          ReduceCompute<3>(context);
-          break;
-        case 4:
-          ReduceCompute<4>(context);
-          break;
-        case 5:
-          ReduceCompute<5>(context);
-          break;
-        case 6:
-          ReduceCompute<6>(context);
-          break;
-      }
+      int ndim = context.Input<Tensor>("X")->dims().size();
+      int rdim = context.Attr<std::vector<int>>("dim").size();
+      // Only the (ndim, rdim) pairs listed below are instantiated. Reject
+      // anything else up front instead of silently skipping the reduction.
+      PADDLE_ENFORCE(
+          ndim <= 6 && rdim >= 1 && (rdim < ndim || (ndim == 1 && rdim == 1)),
+          "Unsupported reduce: input rank %d with %d reduced dims.", ndim,
+          rdim);
+      HANDLE_DIM(6, 5);
+      HANDLE_DIM(6, 4);
+      HANDLE_DIM(6, 3);
+      HANDLE_DIM(6, 2);
+      HANDLE_DIM(6, 1);
+      HANDLE_DIM(5, 4);
+      HANDLE_DIM(5, 3);
+      HANDLE_DIM(5, 2);
+      HANDLE_DIM(5, 1);
+      HANDLE_DIM(4, 3);
+      HANDLE_DIM(4, 2);
+      HANDLE_DIM(4, 1);
+      HANDLE_DIM(3, 2);
+      HANDLE_DIM(3, 1);
+      HANDLE_DIM(2, 1);
+      HANDLE_DIM(1, 1);
     }
   }
 
  private:
-  template <size_t D>
+  // Reduces the rank-D input "X" over the R_D axes listed in the "dim"
+  // attribute and writes the result into "Out".
+  template <size_t D, size_t R_D>
   void ReduceCompute(const framework::ExecutionContext& context) const {
     auto* input = context.Input<Tensor>("X");
     auto* output = context.Output<Tensor>("Out");
     output->mutable_data<T>(context.GetPlace());
 
     auto x = EigenTensor<T, D>::From(*input);
     auto x_rank = static_cast<int>(x.dimensions().size());
-    int dim = static_cast<int>(context.Attr<int>("dim"));
-    if (dim < 0) dim = x_rank + dim;
-    auto reduce_dim = Eigen::array<int, 1>({{dim}});
+    auto dims = context.Attr<std::vector<int>>("dim");
+    auto reduce_dim = Eigen::array<int, R_D>();
+    for (size_t i = 0; i < dims.size(); ++i) {
+      if (dims[i] < 0) dims[i] = x_rank + dims[i];
+      reduce_dim[i] = dims[i];
+    }
     // construct the squeezed output tensor
     bool keep_dim = context.Attr<bool>("keep_dim");
-    DDim dims = output->dims();
-    auto dims_vector = vectorize(dims);
+    DDim out_dims = output->dims();
     if (keep_dim && x_rank > 1) {
-      dims_vector.erase(dims_vector.begin() + dim);
-      dims = framework::make_ddim(dims_vector);
+      // With keep_dim the output shape still carries the reduced axes as
+      // size-1 entries; drop them so the Eigen view below has rank D - R_D.
+      const int kDelFlag = -2;
+      auto dims_vector = vectorize(out_dims);
+      for (size_t i = 0; i < dims.size(); ++i) {
+        dims_vector[dims[i]] = kDelFlag;
+      }
+      dims_vector.erase(
+          std::remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
+          dims_vector.end());
+      out_dims = framework::make_ddim(dims_vector);
     }
-
     auto& place =
         *context.template device_context<DeviceContext>().eigen_device();
     Functor functor;
@@ -180,7 +191,7 @@ class ReduceKernel : public framework::OpKernel<T> {
       auto out = EigenScalar<T>::From(*output);
       functor(place, &x, &out, reduce_dim);
     } else {
-      auto out = EigenTensor<T, (D - 1)>::From(*output, dims);
+      auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
       functor(place, &x, &out, reduce_dim);
     }
   }
@@ -245,21 +256,31 @@ class ReduceGradKernel : public framework::OpKernel<T> {
     auto x = EigenTensor<T, D>::From(*input0);
     auto x_grad = EigenTensor<T, D>::From(*output);
     auto x_rank = static_cast<int>(x.dimensions().size());
-    int dim = static_cast<int>(context.Attr<int>("dim"));
-    if (dim < 0) dim = x_rank + dim;
-    DDim dims = input0->dims();
-    dims[dim] = 1;
-    auto x_reduce = EigenTensor<T, D>::From(*input1, dims);
-    auto x_reduce_grad = EigenTensor<T, D>::From(*input2, dims);
-
+    const auto& reduce_axes = context.Attr<std::vector<int>>("dim");
+    const auto x_dims = input0->dims();
+    auto keep_dim_shape = vectorize(x_dims);
     Eigen::array<int, D> broadcast_dim;
     for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
-    broadcast_dim[dim] = input0->dims()[dim];
+
+    // broadcast_dim ends up holding the input extent on every reduced axis
+    // and 1 elsewhere; broadcast_times is the product of those extents.
+    int broadcast_times = 1;
+    for (int axis : reduce_axes) {
+      if (axis < 0) axis += x_rank;
+      keep_dim_shape[axis] = 1;
+      broadcast_dim[axis] = x_dims[axis];
+      broadcast_times *= x_dims[axis];
+    }
+    const auto reduced_dims = framework::make_ddim(keep_dim_shape);
+    auto x_reduce = EigenTensor<T, D>::From(*input1, reduced_dims);
+    auto x_reduce_grad = EigenTensor<T, D>::From(*input2, reduced_dims);
+
     auto& place =
         *context.template device_context<DeviceContext>().eigen_device();
+
     Functor functor;
     functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
-            broadcast_dim[dim]);
+            broadcast_times);
   }
 };