
Commit 3bd1d22

Authored by chengduo
Enhance fused_elementwise_activation_op (#12837)
* Enhance the function of fused_elementwise_activation_op
* enhance unit test
* Clean Code And Add Doc
* Add compound functors
* Fix doc and enhance unit test
* define Dx and Dy for d_binary_func
* add mul_scale
* add mul_scale
* add elementwise_mul
* code refine
* code refine
* add doc
* add AsIntermediate
1 parent a615ad4 commit 3bd1d22

File tree

7 files changed (+1897, -1257 lines)


paddle/fluid/operators/elementwise_op_function.h

Lines changed: 928 additions & 104 deletions
Large diffs are not rendered by default.

paddle/fluid/operators/fused_elemwise_activation_op.cc

Lines changed: 181 additions & 61 deletions
@@ -12,14 +12,60 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/fused_elemwise_activation_op.h"
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/operators/fused_elemwise_activation_op.h"
-
 namespace paddle {
 namespace operators {
 
+/*
+ * Whether the compound function is Unary(Binary(X, Y)).
+ * For Unary(Binary(X, Y)), the intermediate_out's shape is the same as the
+ * final out's.
+ */
+static bool IsUnaryCompound(const std::vector<std::string> &functor_list) {
+  PADDLE_ENFORCE_EQ(functor_list.size(), 2);
+  static std::unordered_set<std::string> binary_fun = {
+      "elementwise_add", "elementwise_mul", "elementwise_add_grad",
+      "elementwise_mul_grad"};
+  return binary_fun.count(functor_list[1]) != 0;
+}
+
+/*
+ * Whether the Input(X) could be absent.
+ */
+static bool InputXCanBeAbsent(const std::vector<std::string> &functor_list) {
+  PADDLE_ENFORCE_EQ(functor_list.size(), 2);
+  static std::unordered_set<std::string> binary_fun = {"elementwise_add_grad"};
+  return binary_fun.count(functor_list[0]) != 0 ||
+         binary_fun.count(functor_list[1]) != 0;
+}
+
+/*
+ * Whether the compound function is supported.
+ * For Unary(Binary(X, Y)), the intermediate_out's shape is the same as the
+ * final out's.
+ */
+static bool IsSupportedCompound(const std::vector<std::string> &functors) {
+  static std::unordered_set<std::string> unary_fun = {"scale", "relu"};
+  static std::unordered_set<std::string> binary_fun = {"elementwise_add",
+                                                       "elementwise_mul"};
+
+  std::string unary_fun_str;
+  if (binary_fun.count(functors[0])) {
+    unary_fun_str = functors[1];
+  } else if (binary_fun.count(functors[1])) {
+    unary_fun_str = functors[0];
+  } else {
+    PADDLE_THROW("%s and %s are not included in fused_list.", functors[0],
+                 functors[1]);
+  }
+  PADDLE_ENFORCE_EQ(unary_fun.count(unary_fun_str), 1,
+                    "%s is not included in fused_list.", unary_fun_str);
+  return true;
+}
+
 class FusedElemwiseActivationOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
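
The helpers above encode the functor_list convention: a two-element list describes Unary(Binary(X, Y)) exactly when its second entry names a binary functor, and Binary(X, Unary(Y)) otherwise. A minimal standalone sketch of that classification (plain STL, hypothetical name IsUnaryCompoundSketch; not part of this commit):

#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

static bool IsUnaryCompoundSketch(const std::vector<std::string> &functor_list) {
  static const std::unordered_set<std::string> binary_fun = {"elementwise_add",
                                                             "elementwise_mul"};
  // functor_list[1] being a binary op means the unary functor wraps the binary
  // result: Out = Unary(Binary(X, Y)).
  return binary_fun.count(functor_list[1]) != 0;
}

int main() {
  // {"scale", "elementwise_add"} -> Out = scale(X + Y)   (Unary(Binary(X, Y)))
  std::cout << IsUnaryCompoundSketch({"scale", "elementwise_add"}) << "\n";  // 1
  // {"elementwise_add", "scale"} -> Out = X + scale(Y)   (Binary(X, Unary(Y)))
  std::cout << IsUnaryCompoundSketch({"elementwise_add", "scale"}) << "\n";  // 0
  return 0;
}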
@@ -37,11 +83,44 @@ class FusedElemwiseActivationOp : public framework::OperatorWithKernel {
 
     auto x_dim = ctx->GetInputDim("X");
     auto y_dim = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
-                      "Rank of first input must >= rank of second input.");
 
-    ctx->SetOutputDim("Out", x_dim);
-    ctx->ShareLoD("X", /*->*/ "Out");
+    // Whether the shape of Y is a continuous subsequence of X,
+    // For more information please refer to the op's introduction.
+    bool bcast_y = x_dim.size() >= y_dim.size();
+    if (x_dim.size() == y_dim.size()) {
+      for (int i = 0; i < x_dim.size(); ++i) {
+        if (x_dim[i] < y_dim[i]) {
+          bcast_y = false;
+          break;
+        }
+      }
+    }
+
+    auto &out_dim = bcast_y ? x_dim : y_dim;
+    std::string out_lod = bcast_y ? "X" : "Y";
+
+    if (ctx->Attrs().Get<bool>("keep_intermediate_value")) {
+      PADDLE_ENFORCE(ctx->HasOutput("IntermediateOut"),
+                     "Output(IntermediateOut) of FusedElemwiseActivationOp "
+                     "should not be null.");
+
+      if (IsUnaryCompound(
+              ctx->Attrs().Get<std::vector<std::string>>("functor_list"))) {
+        // for Unary(Binary(X, Y)), the shape and lod of out and
+        // intermediate_out are the same.
+        ctx->SetOutputDim("IntermediateOut", out_dim);
+        // set the lod of intermediate_out
+        ctx->ShareLoD(out_lod, /*->*/ "IntermediateOut");
+      } else {
+        // for Binary(X, Unary(Y)), the shape and lod of Y and
+        // intermediate_out are the same.
+        ctx->SetOutputDim("IntermediateOut", y_dim);
+        // set the lod of intermediate_out
+        ctx->ShareLoD("Y", /*->*/ "IntermediateOut");
+      }
+    }
+    ctx->SetOutputDim("Out", out_dim);
+    ctx->ShareLoD(out_lod, /*->*/ "Out");
   }
 
  protected:
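
The InferShape change above picks Out's shape from whichever input is the "larger" one, i.e. the one the other input broadcasts to. A minimal sketch of that rule with plain STL types (hypothetical helper ChooseOutDim, not Paddle's DDim):

#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> ChooseOutDim(const std::vector<int64_t> &x_dim,
                                  const std::vector<int64_t> &y_dim) {
  bool bcast_y = x_dim.size() >= y_dim.size();
  if (x_dim.size() == y_dim.size()) {
    for (size_t i = 0; i < x_dim.size(); ++i) {
      if (x_dim[i] < y_dim[i]) {
        bcast_y = false;  // Y is the larger tensor, so Out follows Y.
        break;
      }
    }
  }
  return bcast_y ? x_dim : y_dim;
}

int main() {
  std::vector<int64_t> x = {2, 3, 4, 5};
  std::vector<int64_t> y = {4, 5};
  // Y = (4, 5) is a continuous (trailing) subsequence of X = (2, 3, 4, 5):
  // Out follows X's shape.
  assert(ChooseOutDim(x, y) == x);
  // Swap the roles: now the first input is the smaller one, so Out follows
  // the second input's shape.
  assert(ChooseOutDim(y, x) == x);
  return 0;
}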
@@ -59,29 +138,42 @@ class FusedElemwiseActivationOp : public framework::OperatorWithKernel {
 class FusedElemwiseActivationMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("X", "(vector<Tensor>)");
-    AddInput("Y", "(vector<Tensor>)");
-    AddOutput("Out", "vector<Tensor>");
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of fused_elemwise_activation operator.");
+    AddInput(
+        "Y",
+        "(Tensor) The input tensor of fused_elemwise_activation operator.");
+    AddOutput("Out",
+              "vector<Tensor> The output tensor of fused_elemwise_activation "
+              "operator.");
+    AddOutput("IntermediateOut",
+              "Tensor The IntermediateOut tensor of fused_elemwise_activation "
+              "operator.")
+        .AsIntermediate();
     AddAttr<int>("axis",
                  "axis is used by elementwise_op, the default value is -1.")
         .SetDefault(-1);
     AddAttr<float>("scale",
                    "scale is used by scale_op, the default value is 0.0.")
         .SetDefault(0.0);
-    AddAttr<bool>("recomputation",
-                  "Whether to recompute the Out."
-                  "fused_elemwise_activation_grad has two methods to get the "
-                  "dx and dy, one "
-                  "is to use the 'Out', and the other is not to use it. "
-                  "The former method will save the time of recomputing the "
-                  "'Out', but it must occupy the memory to store the 'out'. "
-                  "While, the later method can avoid occupying the memory, "
-                  "but it must recompute the 'Out'. The default value is true.")
+    AddAttr<bool>(
+        "recomputation",
+        "Whether to recompute the Out. "
+        "The computation of fused_elemwise_activation_grad has two methods to "
+        "get the dx and dy, one is to use the 'Out', and the other is not. "
+        "The former method saves the time of recomputing the 'Out', but it "
+        "must occupy the memory to store the 'Out'. The latter method avoids "
+        "occupying the memory, but it must recompute the 'Out'. "
+        "It is useful for Unary(Binary(X, Y)). The default value is true.")
         .SetDefault(true);
+    AddAttr<bool>("keep_intermediate_value",
+                  "Whether to save the intermediate_out.")
+        .SetDefault(false);
     AddAttr<std::vector<std::string>>("functor_list",
                                       "The functors that should be fused.")
         .AddCustomChecker([&](const std::vector<std::string> &functor_list) {
-          PADDLE_ENFORCE(ValidCheck(functor_list));
+          PADDLE_ENFORCE(IsSupportedCompound(functor_list));
         });
 
     AddComment(R"DOC(
@@ -93,30 +185,38 @@ operators (elementwise_op and activation_op):
 Z = Binary(X, Unary(Y))
 Z = Unary(Binary(X, Y))
 
-The attributions of activation_op can be get from fused_elemwise_activation_op's
-attributions. functor_list records the functors to be fused, for example
-"scale,elementwise_add".
+There are two cases for this operator:
 
-)DOC");
-  }
+1. The shape of $Y$ and $X$ is the same.
+2. The shape of $Y$ is a continuous subsequence of $X$ or the shape of $X$ is a continuous subsequence of $Y$.
 
- private:
-  bool ValidCheck(const std::vector<std::string> &functors) {
-    std::unordered_set<std::string> unary_fun = {"scale", "relu"};
-    std::unordered_set<std::string> binary_fun = {"elementwise_add"};
+For case 2 (assume that the shape of $Y$ is a continuous subsequence of $X$):
 
-    std::string unary_fun_str;
-    if (binary_fun.count(functors[0])) {
-      unary_fun_str = functors[1];
-    } else if (binary_fun.count(functors[1])) {
-      unary_fun_str = functors[0];
-    } else {
-      PADDLE_THROW("%s and %s are not included in fused_list.", functors[0],
-                   functors[1]);
-    }
-    PADDLE_ENFORCE_EQ(unary_fun.count(unary_fun_str), 1,
-                      "%s is not included in fused_list.", unary_fun_str);
-    return true;
+1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index
+   for broadcasting $Y$ onto $X$.
+2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$.
+3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of
+   subsequence, such as shape(Y) = (2, 1) => (2).
+
+For example:
+
+  .. code-block:: python
+
+    shape(X) = (2, 3, 4, 5), shape(Y) = (,)
+    shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
+    shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1 (default) or axis=2
+    shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
+    shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
+    shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
+
+The inputs $X$ and $Y$ can carry different LoD information.
+But the output only shares the LoD information with the input whose shape is the same as Out's.
+The attributes of activation_op can be obtained from fused_elemwise_activation_op's attributes.
+The functor_list records the functions to be fused, for example
+["scale", "elementwise_add"].
+
+)DOC");
   }
 };
 
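As a worked illustration of the doc above: with functor_list ["scale", "elementwise_add"] the compound is Unary(Binary(X, Y)), i.e. Out = scale * (X + Y), with Y broadcast over X when Y's shape is a trailing continuous subsequence of X's. A hedged sketch (hypothetical FusedAddScale; broadcasting modeled by modulo indexing, which assumes Y matches X's trailing dimensions):

#include <cstddef>
#include <iostream>
#include <vector>

std::vector<float> FusedAddScale(const std::vector<float> &x,
                                 const std::vector<float> &y, float scale) {
  std::vector<float> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    out[i] = scale * (x[i] + y[i % y.size()]);  // Unary(Binary(X, Y))
  }
  return out;
}

int main() {
  // shape(X) = (2, 3), shape(Y) = (3,), so axis defaults to rank(X) - rank(Y) = 1.
  std::vector<float> x = {1, 2, 3, 4, 5, 6};
  std::vector<float> y = {10, 20, 30};
  for (float v : FusedAddScale(x, y, 0.5f)) std::cout << v << " ";
  // prints: 5.5 11 16.5 7 12.5 18
  return 0;
}
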
@@ -141,6 +241,7 @@ class FusedElemwiseActivationGradMaker
       op_desc_ptr->SetInput(framework::GradVarName(output_param),
                             this->OutputGrad(output_param));
     }
+
     op_desc_ptr->SetAttrMap(this->Attrs());
 
     std::vector<std::string> functor_names =
@@ -158,40 +259,59 @@ class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-
-    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
-                      "Rank of first input must >= rank of second input.");
+                   "Input(Out@Grad) should not be null");
+    if (ctx->Attrs().Get<bool>("keep_intermediate_value")) {
+      PADDLE_ENFORCE(ctx->HasInput("IntermediateOut"),
+                     "Input(IntermediateOut) should not be null");
+    } else {
+      PADDLE_ENFORCE_EQ(ctx->Inputs(framework::GradVarName("Out")).size(), 1);
+    }
 
+    auto functor_list =
+        ctx->Attrs().Get<std::vector<std::string>>("functor_list");
     auto x_grad_name = framework::GradVarName("X");
     auto y_grad_name = framework::GradVarName("Y");
+
     if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
+      if (ctx->HasInputs("X")) {
+        ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
+        ctx->ShareLoD("X", x_grad_name);
+      } else {
+        // Note: if "X" is absent, the shape of Y should be a continuous
+        // subsequence of X; otherwise we could not infer the shape of dx.
+
+        // Currently, "X" may be absent only when Binary is elementwise_add or
+        // elementwise_sub.
+        PADDLE_ENFORCE(InputXCanBeAbsent(functor_list),
+                       "Only when BinaryFunctor is elementwise_add, the 'X' "
+                       "could be absent.");
+
+        // For Unary(Binary(X, Y)), IntermediateOut should not be empty.
+        if (IsUnaryCompound(functor_list)) {
+          PADDLE_ENFORCE(
+              ctx->HasInputs("IntermediateOut"),
+              "If the compound_functor is Unary(Binary(X, Y)) and Binary "
+              "is elementwise_add, the intermediate_out must not be absent.");
+        }
+
+        ctx->SetOutputDim(x_grad_name,
+                          ctx->GetInputDim(framework::GradVarName("Out")));
+        ctx->ShareLoD(framework::GradVarName("Out"), x_grad_name);
+      }
     }
     if (ctx->HasOutput(y_grad_name)) {
-      ctx->SetOutputDim(y_grad_name, y_dims);
+      PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
+      ctx->SetOutputDim(y_grad_name, ctx->GetInputDim("Y"));
+      ctx->ShareLoD("Y", y_grad_name);
     }
   }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type_index = ctx.Input<framework::Tensor>("X")->type();
-    PADDLE_ENFORCE_EQ(input_data_type_index,
-                      ctx.Input<framework::Tensor>("Y")->type(),
-                      "The element's type of input should be the same.");
-    PADDLE_ENFORCE_EQ(
-        input_data_type_index,
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
-        "The element's type of input should be the same.");
-
+    // PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
+    auto input_data_type_index = ctx.Input<framework::Tensor>("Y")->type();
     auto input_data_type = framework::ToDataType(input_data_type_index);
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
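
A minimal sketch of the dX shape rule introduced in the grad InferShape above (hypothetical helper InferDxDim, plain STL types): dX reuses X's shape when X is present; otherwise it falls back to the shape of Out@Grad, which the code only allows when the binary functor is elementwise_add (see InputXCanBeAbsent):

#include <cstdint>
#include <stdexcept>
#include <vector>

std::vector<int64_t> InferDxDim(bool has_x, const std::vector<int64_t> &x_dim,
                                const std::vector<int64_t> &out_grad_dim,
                                bool x_can_be_absent) {
  if (has_x) return x_dim;  // dX shares X's shape (and LoD).
  if (!x_can_be_absent) {
    throw std::runtime_error("X may be absent only for elementwise_add.");
  }
  return out_grad_dim;  // dX falls back to the shape of Out@Grad.
}

int main() {
  std::vector<int64_t> x_dim = {2, 3, 4, 5};
  std::vector<int64_t> dout_dim = {2, 3, 4, 5};
  // With X available, dX simply reuses X's dims.
  auto dx = InferDxDim(/*has_x=*/true, x_dim, dout_dim, /*x_can_be_absent=*/false);
  // Without X (e.g. elementwise_add in the functor list), dX follows Out@Grad.
  auto dx_no_x = InferDxDim(/*has_x=*/false, {}, dout_dim, /*x_can_be_absent=*/true);
  return (dx == dout_dim && dx_no_x == dout_dim) ? 0 : 1;
}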
