Skip to content

Commit 146e942

Browse files
authored
Merge pull request #15250 from tensor-tang/refine/seqpool/feed
Refine/seqpool/feed with infer zerocopytensor
2 parents 8f17c71 + 96786d3 commit 146e942

File tree

9 files changed

+378
-48
lines changed

9 files changed

+378
-48
lines changed

paddle/fluid/framework/ir/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
6969
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
7070
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
7171
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
72+
cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
7273
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
7374
if (WITH_MKLDNN)
7475
cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)

paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc

Lines changed: 39 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -39,21 +39,25 @@ PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern,
3939

4040
auto is_seqpool_op_with_pootype_of_nth_input_of_concat = [=](
4141
Node* x, const std::string& type, int idx) -> bool {
42-
bool ok = x && x->IsOp() && x->Op()->Type() == "sequence_pool" &&
43-
x->Op()->HasAttr("pooltype") &&
44-
boost::get<std::string>(x->Op()->GetAttr("pooltype")) == type &&
45-
x->outputs.size() == 2; // seqpool should only have 2 outputs
46-
if (ok) {
47-
// only one output of seqpool_op is nth_input_var of concat
48-
// the other one should be unused empty var
42+
bool this_is_seqpool_op =
43+
x && x->IsOp() && x->Op()->Type() == "sequence_pool" &&
44+
x->Op()->HasAttr("pooltype") &&
45+
boost::get<std::string>(x->Op()->GetAttr("pooltype")) == type &&
46+
x->outputs.size() == 2; // seqpool should only have 2 outputs
47+
bool satisfied_all = this_is_seqpool_op;
48+
if (this_is_seqpool_op) {
49+
// Only one output of seqpool_op is nth_input_var of concat,
50+
// the other one should be unused empty var.
4951
if (is_nth_input_var_of_concat(x->outputs[0], idx)) {
50-
ok = ok && x->outputs[1]->IsVar() && x->outputs[1]->outputs.size() == 0;
52+
satisfied_all = satisfied_all && x->outputs[1]->IsVar() &&
53+
x->outputs[1]->outputs.size() == 0;
5154
} else {
52-
ok = ok && is_nth_input_var_of_concat(x->outputs[1], idx) &&
53-
x->outputs[0]->IsVar() && x->outputs[0]->outputs.size() == 0;
55+
satisfied_all =
56+
satisfied_all && is_nth_input_var_of_concat(x->outputs[1], idx) &&
57+
x->outputs[0]->IsVar() && x->outputs[0]->outputs.size() == 0;
5458
}
5559
}
56-
return ok;
60+
return satisfied_all;
5761
};
5862

5963
auto* concat_op = pattern->NewNode(
@@ -72,6 +76,7 @@ PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern,
7276

7377
std::vector<PDNode*> seqpool_ops_input_var(num_inputs);
7478
std::vector<PDNode*> seqpool_ops_output_var(num_inputs);
79+
std::vector<PDNode*> seqpool_ops_output_unused_var(num_inputs);
7580
std::vector<PDNode*> seqpool_ops(num_inputs);
7681

7782
for (int i = 0; i < num_inputs; ++i) {
@@ -84,6 +89,15 @@ PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern,
8489
},
8590
name_scope + "/sequence_pool_out_" + std::to_string(i));
8691

92+
seqpool_ops_output_unused_var[i] = pattern->NewNode(
93+
[=](Node* x) {
94+
return x && x->IsVar() && x->inputs.size() == 1 &&
95+
x->outputs.size() == 0 &&
96+
is_seqpool_op_with_pootype_of_nth_input_of_concat(x->inputs[0],
97+
"SUM", i);
98+
},
99+
name_scope + "/sequence_pool_unused_out_" + std::to_string(i));
100+
87101
seqpool_ops[i] = pattern->NewNode(
88102
[=](Node* x) {
89103
return x && x->IsOp() &&
@@ -93,23 +107,29 @@ PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern,
93107

94108
seqpool_ops_input_var[i] = pattern->NewNode(
95109
[=](Node* x) {
96-
return x && x->IsVar() && x->outputs.size() >= 1 &&
97-
is_seqpool_op_with_pootype_of_nth_input_of_concat(
98-
x->outputs[0], "SUM", i);
110+
bool basic = x && x->IsVar() && x->outputs.size() >= 1;
111+
bool next_is_fine = false;
112+
for (auto* o : x->outputs) {
113+
if (is_seqpool_op_with_pootype_of_nth_input_of_concat(o, "SUM",
114+
i)) {
115+
next_is_fine = true;
116+
break;
117+
}
118+
}
119+
return basic && next_is_fine;
99120
},
100121
name_scope + "/sequence_pool_in_" + std::to_string(i));
101122

102123
// Links
103124
seqpool_ops[i]
104125
->LinksFrom({seqpool_ops_input_var[i]})
105-
.LinksTo({seqpool_ops_output_var[i]});
126+
.LinksTo({seqpool_ops_output_var[i], seqpool_ops_output_unused_var[i]});
106127
}
107128
concat_op->LinksFrom(seqpool_ops_output_var).LinksTo({concat_out_var});
108129
return concat_out_var;
109130
}
110131

111-
int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
112-
int num_inputs) {
132+
int BuildFusion(Graph* graph, const std::string& name_scope, int num_inputs) {
113133
GraphPatternDetector gpd;
114134
auto* pattern = gpd.mutable_pattern();
115135
BuildSeqPoolConcatPattern(pattern, name_scope, num_inputs);
@@ -178,8 +198,8 @@ std::unique_ptr<ir::Graph> SeqPoolConcatFusePass::ApplyImpl(
178198
FusePassBase::Init(name_scope_, graph.get());
179199
int fusion_count = 0;
180200
for (int i = MAX_CONCAT_INPUTS; i > 0; --i) {
181-
fusion_count += BuildFusion(
182-
graph.get(), name_scope_ + "/" + std::to_string(i), param_scope(), i);
201+
fusion_count +=
202+
BuildFusion(graph.get(), name_scope_ + "/" + std::to_string(i), i);
183203
}
184204
AddStatis(fusion_count);
185205

paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,20 @@ namespace paddle {
2323
namespace framework {
2424
namespace ir {
2525

26+
/**
27+
 * Fuse SequencePool (currently only the SUM pooltype is supported) and Concat;
28+
*
29+
* Before fuse:
30+
* | | |
31+
* seq_pool, seq_pool, ... seq_pool
32+
* \ | ... /
33+
* concat
34+
* |
35+
* After fuse:
36+
* \ | /
37+
* FusionSeqPoolConcat
38+
* |
39+
*/
2640
class SeqPoolConcatFusePass : public FusePassBase {
2741
public:
2842
virtual ~SeqPoolConcatFusePass() {}
Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h"
#include <gtest/gtest.h>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_proto_maker.h"
18+
19+
namespace paddle {
20+
namespace framework {
21+
namespace ir {
22+
23+
// Append an op of |type| to block 0 of |prog| and wire up the given
// input/output variable names. "sequence_pool" and "concat" receive the
// op-specific slots/attributes the fuse pass pattern matches on; any
// other type is wired as a generic X -> Out op.
void SetOp(ProgramDesc* prog, const std::string& type,
           const std::vector<std::string>& inputs,
           const std::vector<std::string>& outputs) {
  auto* desc = prog->MutableBlock(0)->AppendOp();
  desc->SetType(type);
  if (type == "sequence_pool") {
    // One input, two outputs: outputs[0] fills the MaxIndex slot,
    // outputs[1] fills Out. Pooltype is fixed to "SUM" — the only
    // pooltype the fuse pass currently matches.
    desc->SetInput("X", {inputs[0]});
    std::string pooltype = "SUM";
    desc->SetAttr("pooltype", pooltype);
    desc->SetOutput("MaxIndex", {outputs[0]});
    desc->SetOutput("Out", {outputs[1]});
  } else if (type == "concat") {
    desc->SetInput("X", inputs);
    desc->SetAttr("axis", 1);
    desc->SetOutput("Out", {outputs[0]});
  } else {
    desc->SetInput("X", inputs);
    desc->SetOutput("Out", outputs);
  }
  desc->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
                static_cast<int>(OpRole::kForward));
}
45+
46+
// Count the ops in |graph| whose type equals |op_type|
// (defaults to the fused op this pass is expected to produce).
int CountOpType(const ir::Graph* graph,
                const std::string& op_type = "fusion_seqpool_concat") {
  int num_found = 0;
  for (auto* n : graph->Nodes()) {
    if (!n->IsOp()) continue;
    if (n->Op()->Type() == op_type) ++num_found;
  }
  return num_found;
}
56+
57+
std::unique_ptr<ir::Graph> GetNumNodesOfBeforeAfter(
58+
std::unique_ptr<ir::Graph> graph, int* before, int* after,
59+
const std::string& pass_type = "seqpool_concat_fuse_pass") {
60+
auto pass = PassRegistry::Instance().Get(pass_type);
61+
*before = graph->Nodes().size();
62+
graph = pass->Apply(std::move(graph));
63+
*after = graph->Nodes().size();
64+
return graph;
65+
}
66+
67+
/*
68+
* Before fuse:
69+
* a b c
70+
* | | |
71+
* op1 op2 op3
72+
* / \ / \ / \
73+
* d e f g h i
74+
* \ | /
75+
* concat
76+
* |
77+
* j
78+
* Type of op1, op2 and op3 are sequence_pool, with "SUM" pooltype attr
79+
*
80+
* After fuse:
81+
* a b c
82+
* \ | /
83+
* fusion_seqpool_concat
84+
* |
85+
* j
86+
*/
87+
TEST(SeqPoolConcatFusePass, basic) {
  ProgramDesc prog;
  const std::vector<std::string> var_names(
      {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"});
  for (const auto& name : var_names) {
    auto* var = prog.MutableBlock(0)->Var(name);
    var->SetType(proto::VarType::LOD_TENSOR);
  }

  // Three SUM sequence_pools; the first listed output goes to the unused
  // MaxIndex slot, the second (e, g, i) feeds the concat.
  SetOp(&prog, "sequence_pool", std::vector<std::string>({"a"}),
        std::vector<std::string>({"d", "e"}));
  SetOp(&prog, "sequence_pool", std::vector<std::string>({"b"}),
        std::vector<std::string>({"f", "g"}));
  SetOp(&prog, "sequence_pool", std::vector<std::string>({"c"}),
        std::vector<std::string>({"h", "i"}));
  SetOp(&prog, "concat", std::vector<std::string>({"e", "g", "i"}),
        std::vector<std::string>({"j"}));

  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
  int num_nodes_before = 0;
  int num_nodes_after = 0;
  graph = GetNumNodesOfBeforeAfter(std::move(graph), &num_nodes_before,
                                   &num_nodes_after);
  // Removes 10 nodes (op1, op2, op3, d, e, f, g, h, i, concat_op) and
  // adds 1 (fusion_seqpool_concat): net -9.
  EXPECT_EQ(num_nodes_after, num_nodes_before - 9);
  EXPECT_EQ(CountOpType(graph.get()), 1);
}
112+
113+
/*
114+
* Before fuse:
115+
* a b
116+
* | / \
117+
* op1 op2 op3
118+
* / \ / \ \
119+
* c d e f g
120+
* \ /
121+
* concat
122+
* |
123+
* h
124+
* Type of op1 and op2 are sequence_pool, with "SUM" pooltype attr
125+
*
126+
* After fuse:
127+
* a b
128+
* \ / \
129+
* fusion_seqpool_concat op3
130+
* | |
131+
* h g
132+
*/
133+
TEST(SeqPoolConcatFusePass, advanced) {
  ProgramDesc prog;
  const std::vector<std::string> var_names(
      {"a", "b", "c", "d", "e", "f", "g", "h"});
  for (const auto& name : var_names) {
    auto* var = prog.MutableBlock(0)->Var(name);
    var->SetType(proto::VarType::LOD_TENSOR);
  }

  SetOp(&prog, "sequence_pool", std::vector<std::string>({"a"}),
        std::vector<std::string>({"c", "d"}));
  SetOp(&prog, "sequence_pool", std::vector<std::string>({"b"}),
        std::vector<std::string>({"e", "f"}));
  // op3 is not a sequence_pool, so it and its output g must survive
  // the fusion untouched even though it shares input b with op2.
  SetOp(&prog, "op3", std::vector<std::string>({"b"}),
        std::vector<std::string>({"g"}));
  SetOp(&prog, "concat", std::vector<std::string>({"d", "f"}),
        std::vector<std::string>({"h"}));

  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
  int num_nodes_before = 0;
  int num_nodes_after = 0;
  graph = GetNumNodesOfBeforeAfter(std::move(graph), &num_nodes_before,
                                   &num_nodes_after);
  // Removes 7 nodes (op1, op2, c, d, e, f, concat_op) and adds 1: net -6.
  EXPECT_EQ(num_nodes_after, num_nodes_before - 6);
  EXPECT_EQ(CountOpType(graph.get()), 1);
}
158+
159+
// Build a program where |num_inputs_of_concat| SUM sequence_pool ops all
// feed a single concat op, each seqpool declaring one used ("out") and
// one unused ("out_unused") output, matching the pattern the pass fuses.
ProgramDesc BuildProgramDesc(int num_inputs_of_concat) {
  ProgramDesc prog;
  auto new_var = [&](const std::string& name) {
    auto* var = prog.MutableBlock(0)->Var(name);
    var->SetType(proto::VarType::LOD_TENSOR);
  };
  std::vector<std::string> concat_inputs;
  for (int i = 0; i < num_inputs_of_concat; ++i) {
    // BUG FIX: the original `"seqpool_op_" + i` did pointer arithmetic on
    // the string literal (offsetting i chars into "seqpool_op_", UB once
    // i exceeds the literal's length) instead of appending the index.
    std::string prefix = "seqpool_op_" + std::to_string(i);
    new_var(prefix + "in");
    new_var(prefix + "out");
    new_var(prefix + "out_unused");
    SetOp(&prog, "sequence_pool", std::vector<std::string>({prefix + "in"}),
          std::vector<std::string>({prefix + "out", prefix + "out_unused"}));
    concat_inputs.push_back(prefix + "out");
  }
  SetOp(&prog, "concat", concat_inputs,
        std::vector<std::string>({"concat_out"}));
  return prog;
}
179+
180+
// test more inputs of concat
181+
// Exercise the pass with several concat fan-ins, including the single-input
// edge case.
TEST(SeqPoolConcatFusePass, more_inputs) {
  for (int num : {1, 2, 10}) {
    ProgramDesc prog = BuildProgramDesc(num);
    std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
    int num_nodes_before = 0;
    int num_nodes_after = 0;
    graph = GetNumNodesOfBeforeAfter(std::move(graph), &num_nodes_before,
                                     &num_nodes_after);
    // Each seqpool contributes 3 removable nodes (op, out, out_unused);
    // the concat op is removed and one fused op is added: net -3*num.
    EXPECT_EQ(num_nodes_after, num_nodes_before - num * 3);
    EXPECT_EQ(CountOpType(graph.get()), 1);
  }
}
193+
194+
} // namespace ir
195+
} // namespace framework
196+
} // namespace paddle
197+
198+
USE_PASS(seqpool_concat_fuse_pass);

paddle/fluid/inference/api/helper.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -204,11 +204,14 @@ static std::string DescribeTensor(const PaddleTensor &tensor) {
204204
os << to_string(l) << "; ";
205205
}
206206
os << "\n";
207-
os << " - data: ";
207+
os << " - memory length: " << tensor.data.length();
208+
os << "\n";
208209

210+
os << " - data: ";
209211
int dim = VecReduceToInt(tensor.shape);
212+
float *pdata = static_cast<float *>(tensor.data.data());
210213
for (int i = 0; i < dim; i++) {
211-
os << static_cast<float *>(tensor.data.data())[i] << " ";
214+
os << pdata[i] << " ";
212215
}
213216
os << '\n';
214217
return os.str();
@@ -224,10 +227,12 @@ static std::string DescribeZeroCopyTensor(const ZeroCopyTensor &tensor) {
224227
os << to_string(l) << "; ";
225228
}
226229
os << "\n";
227-
os << " - data: ";
228230
PaddlePlace place;
229231
int size;
230232
const auto *data = tensor.data<float>(&place, &size);
233+
os << " - numel: " << size;
234+
os << "\n";
235+
os << " - data: ";
231236
for (int i = 0; i < size; i++) {
232237
os << data[i] << " ";
233238
}

paddle/fluid/inference/api/paddle_api.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,8 @@ class ZeroCopyTensor {
123123
*/
124124
template <typename T>
125125
T* mutable_data(PaddlePlace place);
126-
/** Get the memory directly, will return the place and memory size by pointer.
126+
/** Get the memory directly, will return the place and element size by
127+
* pointer.
127128
* This is for reading the output tensor.
128129
*/
129130
template <typename T>

paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -351,10 +351,10 @@ TEST(Analyzer_rnn1, ZeroCopy) {
351351
ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs));
352352
LOG(INFO) << "native output " << DescribeTensor(native_outputs.front());
353353

354-
int output_size{0};
354+
int output_size{0}; // this is the number of elements not memory size
355355
auto *zero_copy_data = output_tensor->data<float>(&place, &output_size);
356356
auto *native_data = static_cast<float *>(native_outputs.front().data.data());
357-
for (size_t i = 0; i < output_size / sizeof(float); i++) {
357+
for (int i = 0; i < output_size; i++) {
358358
EXPECT_NEAR(zero_copy_data[i], native_data[i], 1e-3);
359359
}
360360
}

0 commit comments

Comments
 (0)