
Commit 9f1927b

Merge pull request #16814 from NHZlX/cherry_pick_anakin_cpu_1.4
Cherry pick from feature/anakin-engine2
2 parents: 0f18fbf + 4b9fa42


85 files changed: +2238 −874 lines

cmake/anakin_subgraph.cmake

Lines changed: 2 additions & 1 deletion

@@ -25,8 +25,9 @@ endif()
 
 if(ANAKIN_FOUND)
   message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ")
+  include_directories(${ANAKIN_ROOT})
   include_directories(${ANAKIN_ROOT}/include)
-  include_directories(${ANAKIN_ROOT}/include/saber)
+  include_directories(${ANAKIN_ROOT}/saber)
   link_directories(${ANAKIN_ROOT})
   add_definitions(-DPADDLE_WITH_ANAKIN)
 endif()
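The net effect is that Anakin headers now resolve against the install root and its saber/ subdirectory, not only ${ANAKIN_ROOT}/include. A minimal sketch of an include this layout serves (the header name is illustrative, not taken from the commit):

    // With ${ANAKIN_ROOT} on the include path, a saber header located at
    // ${ANAKIN_ROOT}/saber/saber_types.h (illustrative name) can be used as:
    #include "saber/saber_types.h"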

paddle/fluid/framework/ir/fc_fuse_pass.cc

Lines changed: 20 additions & 0 deletions

@@ -48,17 +48,37 @@ void FCFusePass::ApplyImpl(ir::Graph* graph) const {
   GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
   GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);
 
+  auto base_op_desc = mul->Op();
   // Create an FC Node.
+  // OpDesc desc(base_op_desc, nullptr);
   OpDesc desc;
   std::string fc_x_in = subgraph.at(x)->Name();
   std::string fc_Y_in = w->Name();
   std::string fc_bias_in = fc_bias->Name();
   std::string fc_out_out = fc_out->Name();
+
   desc.SetInput("Input", std::vector<std::string>({fc_x_in}));
   desc.SetInput("W", std::vector<std::string>({fc_Y_in}));
   desc.SetInput("Bias", std::vector<std::string>({fc_bias_in}));
   desc.SetOutput("Out", std::vector<std::string>({fc_out_out}));
   desc.SetAttr("in_num_col_dims", mul->Op()->GetAttr("x_num_col_dims"));
+
+  // For anakin subgraph int8
+  // When in anakin subgraph int8 mode, the pattern like "fake_quant + mul +
+  // fake_dequant"
+  // can be detected by the quant_dequant_fuse_pass. This pass will add
+  // "input_scale",
+  // "weight_scale" which are extracted from fake_quant op and fake_dequant op
+  // to mul op,
+  // and then delete the fake_quant op and fake_dequant op in the graph. If
+  // the mul op
+  // has the scale info, we should add those to the fused fc.
+  if (base_op_desc->HasAttr("enable_int8")) {
+    desc.SetAttr("enable_int8", base_op_desc->GetAttr("enable_int8"));
+    desc.SetAttr("input_scale", base_op_desc->GetAttr("input_scale"));
+    desc.SetAttr("weight_scale", base_op_desc->GetAttr("weight_scale"));
+  }
+
   desc.SetType("fc");
   auto fc_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
   GraphSafeRemoveNodes(graph, {mul, elementwise_add, mul_out});
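In short: when quant_dequant_fuse_pass has already folded the fake_quant/fake_dequant ops into the mul op, the fused fc inherits the scale metadata. A minimal sketch of how a later consumer might check for it — the attribute names follow the diff above, while the helper function itself is hypothetical and the attribute types are left opaque, since the commit only shows that the attributes are copied:

    #include "paddle/fluid/framework/op_desc.h"

    // Sketch only: read back the scale hints that fc_fuse_pass copied over.
    void MaybeConfigureInt8(const paddle::framework::OpDesc &fc) {
      if (!fc.HasAttr("enable_int8")) return;  // FP32 path: nothing attached
      auto input_scale = fc.GetAttr("input_scale");    // from fake_quant op
      auto weight_scale = fc.GetAttr("weight_scale");  // from fake_dequant op
      // ... hand both scales to the int8 subgraph engine ...
      (void)input_scale;
      (void)weight_scale;
    }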

paddle/fluid/framework/ir/graph_pattern_detector.cc

Lines changed: 12 additions & 13 deletions

@@ -1640,32 +1640,31 @@ PDNode *patterns::FillConstantElementWiseMulFuse::operator()(
 void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input,
                                               const std::string &op_type,
                                               const std::string &weight_name,
-                                              int times) {
+                                              int times,
+                                              const std::string &quant_type) {
   const int kNumFields = 5;
   const int kQuantizedWeightOffset = 0;
   const int kQuantizedOpOffset = 1;
   const int kQuantizedOpOutOffset = 2;
   const int kDequantOpOffset = 3;
   const int kDequantOpOutOffset = 4;
   // the quant op always be one.
-  auto quant_op_in_scale =
-      pattern->NewNode(GetNodeName("quant_op_in_scale"))
-          ->assert_is_op_input("fake_quantize_range_abs_max", "InScale")
-          ->AsInput();
-  auto quant_op = pattern->NewNode(GetNodeName("quant_op"))
-                      ->assert_is_op("fake_quantize_range_abs_max");
+  auto quant_op_in_scale = pattern->NewNode(GetNodeName("quant_op_in_scale"))
+                               ->assert_is_op_input(quant_type, "InScale")
+                               ->AsInput();
+  auto quant_op =
+      pattern->NewNode(GetNodeName("quant_op"))->assert_is_op(quant_type);
 
   auto quant_op_out_scale =
       pattern->NewNode(GetNodeName("quant_op_out_scale"))
-          ->assert_is_op_output("fake_quantize_range_abs_max", "OutScale")
+          ->assert_is_op_output(quant_type, "OutScale")
           ->assert_is_op_input("fake_dequantize_max_abs", "Scale")
           ->AsIntermediate();
 
-  auto quant_op_out =
-      pattern->NewNode(GetNodeName("quant_op_out"))
-          ->assert_is_op_output("fake_quantize_range_abs_max", "Out")
-          ->assert_is_op_input(op_type)
-          ->AsIntermediate();
+  auto quant_op_out = pattern->NewNode(GetNodeName("quant_op_out"))
+                          ->assert_is_op_output(quant_type, "Out")
+                          ->assert_is_op_input(op_type)
+                          ->AsIntermediate();
 
   // there are 'times' quantized and dequant op
   std::vector<PDNode *> nodes;

paddle/fluid/framework/ir/graph_pattern_detector.h

Lines changed: 2 additions & 1 deletion

@@ -880,7 +880,8 @@ struct QuantDequantOpFuse : public PatternBase {
       : PatternBase(pattern, name_scope, "quant_dequant_fuse") {}
 
   void operator()(PDNode* quant_op_input, const std::string& op_name,
-                  const std::string& weight_name, int times = 1);
+                  const std::string& weight_name, int times,
+                  const std::string& quant_type);
 
   std::string GetNodeName(const std::string& op_type) {
     return PDNodeName(name_scope_, repr_, id_, op_type);
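Note that the default argument `times = 1` is removed along with the new trailing parameter, so every call site must now pass both arguments explicitly, exactly as the fuse pass below does:

    patterns::QuantDequantOpFuse pattern(gpd.mutable_pattern(), pattern_name);
    pattern(x, quantized_op_type, weight_name, times, quant_type);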

paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc

Lines changed: 19 additions & 9 deletions

@@ -25,7 +25,8 @@ namespace framework {
 namespace ir {
 
 void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
-                     std::string op_type) {
+                     const std::string& op_type,
+                     const std::string& quant_type) {
   const std::string pattern_name = "quant_dequant_fuse";
   // FusePassBase::Init(pattern_name, graph);
   const int kNumFields = 5;
@@ -38,14 +39,17 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
   GraphPatternDetector gpd;
   auto* x = gpd.mutable_pattern()
                ->NewNode("x")
-               ->assert_is_op_input("fake_quantize_range_abs_max", "X")
+               ->assert_is_op_input(quant_type, "X")
                ->AsInput();
 
   std::string quantized_op_type = "";
   std::string weight_name = "";
   if (op_type == "conv2d") {
     quantized_op_type = "conv2d";
     weight_name = "Filter";
+  } else if (op_type == "depthwise_conv2d") {
+    quantized_op_type = "depthwise_conv2d";
+    weight_name = "Filter";
   } else if (op_type == "conv2d_fusion") {
     quantized_op_type = "conv2d_fusion";
     weight_name = "Filter";
@@ -62,7 +66,7 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
   }
 
   patterns::QuantDequantOpFuse pattern(gpd.mutable_pattern(), pattern_name);
-  pattern(x, quantized_op_type, weight_name, times);
+  pattern(x, quantized_op_type, weight_name, times, quant_type);
 
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
@@ -103,7 +107,6 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
   std::unordered_set<const Node*> delete_nodes;
 
   for (int i = 0; i < times; i++) {
-    // max_range = (range * range) / weight_scale
     float max_range = boost::get<float>(
         nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr("max_range"));
     float weight_scale = (range * range) / max_range;
@@ -118,7 +121,8 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
     new_op_desc.SetType(quantized_op_type);
 
     if (quantized_op_type == "conv2d" ||
-        quantized_op_type == "conv2d_fusion") {
+        quantized_op_type == "conv2d_fusion" ||
+        quantized_op_type == "depthwise_conv2d") {
       new_op_desc.SetInput("Input", {new_input});
       new_op_desc.SetOutput("Output", {new_output});
     } else if (quantized_op_type == "fc") {
@@ -156,11 +160,17 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
   const std::string pattern_name = "quant_dequant_fuse";
   FusePassBase::Init(pattern_name, graph);
 
-  std::unordered_set<std::string> quantized_op_types = {"conv2d", "mul"};
+  std::unordered_set<std::string> quant_types = {
+      "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"};
+
+  std::unordered_set<std::string> quantized_op_types = {"conv2d", "mul",
+                                                        "depthwise_conv2d"};
   auto* scope = param_scope();
-  for (auto& op_type : quantized_op_types) {
-    for (int i = 1; i <= 6; i++) {
-      RunQuantDequant(graph, scope, i, op_type);
+  for (auto& quant_type : quant_types) {
+    for (auto& op_type : quantized_op_types) {
+      for (int i = 6; i >= 1; i--) {
+        RunQuantDequant(graph, scope, i, op_type, quant_type);
+      }
     }
   }
 }
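The weight_scale recovery in the handler inverts the bookkeeping of fake_dequantize_max_abs: the dequant op stores max_range = (range * range) / weight_scale (the very comment the diff deletes), so the pass computes weight_scale = (range * range) / max_range. A self-contained numeric sketch, assuming the usual 8-bit convention range = 2^(8-1) - 1 = 127 (the bit length itself is not shown in this hunk):

    #include <cassert>

    int main() {
      const float range = 127.0f;       // assumed 8-bit quantization range
      const float weight_scale = 0.5f;  // illustrative value
      // fake_dequantize_max_abs stores this attribute on the dequant op:
      const float max_range = (range * range) / weight_scale;
      // RunQuantDequant inverts it to recover the weight scale:
      const float recovered = (range * range) / max_range;
      assert(recovered == weight_scale);
      return 0;
    }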

paddle/fluid/inference/anakin/convert/CMakeLists.txt

Lines changed: 7 additions & 2 deletions

@@ -1,4 +1,9 @@
-cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry)
+cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
+elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc
+batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc
+detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc affine_channel.cc
+roi_align.cc helper.cc DEPS anakin_engine framework_proto scope op_registry
+gtest)
 
 cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL)
 cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL)
@@ -14,5 +19,5 @@ cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter fla
 cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op SERIAL)
 cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op SERIAL)
 cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op SERIAL)
-#cc_test(test_anakin_im2sequence SRCS test_im2sequence_op.cc DEPS anakin_op_converter im2sequence_op im2col)
 cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS anakin_op_converter sum_op selected_rows_functor SERIAL)
+cc_test(test_anakin_affine_channel SRCS test_affine_channel_op.cc DEPS anakin_op_converter affine_channel_op SERIAL)

paddle/fluid/inference/anakin/convert/activation.cc

Lines changed: 47 additions & 14 deletions

@@ -16,41 +16,74 @@
 #include <algorithm>
 #include <map>
 
-using anakin::graph::GraphGlobalMem;
-using anakin::AK_FLOAT;
-using anakin::saber::NV;
-using anakin::saber::Shape;
-
 namespace paddle {
 namespace inference {
 namespace anakin {
 
-ActivationOpConverter::ActivationOpConverter(const std::string &op_type)
+template <typename TargetT, ::anakin::Precision PrecisionT>
+ActivationOpConverter<TargetT, PrecisionT>::ActivationOpConverter(
+    const std::string &op_type)
     : op_type_(op_type) {
   auto it = anakin_op_types_.find(op_type_);
   PADDLE_ENFORCE(it != anakin_op_types_.end(),
                  "activation op type is not support");
   anakin_op_type_ = it->second;
 }
 
-void ActivationOpConverter::operator()(const framework::proto::OpDesc &op,
-                                       const framework::BlockDesc &block_desc,
-                                       const framework::Scope &scope,
-                                       bool test_mode) {
+template <typename TargetT, ::anakin::Precision PrecisionT>
+void ActivationOpConverter<TargetT, PrecisionT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
   PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
 
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
   auto input_name = op_desc.Input("X").front();
   auto output_name = op_desc.Output("Out").front();
-  engine_->AddOp(op_name, "Activation", {input_name}, {output_name});
-  engine_->AddOpAttr(op_name, "type", anakin_op_type_);
+  this->engine_->AddOp(op_name, "Activation", {input_name}, {output_name});
+  this->engine_->AddOpAttr(op_name, "type", anakin_op_type_);
 }
 
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(sigmoid, SigmoidOpConverter);
-REGISTER_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter);
+#ifdef PADDLE_WITH_CUDA
+using sigmoid_nv_fp32 =
+    ::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::NV,
+                                                    ::anakin::Precision::FP32>;
+using sigmoid_nv_int8 =
+    ::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::NV,
+                                                    ::anakin::Precision::INT8>;
+using tanh_nv_fp32 =
+    ::paddle::inference::anakin::TanhOpConverter<::anakin::saber::NV,
+                                                 ::anakin::Precision::FP32>;
+using tanh_nv_int8 =
+    ::paddle::inference::anakin::TanhOpConverter<::anakin::saber::NV,
+                                                 ::anakin::Precision::INT8>;
+
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_nv_fp32);
+REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_nv_int8);
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(tanh, tanh_nv_fp32);
+REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(tanh, tanh_nv_int8);
+#endif
+
+using sigmoid_cpu_fp32 =
+    ::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::X86,
+                                                    ::anakin::Precision::FP32>;
+using sigmoid_cpu_int8 =
+    ::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::X86,
+                                                    ::anakin::Precision::INT8>;
+using tanh_cpu_fp32 =
+    ::paddle::inference::anakin::TanhOpConverter<::anakin::saber::X86,
+                                                 ::anakin::Precision::FP32>;
+using tanh_cpu_int8 =
+    ::paddle::inference::anakin::TanhOpConverter<::anakin::saber::X86,
+                                                 ::anakin::Precision::INT8>;
+
+REGISTER_CPU_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_cpu_fp32);
+REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_cpu_int8);
+
+REGISTER_CPU_ANAKIN_OP_CONVERTER(tanh, tanh_cpu_fp32);
+REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(tanh, tanh_cpu_int8);
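One subtlety in this diff: `engine_->` becomes `this->engine_->` not for style but out of necessity. The converter is now a class template whose base, AnakinOpConverter<TargetT, PrecisionT>, is a dependent type, and members inherited from a dependent base are not found by unqualified name lookup. A minimal standalone illustration (names here are illustrative, not from the commit):

    // Why the diff needs this->engine_: members of a dependent base class
    // are invisible to unqualified lookup inside a class template.
    template <typename T>
    struct Base {
      int engine_ = 0;
    };

    template <typename T>
    struct Derived : Base<T> {
      int Get() {
        // return engine_;    // error: 'engine_' was not declared in this scope
        return this->engine_;  // OK: lookup is deferred to instantiation
      }
    };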

paddle/fluid/inference/anakin/convert/activation.h

Lines changed: 9 additions & 5 deletions

@@ -22,7 +22,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-class ActivationOpConverter : public AnakinOpConverter {
+template <typename TargetT, ::anakin::Precision PrecisionT>
+class ActivationOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
  public:
  explicit ActivationOpConverter(const std::string &op_type);
 
@@ -39,14 +40,17 @@ class ActivationOpConverter : public AnakinOpConverter {
       {"sigmoid", "Sigmoid"}};
 };
 
-class TanhOpConverter : public ActivationOpConverter {
+template <typename TargetT, ::anakin::Precision PrecisionT>
+class TanhOpConverter : public ActivationOpConverter<TargetT, PrecisionT> {
  public:
-  TanhOpConverter() : ActivationOpConverter("tanh") {}
+  TanhOpConverter() : ActivationOpConverter<TargetT, PrecisionT>("tanh") {}
 };
 
-class SigmoidOpConverter : public ActivationOpConverter {
+template <typename TargetT, ::anakin::Precision PrecisionT>
+class SigmoidOpConverter : public ActivationOpConverter<TargetT, PrecisionT> {
  public:
-  SigmoidOpConverter() : ActivationOpConverter("sigmoid") {}
+  SigmoidOpConverter()
+      : ActivationOpConverter<TargetT, PrecisionT>("sigmoid") {}
 };
 }  // namespace anakin
 }  // namespace inference
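The upshot of templating the hierarchy is that a new activation needs only a thin subclass. A hypothetical example (an elu converter, which is not part of this commit, and which assumes a matching entry were added to anakin_op_types_ and that Anakin has a corresponding op type):

    // Hypothetical, for illustration only -- mirrors TanhOpConverter above.
    template <typename TargetT, ::anakin::Precision PrecisionT>
    class EluOpConverter : public ActivationOpConverter<TargetT, PrecisionT> {
     public:
      EluOpConverter() : ActivationOpConverter<TargetT, PrecisionT>("elu") {}
    };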
paddle/fluid/inference/anakin/convert/affine_channel.cc

Lines changed: 73 additions & 0 deletions

@@ -0,0 +1,73 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/anakin/convert/affine_channel.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/inference/anakin/convert/helper.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+template <typename TargetT, ::anakin::Precision PrecisionT>
+void AffineChannelOpConverter<TargetT, PrecisionT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  auto input_name = op_desc.Input("X").front();
+  auto output_name = op_desc.Output("Out").front();
+  this->engine_->AddOp(op_name, "AffineChannel", {input_name}, {output_name});
+
+  // Copy the Scale to CPUPlace and get the pointer.
+  auto *scale_v = scope.FindVar(op_desc.Input("Scale").front());
+  PADDLE_ENFORCE_NOT_NULL(scale_v);
+  auto weight1 = pblock_from_var<TargetT, PrecisionT>(*scale_v, this->engine_);
+  this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
+
+  // Copy the Bias to CPUPlace and get the pointer.
+  auto *bias_v = scope.FindVar(op_desc.Input("Bias").front());
+  PADDLE_ENFORCE_NOT_NULL(bias_v);
+  auto weight2 = pblock_from_var<TargetT, PrecisionT>(*bias_v, this->engine_);
+  this->engine_->AddOpAttr(op_name, "weight_2", *weight2);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+#ifdef PADDLE_WITH_CUDA
+using affine_channel_nv_fp32 =
+    ::paddle::inference::anakin::AffineChannelOpConverter<
+        ::anakin::saber::NV, ::anakin::Precision::FP32>;
+using affine_channel_nv_int8 =
+    ::paddle::inference::anakin::AffineChannelOpConverter<
+        ::anakin::saber::NV, ::anakin::Precision::INT8>;
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_nv_fp32);
+REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_nv_int8);
+#endif
+
+using affine_channel_cpu_fp32 =
+    ::paddle::inference::anakin::AffineChannelOpConverter<
+        ::anakin::saber::X86, ::anakin::Precision::FP32>;
+using affine_channel_cpu_int8 =
+    ::paddle::inference::anakin::AffineChannelOpConverter<
+        ::anakin::saber::X86, ::anakin::Precision::INT8>;
+REGISTER_CPU_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_cpu_fp32);
+REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_cpu_int8);
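For context, affine_channel applies a per-channel affine transform: for NCHW input, Out[n][c][h][w] = Scale[c] * X[n][c][h][w] + Bias[c], which is why the converter only needs to attach the Scale and Bias blocks as weight_1 and weight_2. A reference sketch of the op's semantics (function name and flat-vector layout are illustrative, not from the commit):

    #include <vector>

    // Reference semantics of affine_channel on NCHW data:
    // out = scale[c] * x + bias[c], applied elementwise per channel.
    void AffineChannelRef(const std::vector<float> &x,
                          const std::vector<float> &scale,
                          const std::vector<float> &bias, int n, int c, int h,
                          int w, std::vector<float> *out) {
      for (int ni = 0; ni < n; ++ni) {
        for (int ci = 0; ci < c; ++ci) {
          for (int i = 0; i < h * w; ++i) {
            const int idx = (ni * c + ci) * h * w + i;
            (*out)[idx] = scale[ci] * x[idx] + bias[ci];
          }
        }
      }
    }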
