
Commit 9f1927b

Merge pull request #16814 from NHZlX/cherry_pick_anakin_cpu_1.4
Cherry pick from feature/anakin-engine2
2 parents: 0f18fbf + 4b9fa42


85 files changed: +2238 −874 lines

cmake/anakin_subgraph.cmake

Lines changed: 2 additions & 1 deletion

@@ -25,8 +25,9 @@ endif()
 
 if(ANAKIN_FOUND)
   message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ")
+  include_directories(${ANAKIN_ROOT})
   include_directories(${ANAKIN_ROOT}/include)
-  include_directories(${ANAKIN_ROOT}/include/saber)
+  include_directories(${ANAKIN_ROOT}/saber)
   link_directories(${ANAKIN_ROOT})
   add_definitions(-DPADDLE_WITH_ANAKIN)
 endif()
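The net effect is that Anakin headers now resolve against the install root and its saber/ subdirectory, not only ${ANAKIN_ROOT}/include. A minimal sketch of an include this layout serves (the header name is illustrative, not taken from the commit):

    // With ${ANAKIN_ROOT} on the include path, a saber header located at
    // ${ANAKIN_ROOT}/saber/saber_types.h (illustrative name) can be used as:
    #include "saber/saber_types.h"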

paddle/fluid/framework/ir/fc_fuse_pass.cc

Lines changed: 20 additions & 0 deletions

@@ -48,17 +48,37 @@ void FCFusePass::ApplyImpl(ir::Graph* graph) const {
   GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
   GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);
 
+  auto base_op_desc = mul->Op();
   // Create an FC Node.
+  // OpDesc desc(base_op_desc, nullptr);
   OpDesc desc;
   std::string fc_x_in = subgraph.at(x)->Name();
   std::string fc_Y_in = w->Name();
   std::string fc_bias_in = fc_bias->Name();
   std::string fc_out_out = fc_out->Name();
+
   desc.SetInput("Input", std::vector<std::string>({fc_x_in}));
   desc.SetInput("W", std::vector<std::string>({fc_Y_in}));
   desc.SetInput("Bias", std::vector<std::string>({fc_bias_in}));
   desc.SetOutput("Out", std::vector<std::string>({fc_out_out}));
   desc.SetAttr("in_num_col_dims", mul->Op()->GetAttr("x_num_col_dims"));
+
+  // For anakin subgraph int8
+  // When in anakin subgraph int8 mode, the pattern like "fake_quant + mul +
+  // fake_dequant"
+  // can be detected by the quant_dequant_fuse_pass. This pass will add
+  // "input_scale",
+  // "weight_scale" which are extracted from fake_quant op and fake_dequant op
+  // to mul op,
+  // and then delete the fake_quant op and fake_dequant op in the graph. If
+  // the mul op
+  // has the scale info, we should add those to the fused fc.
+  if (base_op_desc->HasAttr("enable_int8")) {
+    desc.SetAttr("enable_int8", base_op_desc->GetAttr("enable_int8"));
+    desc.SetAttr("input_scale", base_op_desc->GetAttr("input_scale"));
+    desc.SetAttr("weight_scale", base_op_desc->GetAttr("weight_scale"));
+  }
+
   desc.SetType("fc");
   auto fc_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
   GraphSafeRemoveNodes(graph, {mul, elementwise_add, mul_out});
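In short: when quant_dequant_fuse_pass has already folded the fake_quant/fake_dequant ops into the mul op, the fused fc inherits the scale metadata. A minimal sketch of how a later consumer might check for it — the attribute names follow the diff above, while the helper function itself is hypothetical and the attribute types are left opaque, since the commit only shows that the attributes are copied:

    #include "paddle/fluid/framework/op_desc.h"

    // Sketch only: read back the scale hints that fc_fuse_pass copied over.
    void MaybeConfigureInt8(const paddle::framework::OpDesc &fc) {
      if (!fc.HasAttr("enable_int8")) return;  // FP32 path: nothing attached
      auto input_scale = fc.GetAttr("input_scale");    // from fake_quant op
      auto weight_scale = fc.GetAttr("weight_scale");  // from fake_dequant op
      // ... hand both scales to the int8 subgraph engine ...
      (void)input_scale;
      (void)weight_scale;
    }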

paddle/fluid/framework/ir/graph_pattern_detector.cc

Lines changed: 12 additions & 13 deletions

@@ -1640,32 +1640,31 @@ PDNode *patterns::FillConstantElementWiseMulFuse::operator()(
 void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input,
                                               const std::string &op_type,
                                               const std::string &weight_name,
-                                              int times) {
+                                              int times,
+                                              const std::string &quant_type) {
   const int kNumFields = 5;
   const int kQuantizedWeightOffset = 0;
   const int kQuantizedOpOffset = 1;
   const int kQuantizedOpOutOffset = 2;
   const int kDequantOpOffset = 3;
   const int kDequantOpOutOffset = 4;
   // the quant op always be one.
-  auto quant_op_in_scale =
-      pattern->NewNode(GetNodeName("quant_op_in_scale"))
-          ->assert_is_op_input("fake_quantize_range_abs_max", "InScale")
-          ->AsInput();
-  auto quant_op = pattern->NewNode(GetNodeName("quant_op"))
-                      ->assert_is_op("fake_quantize_range_abs_max");
+  auto quant_op_in_scale = pattern->NewNode(GetNodeName("quant_op_in_scale"))
+                               ->assert_is_op_input(quant_type, "InScale")
+                               ->AsInput();
+  auto quant_op =
+      pattern->NewNode(GetNodeName("quant_op"))->assert_is_op(quant_type);
 
   auto quant_op_out_scale =
       pattern->NewNode(GetNodeName("quant_op_out_scale"))
-          ->assert_is_op_output("fake_quantize_range_abs_max", "OutScale")
+          ->assert_is_op_output(quant_type, "OutScale")
           ->assert_is_op_input("fake_dequantize_max_abs", "Scale")
           ->AsIntermediate();
 
-  auto quant_op_out =
-      pattern->NewNode(GetNodeName("quant_op_out"))
-          ->assert_is_op_output("fake_quantize_range_abs_max", "Out")
-          ->assert_is_op_input(op_type)
-          ->AsIntermediate();
+  auto quant_op_out = pattern->NewNode(GetNodeName("quant_op_out"))
+                          ->assert_is_op_output(quant_type, "Out")
+                          ->assert_is_op_input(op_type)
+                          ->AsIntermediate();
 
   // there are 'times' quantized and dequant op
   std::vector<PDNode *> nodes;

paddle/fluid/framework/ir/graph_pattern_detector.h

Lines changed: 2 additions & 1 deletion

@@ -880,7 +880,8 @@ struct QuantDequantOpFuse : public PatternBase {
       : PatternBase(pattern, name_scope, "quant_dequant_fuse") {}
 
   void operator()(PDNode* quant_op_input, const std::string& op_name,
-                  const std::string& weight_name, int times = 1);
+                  const std::string& weight_name, int times,
+                  const std::string& quant_type);
 
   std::string GetNodeName(const std::string& op_type) {
     return PDNodeName(name_scope_, repr_, id_, op_type);
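Note that the default argument `times = 1` is removed along with the new trailing parameter, so every call site must now pass both arguments explicitly, exactly as the fuse pass below does:

    patterns::QuantDequantOpFuse pattern(gpd.mutable_pattern(), pattern_name);
    pattern(x, quantized_op_type, weight_name, times, quant_type);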

paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc

Lines changed: 19 additions & 9 deletions

@@ -25,7 +25,8 @@ namespace framework {
 namespace ir {
 
 void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
-                     std::string op_type) {
+                     const std::string& op_type,
+                     const std::string& quant_type) {
   const std::string pattern_name = "quant_dequant_fuse";
   // FusePassBase::Init(pattern_name, graph);
   const int kNumFields = 5;
@@ -38,14 +39,17 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
   GraphPatternDetector gpd;
   auto* x = gpd.mutable_pattern()
                ->NewNode("x")
-               ->assert_is_op_input("fake_quantize_range_abs_max", "X")
+               ->assert_is_op_input(quant_type, "X")
                ->AsInput();
 
   std::string quantized_op_type = "";
   std::string weight_name = "";
   if (op_type == "conv2d") {
     quantized_op_type = "conv2d";
     weight_name = "Filter";
+  } else if (op_type == "depthwise_conv2d") {
+    quantized_op_type = "depthwise_conv2d";
+    weight_name = "Filter";
   } else if (op_type == "conv2d_fusion") {
     quantized_op_type = "conv2d_fusion";
     weight_name = "Filter";
@@ -62,7 +66,7 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
   }
 
   patterns::QuantDequantOpFuse pattern(gpd.mutable_pattern(), pattern_name);
-  pattern(x, quantized_op_type, weight_name, times);
+  pattern(x, quantized_op_type, weight_name, times, quant_type);
 
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
@@ -103,7 +107,6 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
   std::unordered_set<const Node*> delete_nodes;
 
   for (int i = 0; i < times; i++) {
-    // max_range = (range * range) / weight_scale
     float max_range = boost::get<float>(
         nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr("max_range"));
     float weight_scale = (range * range) / max_range;
@@ -118,7 +121,8 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
     new_op_desc.SetType(quantized_op_type);
 
     if (quantized_op_type == "conv2d" ||
-        quantized_op_type == "conv2d_fusion") {
+        quantized_op_type == "conv2d_fusion" ||
+        quantized_op_type == "depthwise_conv2d") {
       new_op_desc.SetInput("Input", {new_input});
       new_op_desc.SetOutput("Output", {new_output});
     } else if (quantized_op_type == "fc") {
@@ -156,11 +160,17 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
   const std::string pattern_name = "quant_dequant_fuse";
   FusePassBase::Init(pattern_name, graph);
 
-  std::unordered_set<std::string> quantized_op_types = {"conv2d", "mul"};
+  std::unordered_set<std::string> quant_types = {
+      "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"};
+
+  std::unordered_set<std::string> quantized_op_types = {"conv2d", "mul",
+                                                        "depthwise_conv2d"};
   auto* scope = param_scope();
-  for (auto& op_type : quantized_op_types) {
-    for (int i = 1; i <= 6; i++) {
-      RunQuantDequant(graph, scope, i, op_type);
+  for (auto& quant_type : quant_types) {
+    for (auto& op_type : quantized_op_types) {
+      for (int i = 6; i >= 1; i--) {
+        RunQuantDequant(graph, scope, i, op_type, quant_type);
+      }
     }
   }
 }
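The weight_scale recovery in the handler inverts the bookkeeping of fake_dequantize_max_abs: the dequant op stores max_range = (range * range) / weight_scale (the very comment the diff deletes), so the pass computes weight_scale = (range * range) / max_range. A self-contained numeric sketch, assuming the usual 8-bit convention range = 2^(8-1) - 1 = 127 (the bit length itself is not shown in this hunk):

    #include <cassert>

    int main() {
      const float range = 127.0f;       // assumed 8-bit quantization range
      const float weight_scale = 0.5f;  // illustrative value
      // fake_dequantize_max_abs stores this attribute on the dequant op:
      const float max_range = (range * range) / weight_scale;
      // RunQuantDequant inverts it to recover the weight scale:
      const float recovered = (range * range) / max_range;
      assert(recovered == weight_scale);
      return 0;
    }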

paddle/fluid/inference/anakin/convert/CMakeLists.txt

Lines changed: 7 additions & 2 deletions

@@ -1,4 +1,9 @@
-cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry)
+cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
+elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc
+batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc
+detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc affine_channel.cc
+roi_align.cc helper.cc DEPS anakin_engine framework_proto scope op_registry
+gtest)
 
 cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL)
 cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL)
@@ -14,5 +19,5 @@ cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter fla
 cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op SERIAL)
 cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op SERIAL)
 cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op SERIAL)
-#cc_test(test_anakin_im2sequence SRCS test_im2sequence_op.cc DEPS anakin_op_converter im2sequence_op im2col)
 cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS anakin_op_converter sum_op selected_rows_functor SERIAL)
+cc_test(test_anakin_affine_channel SRCS test_affine_channel_op.cc DEPS anakin_op_converter affine_channel_op SERIAL)

paddle/fluid/inference/anakin/convert/activation.cc

Lines changed: 47 additions & 14 deletions

@@ -16,41 +16,74 @@
 #include <algorithm>
 #include <map>
 
-using anakin::graph::GraphGlobalMem;
-using anakin::AK_FLOAT;
-using anakin::saber::NV;
-using anakin::saber::Shape;
-
 namespace paddle {
 namespace inference {
 namespace anakin {
 
-ActivationOpConverter::ActivationOpConverter(const std::string &op_type)
+template <typename TargetT, ::anakin::Precision PrecisionT>
+ActivationOpConverter<TargetT, PrecisionT>::ActivationOpConverter(
+    const std::string &op_type)
     : op_type_(op_type) {
   auto it = anakin_op_types_.find(op_type_);
   PADDLE_ENFORCE(it != anakin_op_types_.end(),
                  "activation op type is not support");
   anakin_op_type_ = it->second;
 }
 
-void ActivationOpConverter::operator()(const framework::proto::OpDesc &op,
-                                       const framework::BlockDesc &block_desc,
-                                       const framework::Scope &scope,
-                                       bool test_mode) {
+template <typename TargetT, ::anakin::Precision PrecisionT>
+void ActivationOpConverter<TargetT, PrecisionT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
   PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
 
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
   auto input_name = op_desc.Input("X").front();
   auto output_name = op_desc.Output("Out").front();
-  engine_->AddOp(op_name, "Activation", {input_name}, {output_name});
-  engine_->AddOpAttr(op_name, "type", anakin_op_type_);
+  this->engine_->AddOp(op_name, "Activation", {input_name}, {output_name});
+  this->engine_->AddOpAttr(op_name, "type", anakin_op_type_);
 }
 
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(sigmoid, SigmoidOpConverter);
-REGISTER_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter);
+#ifdef PADDLE_WITH_CUDA
+using sigmoid_nv_fp32 =
+    ::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::NV,
+                                                    ::anakin::Precision::FP32>;
+using sigmoid_nv_int8 =
+    ::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::NV,
+                                                    ::anakin::Precision::INT8>;
+using tanh_nv_fp32 =
+    ::paddle::inference::anakin::TanhOpConverter<::anakin::saber::NV,
+                                                 ::anakin::Precision::FP32>;
+using tanh_nv_int8 =
+    ::paddle::inference::anakin::TanhOpConverter<::anakin::saber::NV,
+                                                 ::anakin::Precision::INT8>;
+
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_nv_fp32);
+REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_nv_int8);
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(tanh, tanh_nv_fp32);
+REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(tanh, tanh_nv_int8);
+#endif
+
+using sigmoid_cpu_fp32 =
+    ::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::X86,
+                                                    ::anakin::Precision::FP32>;
+using sigmoid_cpu_int8 =
+    ::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::X86,
+                                                    ::anakin::Precision::INT8>;
+using tanh_cpu_fp32 =
+    ::paddle::inference::anakin::TanhOpConverter<::anakin::saber::X86,
+                                                 ::anakin::Precision::FP32>;
+using tanh_cpu_int8 =
+    ::paddle::inference::anakin::TanhOpConverter<::anakin::saber::X86,
+                                                 ::anakin::Precision::INT8>;
+
+REGISTER_CPU_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_cpu_fp32);
+REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_cpu_int8);
+
+REGISTER_CPU_ANAKIN_OP_CONVERTER(tanh, tanh_cpu_fp32);
+REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(tanh, tanh_cpu_int8);
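One subtlety in this diff: `engine_->` becomes `this->engine_->` not for style but out of necessity. The converter is now a class template whose base, AnakinOpConverter<TargetT, PrecisionT>, is a dependent type, and members inherited from a dependent base are not found by unqualified name lookup. A minimal standalone illustration (names here are illustrative, not from the commit):

    // Why the diff needs this->engine_: members of a dependent base class
    // are invisible to unqualified lookup inside a class template.
    template <typename T>
    struct Base {
      int engine_ = 0;
    };

    template <typename T>
    struct Derived : Base<T> {
      int Get() {
        // return engine_;    // error: 'engine_' was not declared in this scope
        return this->engine_;  // OK: lookup is deferred to instantiation
      }
    };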

paddle/fluid/inference/anakin/convert/activation.h

Lines changed: 9 additions & 5 deletions

@@ -22,7 +22,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-class ActivationOpConverter : public AnakinOpConverter {
+template <typename TargetT, ::anakin::Precision PrecisionT>
+class ActivationOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
  public:
  explicit ActivationOpConverter(const std::string &op_type);
 
@@ -39,14 +40,17 @@ class ActivationOpConverter : public AnakinOpConverter {
       {"sigmoid", "Sigmoid"}};
 };
 
-class TanhOpConverter : public ActivationOpConverter {
+template <typename TargetT, ::anakin::Precision PrecisionT>
+class TanhOpConverter : public ActivationOpConverter<TargetT, PrecisionT> {
  public:
-  TanhOpConverter() : ActivationOpConverter("tanh") {}
+  TanhOpConverter() : ActivationOpConverter<TargetT, PrecisionT>("tanh") {}
 };
 
-class SigmoidOpConverter : public ActivationOpConverter {
+template <typename TargetT, ::anakin::Precision PrecisionT>
+class SigmoidOpConverter : public ActivationOpConverter<TargetT, PrecisionT> {
  public:
-  SigmoidOpConverter() : ActivationOpConverter("sigmoid") {}
+  SigmoidOpConverter()
+      : ActivationOpConverter<TargetT, PrecisionT>("sigmoid") {}
 };
 }  // namespace anakin
 }  // namespace inference
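The upshot of templating the hierarchy is that a new activation needs only a thin subclass. A hypothetical example (an elu converter, which is not part of this commit, and which assumes a matching entry were added to anakin_op_types_ and that Anakin has a corresponding op type):

    // Hypothetical, for illustration only -- mirrors TanhOpConverter above.
    template <typename TargetT, ::anakin::Precision PrecisionT>
    class EluOpConverter : public ActivationOpConverter<TargetT, PrecisionT> {
     public:
      EluOpConverter() : ActivationOpConverter<TargetT, PrecisionT>("elu") {}
    };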
paddle/fluid/inference/anakin/convert/affine_channel.cc

Lines changed: 73 additions & 0 deletions

@@ -0,0 +1,73 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/anakin/convert/affine_channel.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/inference/anakin/convert/helper.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+template <typename TargetT, ::anakin::Precision PrecisionT>
+void AffineChannelOpConverter<TargetT, PrecisionT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  auto input_name = op_desc.Input("X").front();
+  auto output_name = op_desc.Output("Out").front();
+  this->engine_->AddOp(op_name, "AffineChannel", {input_name}, {output_name});
+
+  // Copy the Scale to CPUPlace and get the pointer.
+  auto *scale_v = scope.FindVar(op_desc.Input("Scale").front());
+  PADDLE_ENFORCE_NOT_NULL(scale_v);
+  auto weight1 = pblock_from_var<TargetT, PrecisionT>(*scale_v, this->engine_);
+  this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
+
+  // Copy the Bias to CPUPlace and get the pointer.
+  auto *bias_v = scope.FindVar(op_desc.Input("Bias").front());
+  PADDLE_ENFORCE_NOT_NULL(bias_v);
+  auto weight2 = pblock_from_var<TargetT, PrecisionT>(*bias_v, this->engine_);
+  this->engine_->AddOpAttr(op_name, "weight_2", *weight2);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+#ifdef PADDLE_WITH_CUDA
+using affine_channel_nv_fp32 =
+    ::paddle::inference::anakin::AffineChannelOpConverter<
+        ::anakin::saber::NV, ::anakin::Precision::FP32>;
+using affine_channel_nv_int8 =
+    ::paddle::inference::anakin::AffineChannelOpConverter<
+        ::anakin::saber::NV, ::anakin::Precision::INT8>;
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_nv_fp32);
+REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_nv_int8);
+#endif
+
+using affine_channel_cpu_fp32 =
+    ::paddle::inference::anakin::AffineChannelOpConverter<
+        ::anakin::saber::X86, ::anakin::Precision::FP32>;
+using affine_channel_cpu_int8 =
+    ::paddle::inference::anakin::AffineChannelOpConverter<
+        ::anakin::saber::X86, ::anakin::Precision::INT8>;
+REGISTER_CPU_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_cpu_fp32);
+REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_cpu_int8);
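For context, affine_channel applies a per-channel affine transform: for NCHW input, Out[n][c][h][w] = Scale[c] * X[n][c][h][w] + Bias[c], which is why the converter only needs to attach the Scale and Bias blocks as weight_1 and weight_2. A reference sketch of the op's semantics (function name and flat-vector layout are illustrative, not from the commit):

    #include <vector>

    // Reference semantics of affine_channel on NCHW data:
    // out = scale[c] * x + bias[c], applied elementwise per channel.
    void AffineChannelRef(const std::vector<float> &x,
                          const std::vector<float> &scale,
                          const std::vector<float> &bias, int n, int c, int h,
                          int w, std::vector<float> *out) {
      for (int ni = 0; ni < n; ++ni) {
        for (int ci = 0; ci < c; ++ci) {
          for (int i = 0; i < h * w; ++i) {
            const int idx = (ni * c + ci) * h * w + i;
            (*out)[idx] = scale[ci] * x[idx] + bias[ci];
          }
        }
      }
    }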
