
Commit e63013a

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/add_reduce_op_handle

2 parents 1eeb2e0 + 61f4baa

8 files changed (+304, −20 lines)

paddle/fluid/framework/details/reduce_op_handle.cc

Lines changed: 0 additions & 1 deletion
@@ -91,7 +91,6 @@ void ReduceOpHandle::RunImpl() {
   if (paddle::platform::is_cpu_place(pre_place)) {
     ReduceLoDTensor func(lod_tensors, trg);
     VisitDataType(ToDataType(lod_tensors[0].type()), func);
-
   } else if (paddle::platform::is_gpu_place(pre_place)) {
 #ifdef PADDLE_WITH_CUDA
     auto out_p = out_var_handles[0]->place_;

paddle/fluid/operators/conv_mkldnn_op.cc

Lines changed: 18 additions & 14 deletions
@@ -72,10 +72,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto dst_md = platform::MKLDNNMemDesc(
         dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);

-    auto src_memory = mkldnn::memory({src_md, mkldnn_engine},
-                                     reinterpret_cast<void*>(input_data));
-    auto weights_memory = mkldnn::memory({weights_md, mkldnn_engine},
-                                         reinterpret_cast<void*>(filter_data));
+    auto src_memory =
+        mkldnn::memory({src_md, mkldnn_engine},
+                       reinterpret_cast<void*>(const_cast<T*>(input_data)));
+    auto weights_memory =
+        mkldnn::memory({weights_md, mkldnn_engine},
+                       reinterpret_cast<void*>(const_cast<T*>(filter_data)));
     auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data);

     std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
@@ -180,9 +182,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);

     // create memory
-    auto diff_dst_memory =
-        mkldnn::memory({diff_weights_md, mkldnn_engine},
-                       reinterpret_cast<void*>(output_grad_data));
+    auto diff_dst_memory = mkldnn::memory(
+        {diff_weights_md, mkldnn_engine},
+        reinterpret_cast<void*>(const_cast<T*>(output_grad_data)));
     // Retrieve conv_pd from device context
     auto conv_pd =
         std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
@@ -202,8 +204,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     auto diff_weights_memory =
         mkldnn::memory({diff_weights_md, mkldnn_engine},
                        reinterpret_cast<void*>(filter_grad_data));
-    auto src_memory = mkldnn::memory({src_md, mkldnn_engine},
-                                     reinterpret_cast<void*>(input_data));
+    auto src_memory =
+        mkldnn::memory({src_md, mkldnn_engine},
+                       reinterpret_cast<void*>(const_cast<T*>(input_data)));

     // create backward conv primitive for weights
     auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights(
@@ -222,11 +225,12 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         strides, paddings, *conv_pd, mkldnn_engine);

     // create memory
-    auto diff_src_memory =
-        mkldnn::memory({diff_src_md, mkldnn_engine},
-                       reinterpret_cast<void*>(input_grad_data));
-    auto weights_memory = mkldnn::memory(
-        {weights_md, mkldnn_engine}, reinterpret_cast<void*>(filter_data));
+    auto diff_src_memory = mkldnn::memory(
+        {diff_src_md, mkldnn_engine},
+        reinterpret_cast<void*>(const_cast<T*>(input_grad_data)));
+    auto weights_memory =
+        mkldnn::memory({weights_md, mkldnn_engine},
+                       reinterpret_cast<void*>(const_cast<T*>(filter_data)));

     // create backward conv primitive for data
     auto conv_bwd_data_prim = mkldnn::convolution_backward_data(

paddle/fluid/operators/softmax_mkldnn_op.cc

Lines changed: 9 additions & 0 deletions
@@ -73,6 +73,15 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
                                          softmax_dst_memory);
     std::vector<primitive> pipeline{softmax};
     stream(stream::kind::eager).submit(pipeline).wait();
+
+    const bool is_test = ctx.Attr<bool>("is_test");
+    if (!is_test) {
+      T threshold = exp(-64);
+      for (size_t i = 0; i < dst_tz[0] * dst_tz[1]; ++i) {
+        output_data[i] =
+            output_data[i] < threshold ? threshold : output_data[i];
+      }
+    }
   }
 };

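Why clamp at exp(-64)? During training, the softmax output usually feeds a cross-entropy loss that takes log of each probability, and a float32 softmax can underflow to exactly zero. A minimal NumPy sketch of the failure mode and of the floor the kernel applies above (illustrative values, not Paddle code):

import numpy as np

logits = np.array([0.0, 200.0], dtype=np.float32)
probs = np.exp(logits - logits.max())  # exp(-200) underflows to 0.0 in float32
probs /= probs.sum()

threshold = np.float32(np.exp(-64))    # the same floor the kernel uses
clamped = np.maximum(probs, threshold)

print(np.log(probs[0]))    # -inf: would poison a cross-entropy gradient
print(np.log(clamped[0]))  # ~ -64.0: finite, training can continue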

paddle/fluid/operators/softmax_op.cc

Lines changed: 3 additions & 0 deletions
@@ -97,6 +97,9 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<bool>("use_mkldnn",
                   "(bool, default false) Only used in mkldnn kernel")
         .SetDefault(false);
+    AddAttr<bool>("is_test",
+                  "Disable epsilon adding to softmax results. Used by MKLDNN.")
+        .SetDefault(false);
     AddComment(R"DOC(
 Softmax Operator.


python/paddle/fluid/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -37,6 +37,7 @@
 from distribute_transpiler_simple import SimpleDistributeTranspiler
 from concurrency import (Go, make_channel, channel_send, channel_recv,
                          channel_close, Select)
+from inference_transpiler import InferenceTranspiler
 import clip
 from memory_optimization_transpiler import memory_optimize, release_memory
 import profiler
@@ -66,6 +67,7 @@
     'clip',
     'SimpleDistributeTranspiler',
     'DistributeTranspiler',
+    'InferenceTranspiler',
     'memory_optimize',
     'release_memory',
     'profiler',
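A hedged usage sketch for the newly exported class (layer arguments and variable names are illustrative, not part of this commit): build a small conv + batch_norm program, initialize its parameters, then let the transpiler fold the batch_norm into the convolution.

import paddle.fluid as fluid

main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
    conv = fluid.layers.conv2d(input=img, num_filters=8, filter_size=3)
    out = fluid.layers.batch_norm(input=conv, is_test=True)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)  # materialize the parameters in the global scope

t = fluid.InferenceTranspiler()
t.transpile(main, place)  # ends with conv2d + elementwise_add, no batch_norm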
python/paddle/fluid/inference_transpiler.py (new file)

Lines changed: 240 additions & 0 deletions

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from framework import Program
from executor import global_scope
from . import core


class InferenceTranspiler:
    def transpile(self, program, place, scope=None):
        '''
        Transpile the program. Currently only supports fusing batch normalization.

        :param program: program to transpile
        :type program: Program
        :param place: inference place
        :type place: Place
        :param scope: inference scope
        :type scope: Scope or None
        '''
        if not isinstance(program, Program):
            raise TypeError("program should be as Program type")
        if not isinstance(place, core.CPUPlace) and not isinstance(
                place, core.CUDAPlace):
            raise TypeError("place should be as CPUPlace/CUDAPlace type")
        if scope is None:
            scope = global_scope()
        if not isinstance(scope, core.Scope):
            raise TypeError("scope should be as Scope type or None")
        self.fuse_batch_norm(program, place, scope)

    def fuse_batch_norm(self, program, place, scope):
        '''
        Transpile the program by fusing batch normalization.

        A batch normalization that follows a convolution or fully connected
        layer can be folded into that layer. Doing so speeds up the forward
        pass, which matters especially in mobile or embedded environments.

        For input X:
        - Conv process:        X = input * W + bias
        - Batch norm process:  X' = (X - mean) / std
        - Scale process:       Y = a * X' + b

        After fusing into one operation:

            Y = (input * W + bias - mean) / std * a + b
              = input * a * W / std + ((bias - mean) / std * a + b)

        The operator transformation is:
        - before:
          - conv->batch_norm->any_other_op (bias == 0)
          - conv->elementwise_add->batch_norm->any_other_op (bias != 0)
        - after:
          - conv->elementwise_add->any_other_op

        The transpile stages are:
        1. insert an elementwise_add op when bias == 0.
        2. fuse the batch_norm's parameters into the conv and elementwise_add operators.
        3. remove batch_norm ops which are not used by any other op.
        4. adjust the input of any_other_op to be the output of the elementwise_add operator.
        5. remove unused variables.

        :param program: program to transpile
        :type program: Program
        :param place: inference place
        :type place: Place
        :param scope: inference scope
        :type scope: Scope
        '''
        self.scope = scope
        self.place = place
        self.block = program.block(0)
        self.input_map = {}  # store the input names that should be adjusted

        i = 0
        while i < len(self.block.ops):
            current_op = self.block.ops[i]
            # TODO(luotao1): consider only conv2d now. fc will be dealt with later.
            if current_op.type in ['conv2d']:
                # TODO(luotao1): consider single-chain networks only for now.
                # For a branching network, we couldn't use block.ops[i + 1] as
                # the judgment condition.
                next_op = self.block.ops[i + 1]
                # conv2d without bias
                if (next_op.type == 'batch_norm'):
                    # insert bias op
                    bias_op = self._insert_bias_op(i + 1, current_op, next_op)
                    # fuse batch_norm
                    self._fuse_param(current_op, next_op, bias_op, 0)
                    # remove batch_norm_op
                    self.block.remove_op(i + 2)
                    i = i + 1
                # conv2d with bias, the next_op.type is elementwise_add
                elif (next_op.type == 'elementwise_add'):
                    next_next_op = self.block.ops[i + 2]
                    if (next_next_op.type == 'batch_norm'):
                        # fuse batch_norm
                        self._fuse_param(current_op, next_next_op, next_op, 1)
                        # remove batch_norm_op
                        self.block.remove_op(i + 2)
                        i = i + 1
            i = i + 1

        self._adjust_input()
        self._remove_unused_var()
        # TODO(luotao): use the clone() method to flush program.desc by force,
        # since a large program.desc will not be flushed immediately.
        # A better solution will be considered later.
        program = program.clone()

    # ====================== private transpiler functions =====================
    def _insert_bias_op(self, index, current_op, bn_op):
        '''
        Construct an elementwise_add operator for adding bias
        and insert it into the program.

        :param index: insert location of bias_op
        :type index: Int
        :param current_op: current operator (conv or fc)
        :type current_op: Operator
        :param bn_op: batch norm operator
        :type bn_op: Operator
        :return: bias_op
        :rtype: Operator
        '''
        # The input of bias_op is current_op's output and the Bias of bn_op.
        # The output of bias_op is bn_op's output.
        x_var = self.block.var(current_op.output("Output")[0])
        y_var = self.block.var(bn_op.input("Bias")[0])
        out_var = self.block.var(bn_op.output("Y")[0])

        bias_op = self.block.insert_op(
            index,
            type="elementwise_add",
            inputs={"X": x_var,
                    "Y": y_var},
            outputs={"Out": out_var},
            attrs={"axis": 1})  # dim_start=1
        return bias_op

    def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
        '''
        fuse the batch_norm_op's parameters into current_op (conv or fc)

        :param current_op: current operator (conv or fc)
        :type current_op: Operator
        :param bn_op: batch norm operator
        :type bn_op: Operator
        :param bias_op: elementwise_add operator for adding bias
        :type bias_op: Operator
        :param with_bias: If the current operator has bias, with_bias = 1; otherwise 0.
        :type with_bias: Int
        '''

        def _update_param(op, old_param_name, new_param):
            # To keep the original variables unchanged,
            # create new variables in scope to store the new parameters.
            old_param_name = old_param_name[0]
            old_var = self.block.vars[old_param_name]
            new_param_name = old_param_name + '_fuse_bn'
            new_var = self.block.create_parameter(
                name=new_param_name.encode('ascii'),
                type=old_var.type,
                dtype=old_var.dtype,
                shape=old_var.shape)
            op.rename_input(old_param_name, new_param_name)
            self.scope.var(new_param_name)

            tensor = self.scope.find_var(new_param_name).get_tensor()
            tensor.set(np.array(new_param), self.place)

        def _load_param(param_name):
            return np.array(self.scope.find_var(param_name[0]).get_tensor())

        bias_bn = _load_param(bn_op.input("Bias"))  # Bias
        scale_bn = _load_param(bn_op.input("Scale"))  # Scale
        mean_bn = _load_param(bn_op.input("Mean"))  # Mean
        var_bn = _load_param(bn_op.input("Variance"))  # Variance

        # TODO(luotao1): consider only conv2d now. fc will be dealt with later.
        current_param = _load_param(current_op.input("Filter"))
        std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5)))
        tmp = np.float32(np.divide(scale_bn, std_bn))

        # add bias of batch_norm_op to conv2d
        if with_bias:
            bias = _load_param(bias_op.input("Y"))
        else:
            bias = np.zeros(bias_bn.shape)
        bias = np.float32(
            np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn))

        # re-compute weight of conv2d
        tmp = tmp.reshape(tmp.shape[0], -1)
        dst_param = current_param.reshape((tmp.shape[0], -1))
        dst_param = np.float32(np.multiply(dst_param, tmp))
        dst_param = dst_param.reshape(current_param.shape)

        # update parameters
        _update_param(current_op, current_op.input("Filter"), dst_param)
        _update_param(bias_op, bias_op.input("Y"), bias)

        # collect the renamed input
        self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0]

    def _adjust_input(self):
        for i in range(len(self.block.ops)):
            current_op = self.block.ops[i]
            for input_arg in current_op.input_arg_names:
                if input_arg in self.input_map:
                    current_op.rename_input(input_arg,
                                            self.input_map[input_arg])

    def _remove_unused_var(self):
        '''
        remove unused variables in the program
        '''
        args = []
        for i in range(len(self.block.ops)):
            current_op = self.block.ops[i]
            args += current_op.input_arg_names
            args += current_op.output_arg_names
        args = list(set(args))  # deduplicate the input and output arguments

        for var in self.block.vars.keys():
            if var not in args:
                self.block.remove_var(var)
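A standalone NumPy check (not part of the commit) that the fusion algebra in fuse_batch_norm's docstring is an identity, using the same 1e-5 epsilon the transpiler adds to the variance; shapes and values are illustrative:

import numpy as np

rng = np.random.RandomState(0)
x, W, b = rng.randn(4), rng.randn(4), rng.randn()
a, b_bn, mean, var = rng.randn(), rng.randn(), rng.randn(), abs(rng.randn())
std = np.sqrt(var + 1e-5)

# unfused: conv -> batch_norm normalize -> scale/shift
y_ref = (x * W + b - mean) / std * a + b_bn

# fused: one multiply with a folded weight, one add with a folded bias
W_fused = a * W / std
b_fused = (b - mean) / std * a + b_bn
y_fused = x * W_fused + b_fused

assert np.allclose(y_ref, y_fused)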

python/paddle/fluid/layers/nn.py

Lines changed: 7 additions & 2 deletions
@@ -88,6 +88,7 @@ def fc(input,
        bias_attr=None,
        use_mkldnn=False,
        act=None,
+       is_test=False,
        name=None):
     """
     **Fully Connected Layer**
@@ -134,6 +135,7 @@ def fc(input,
         bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
             of this layer. If it is set to None, no bias will be added to the output units.
         act (str, default None): Activation to be applied to the output of this layer.
+        is_test(bool): A flag indicating whether execution is in test phase.
         use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn
             library is installed. Default: False
         name (str, default None): The name of this layer.
@@ -177,8 +179,11 @@ def fc(input,
             inputs={"Input": input,
                     "W": w},
             outputs={"Out": tmp},
-            attrs={"use_mkldnn": use_mkldnn,
-                   "bias_attr": bias_attr})
+            attrs={
+                "use_mkldnn": use_mkldnn,
+                "is_test": is_test,
+                "bias_attr": bias_attr
+            })
         return helper.append_activation(tmp)
     else:
         for input_var, param_attr in helper.iter_inputs_and_params():
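A brief, hedged sketch of the extended fc() call (names and shapes are illustrative): is_test is simply forwarded as an operator attribute alongside use_mkldnn to mark inference-phase execution.

import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[32], dtype='float32')
out = fluid.layers.fc(input=x, size=64, use_mkldnn=True, is_test=True)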
