
Commit 4e07f25

jacquesqiao authored and sneaxiy committed
Merge pull request #12295 from jacquesqiao/speedup-reduce-sum-grad-op
Speedup reduce sum grad op
2 parents eec412b + 273f737, commit 4e07f25

File tree: 5 files changed, +239 −64 lines


paddle/fluid/operators/reduce_sum_op.cc

Lines changed: 10 additions & 9 deletions
@@ -23,12 +23,13 @@ REGISTER_OP_CPU_KERNEL(
     ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::SumFunctor>,
     ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
                       ops::SumFunctor>);
-REGISTER_OP_CPU_KERNEL(reduce_sum_grad,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             float, ops::SumGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             double, ops::SumGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             int, ops::SumGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             int64_t, ops::SumGradFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    reduce_sum_grad,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, float,
+                             ops::SumGradFunctor>,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, double,
+                             ops::SumGradFunctor>,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, int,
+                             ops::SumGradFunctor>,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, int64_t,
+                             ops::SumGradFunctor>);

paddle/fluid/operators/reduce_sum_op.h

Lines changed: 59 additions & 1 deletion
@@ -14,11 +14,69 @@
 
 #pragma once
 
+#include <vector>
+
 #include "paddle/fluid/operators/reduce_op.h"
 
 namespace paddle {
 namespace operators {
 
+// use for loop to speed up Eigen broadcast; about 4 times faster than broadcast
+template <typename DeviceContext, typename T, typename Functor>
+class ReduceSumGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto dims = context.Attr<std::vector<int>>("dim");
+    if (context.GetPlace().type() == typeid(platform::CPUPlace) &&
+        dims.size() == 1) {
+      auto* input0 = context.Input<Tensor>("X");
+      auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* output = context.Output<Tensor>(framework::GradVarName("X"));
+      output->mutable_data<T>(context.GetPlace());
+      const auto* input2_d = input2->data<T>();
+      auto* output_d = output->data<T>();
+
+      // handle reduce_all
+      if (input2->dims().size() == 1 && input2->dims()[0] == 1) {
+        for (int64_t i = 0; i < framework::product(input0->dims()); ++i) {
+          output_d[i] = input2_d[0];
+        }
+        return;
+      }
+
+      // handle reduce by one dimension
+      int reduce_dim_index = dims[0];
+      if (reduce_dim_index < 0) {
+        reduce_dim_index += input0->dims().size();
+      }
+
+      auto& input_dim = input0->dims();
+      int64_t before_dim = 1;
+      for (int i = 0; i < reduce_dim_index; ++i) {
+        before_dim *= input_dim[i];
+      }
+      int64_t reduce_dim = input_dim[reduce_dim_index];
+      int64_t after_dim = 1;
+      for (int i = reduce_dim_index + 1; i < input_dim.size(); ++i) {
+        after_dim *= input_dim[i];
+      }
+      for (int64_t i = 0; i < before_dim; ++i) {
+        for (int64_t j = 0; j < reduce_dim; ++j) {
+          for (int64_t k = 0; k < after_dim; ++k) {
+            output_d[i * reduce_dim * after_dim + j * after_dim + k] =
+                input2_d[i * after_dim + k];
+          }
+        }
+      }
+      return;
+    }
+
+    // default use Eigen broadcast
+    ReduceGradKernel<DeviceContext, T, Functor> kernel;
+    kernel.Compute(context);
+  }
+};
+
 struct SumFunctor {
   template <typename DeviceContext, typename X, typename Y, typename Dim>
   void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
@@ -31,7 +89,7 @@ struct SumGradFunctor {
                   typename DY, typename Dim>
   void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                   const Dim& dim, int size) {
-    dx->device(place) = dy->broadcast(dim);
+    dx->device(place) = dy->eval().broadcast(dim);
   }
 };
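The fast path added above in ReduceSumGradKernel views the input as (before_dim, reduce_dim, after_dim) and copies the incoming gradient across the reduced dimension with plain loops instead of going through Eigen's broadcast. A minimal NumPy sketch of that single-dimension case (illustrative only, not part of this commit; the helper name reduce_sum_grad_1dim is made up here):

    import numpy as np

    def reduce_sum_grad_1dim(x_shape, d_out, dim):
        # Broadcast d_out (the gradient of the reduced output) back to x_shape
        # when exactly one dimension `dim` was reduced.
        if dim < 0:
            dim += len(x_shape)
        before_dim = int(np.prod(x_shape[:dim], dtype=np.int64))
        reduce_dim = x_shape[dim]
        after_dim = int(np.prod(x_shape[dim + 1:], dtype=np.int64))
        d_out = d_out.reshape(before_dim, 1, after_dim)
        # Equivalent of the kernel's triple loop: repeat along the reduced axis.
        return np.repeat(d_out, reduce_dim, axis=1).reshape(x_shape)

    x = np.random.rand(5, 6, 7)
    d_out = np.ones_like(x.sum(axis=1))  # gradient of reduce_sum over dim=1
    d_x = reduce_sum_grad_1dim(x.shape, d_out, dim=1)
    assert d_x.shape == x.shape and np.all(d_x == 1.0)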

python/paddle/fluid/layers/io.py

Lines changed: 102 additions & 32 deletions
@@ -456,52 +456,122 @@ def py_reader(capacity,
               name=None,
               use_double_buffer=True):
     """
-    Create a reader and blocking queue for data feeding in Python
+    Create a python reader for data feeding in Python
 
-    This layer returns a Reader Variable and a BlockingQueue.
-    The BlockingQueue provides `push()` method to push a `LoDTensorArray`
-    object into the queue in Python side. In C++ side, the Reader
-    Variable would invoke `pop()` method of the queue to retrieve the
-    feeding data. The process of feeding data in Python side and fetching
-    data in C++ side can run in parallel. The BlockingQueue should be closed
-    using `close()` method when unused.
+    This layer returns a Reader Variable.
+    The Reader provides :code:`decorate_paddle_reader` and
+    :code:`decorate_tensor_provider` to set a Python generator as the data
+    source in Python side. When :code:`Executor::Run()` is invoked in C++
+    side, the data from the generator would be read automatically. Unlike
+    :code:`DataFeeder.feed()`, the data reading process and
+    :code:`Executor::Run()` process can run in parallel using
+    :code:`py_reader`. The :code:`start()` method of the Reader should be
+    called when each pass begins, while the :code:`reset()` method should be
+    called when the pass ends and :code:`fluid.core.EOFException` raises.
+    Note that :code:`Program.clone()` method cannot clone :code:`py_reader`.
 
     Args:
-       use_double_buffer(bool): Whether use double buffer or not.
-       capacity(int): The maximum capacity of the BlockingQueue.
+       capacity(int): The buffer capacity maintained by :code:`py_reader`.
        shapes(list|tuple): List of tuples which declaring data shapes.
        dtypes(list|tuple): List of strs which declaring data type.
        lod_levels(list|tuple): List of ints which declaring data lod_level.
        name(basestring): The prefix Python queue name and Reader name. None will
           be generated automatically.
+       use_double_buffer(bool): Whether use double buffer or not.
 
     Returns:
-       tuple(Variable, BlockingQueue):
-       A Reader Variable from which we can get feeding data.
-
-       A BlockingQueue object for data feeding.
+       Variable: A Reader from which we can get feeding data.
 
     Examples:
 
-        .. code-block:: python
+        1. The basic usage of :code:`py_reader` is as follows:
 
-            reader, queue = fluid.layers.py_reader(
-                capacity=10,
-                shapes=[[-1,3,224,224], [-1,1]],
-                dtypes=['float32', 'int64'])
-            # Via the reader, we can use 'read_file' layer to get data:
-            image, label = fluid.layers.read_file(reader)
-
-            # Via the blocking queue, we can feed data using threads
-            def feed_data(queue, feed_images, feed_labels):
-                for feed_image, feed_label in zip(feed_images, feed_labels):
-                    data = core.LoDTensorArray()
-                    data.append(feed_image)
-                    data.append(feed_label)
-                    queue.push(data)
-
-            thread = threading.Thread(target=feed_data, args=(queue, feed_images, feed_labels))
-            thread.start()
+        >>> import paddle.v2
+        >>> import paddle.fluid as fluid
+        >>> import paddle.dataset.mnist as mnist
+        >>>
+        >>> reader = fluid.layers.py_reader(capacity=64,
+        >>>                                 shapes=[(-1,3,224,224), (-1,1)],
+        >>>                                 dtypes=['float32', 'int64'])
+        >>> reader.decorate_paddle_reader(
+        >>>     paddle.v2.reader.shuffle(paddle.batch(mnist.train())
+        >>>
+        >>> img, label = fluid.layers.read_file(reader)
+        >>> loss = network(img, label) # some network definition
+        >>>
+        >>> fluid.Executor(fluid.CUDAPlace(0)).run(fluid.default_startup_program())
+        >>>
+        >>> exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
+        >>> for epoch_id in range(10):
+        >>>     reader.start()
+        >>>     try:
+        >>>         while True:
+        >>>             exe.run(fetch_list=[loss.name])
+        >>>     except fluid.core.EOFException:
+        >>>         reader.reset()
+
+        2. When training and testing are both performed, two different
+        :code:`py_reader` should be created with different names, e.g.:
+
+        >>> import paddle.v2
+        >>> import paddle.fluid as fluid
+        >>> import paddle.dataset.mnist as mnist
+        >>>
+        >>> def network(reader):
+        >>>     img, label = fluid.layers.read_file(reader)
+        >>>     # Here, we omitted the network definition
+        >>>     return loss
+        >>>
+        >>> train_reader = fluid.layers.py_reader(capacity=64,
+        >>>                                       shapes=[(-1,3,224,224), (-1,1)],
+        >>>                                       dtypes=['float32', 'int64'],
+        >>>                                       name='train_reader')
+        >>> train_reader.decorate_paddle_reader(
+        >>>     paddle.v2.reader.shuffle(paddle.batch(mnist.train())
+        >>>
+        >>> test_reader = fluid.layers.py_reader(capacity=32,
+        >>>                                      shapes=[(-1,3,224,224), (-1,1)],
+        >>>                                      dtypes=['float32', 'int64'],
+        >>>                                      name='test_reader')
+        >>> test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512))
+        >>>
+        >>> # Create train_main_prog and train_startup_prog
+        >>> train_main_prog = fluid.Program()
+        >>> train_startup_prog = fluid.Program()
+        >>> with fluid.program_guard(train_main_prog, train_startup_prog):
+        >>>     # Use fluid.unique_name.guard() to share parameters with test program
+        >>>     with fluid.unique_name.guard():
+        >>>         train_loss = network(train_reader) # some network definition
+        >>>         adam = fluid.optimizer.Adam(learning_rate=0.01)
+        >>>         adam.minimize(loss)
+        >>>
+        >>> # Create test_main_prog and test_startup_prog
+        >>> test_main_prog = fluid.Program()
+        >>> test_startup_prog = fluid.Program()
+        >>> with fluid.program_guard(test_main_prog, test_startup_prog):
+        >>>     # Use fluid.unique_name.guard() to share parameters with train program
+        >>>     with fluid.unique_name.guard():
+        >>>         test_loss = network(test_reader)
+        >>>
+        >>> fluid.Executor(fluid.CUDAPlace(0)).run(train_startup_prog)
+        >>> fluid.Executor(fluid.CUDAPlace(0)).run(test_startup_prog)
+        >>>
+        >>> train_exe = fluid.ParallelExecutor(use_cuda=True,
+        >>>                loss_name=train_loss.name, main_program=train_main_prog)
+        >>> test_exe = fluid.ParallelExecutor(use_cuda=True,
+        >>>                loss_name=test_loss.name, main_program=test_main_prog)
+        >>> for epoch_id in range(10):
+        >>>     try:
+        >>>         while True:
+        >>>             train_exe.run(fetch_list=[train_loss.name])
+        >>>     except fluid.core.EOFException:
+        >>>         train_reader.reset()
+        >>>
+        >>>     try:
+        >>>         while True:
+        >>>             test_exe.run(fetch_list=[test_loss.name])
+        >>>     except fluid.core.EOFException:
+        >>>         test_reader.reset()
     """
     dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
     shape_concat = []

python/paddle/fluid/layers/nn.py

Lines changed: 2 additions & 2 deletions
@@ -2961,7 +2961,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
         # x is a Tensor variable with following elements:
         #    [[0.2, 0.3, 0.5, 0.9]
         #     [0.1, 0.2, 0.6, 0.7]]
-        # Each example is followed by the correspending output tensor.
+        # Each example is followed by the corresponding output tensor.
         fluid.layers.reduce_sum(x)  # [3.5]
         fluid.layers.reduce_sum(x, dim=0)  # [0.3, 0.5, 1.1, 1.6]
         fluid.layers.reduce_sum(x, dim=-1)  # [1.9, 1.6]
@@ -2970,7 +2970,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
         # x is a Tensor variable with shape [2, 2, 2] and elements as below:
         #      [[[1, 2], [3, 4]],
        #       [[5, 6], [7, 8]]]
-        # Each example is followed by the correspending output tensor.
+        # Each example is followed by the corresponding output tensor.
         fluid.layers.reduce_sum(x, dim=[1, 2]) # [10, 26]
         fluid.layers.reduce_sum(x, dim=[0, 1]) # [16, 20]
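For the reduce_sum examples in the docstring above, the documented outputs can be sanity-checked with plain NumPy, since the expected values in these examples follow numpy.sum semantics (an illustrative check, not part of this commit):

    import numpy as np

    x = np.array([[0.2, 0.3, 0.5, 0.9],
                  [0.1, 0.2, 0.6, 0.7]])
    print(x.sum())           # 3.5               ~ fluid.layers.reduce_sum(x)
    print(x.sum(axis=0))     # [0.3 0.5 1.1 1.6] ~ reduce_sum(x, dim=0)
    print(x.sum(axis=-1))    # [1.9 1.6]         ~ reduce_sum(x, dim=-1)

    y = np.array([[[1, 2], [3, 4]],
                  [[5, 6], [7, 8]]])
    print(y.sum(axis=(1, 2)))  # [10 26]         ~ reduce_sum(x, dim=[1, 2])
    print(y.sum(axis=(0, 1)))  # [16 20]         ~ reduce_sum(x, dim=[0, 1])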

python/paddle/fluid/tests/unittests/test_reduce_op.py

Lines changed: 66 additions & 20 deletions
@@ -89,15 +89,11 @@ def test_check_grad(self):
         self.check_grad(['X'], 'Out')
 
 
-class TestKeepDimReduce(OpTest):
+class Test1DReduce(OpTest):
     def setUp(self):
         self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': [-2], 'keep_dim': True}
-        self.outputs = {
-            'Out':
-            self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=True)
-        }
+        self.inputs = {'X': np.random.random(20).astype("float64")}
+        self.outputs = {'Out': self.inputs['X'].sum(axis=0)}
 
     def test_check_output(self):
         self.check_output()
@@ -106,32 +102,82 @@ def test_check_grad(self):
         self.check_grad(['X'], 'Out')
 
 
-class Test1DReduce(OpTest):
+class Test2DReduce0(Test1DReduce):
     def setUp(self):
         self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random(20).astype("float64")}
+        self.attrs = {'dim': [0]}
+        self.inputs = {'X': np.random.random((20, 10)).astype("float64")}
         self.outputs = {'Out': self.inputs['X'].sum(axis=0)}
 
-    def test_check_output(self):
-        self.check_output()
 
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+class Test2DReduce1(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [1]}
+        self.inputs = {'X': np.random.random((20, 10)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
 
 
-class TestReduceAll(OpTest):
+class Test3DReduce0(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [1]}
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
+
+
+class Test3DReduce1(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [2]}
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
+
+
+class Test3DReduce2(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [-2]}
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
+
+
+class Test3DReduce3(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [1, 2]}
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
+
+
+class TestKeepDimReduce(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.attrs = {'dim': [1], 'keep_dim': True}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
+                                        keepdims=self.attrs['keep_dim'])
+        }
+
+
+class TestReduceAll(Test1DReduce):
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
         self.attrs = {'reduce_all': True}
         self.outputs = {'Out': self.inputs['X'].sum()}
 
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
 
 ## reduction in multi dims
 class TestReduceMeanOpMultiAxises(OpTest):
