
Commit 274df85

Merge branch 'develop' of github.com:PaddlePaddle/Paddle into overlap_send_op
2 parents: eb2e68e + bbd7580

62 files changed: +1107 / -610 lines

benchmark/fluid/mnist.py

Lines changed: 10 additions & 6 deletions
@@ -159,6 +159,7 @@ def run_benchmark(model, args):
         paddle.dataset.mnist.train(), batch_size=args.batch_size)

     accuracy = fluid.metrics.Accuracy()
+    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
     iters, num_samples, start_time = 0, 0, time.time()
     for pass_id in range(args.pass_num):
         accuracy.reset()
@@ -175,17 +176,20 @@ def run_benchmark(model, args):
             y_data = np.array(map(lambda x: x[1], data)).astype("int64")
             y_data = y_data.reshape([len(y_data), 1])

-            outs = exe.run(
-                fluid.default_main_program(),
+            outs = train_exe.run(
                 feed={"pixel": img_data,
                       "label": y_data},
-                fetch_list=[avg_cost, batch_acc, batch_size_tensor]
+                fetch_list=[
+                    avg_cost.name, batch_acc.name, batch_size_tensor.name
+                ]
             ) # The accuracy is the accumulation of batches, but not the current batch.
-            accuracy.update(value=outs[1], weight=outs[2])
+            accuracy.update(
+                value=np.array(np.mean(outs[1])),
+                weight=np.mean(np.array(outs[2])))
             iters += 1
             num_samples += len(y_data)
-            loss = np.array(outs[0])
-            acc = np.array(outs[1])
+            loss = np.mean(np.array(outs[0]))
+            acc = np.mean(np.array(outs[1]))
             train_losses.append(loss)
             train_accs.append(acc)
             print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %

benchmark/fluid/resnet.py

Lines changed: 8 additions & 4 deletions
@@ -241,6 +241,7 @@ def test(exe):
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
     accuracy = fluid.average.WeightedAverage()
+    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
     if args.use_fake_data:
         data = train_reader().next()
         image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype(
@@ -264,14 +265,17 @@ def test(exe):
                 data)).astype('float32')
             label = np.array(map(lambda x: x[1], data)).astype('int64')
             label = label.reshape([-1, 1])
-            loss, acc, weight = exe.run(
-                fluid.default_main_program(),
+            loss, acc, weight = train_exe.run(
                 feed={'data': image,
                       'label': label},
-                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+                fetch_list=[
+                    avg_cost.name, batch_acc.name, batch_size_tensor.name
+                ])
             iters += 1
             num_samples += len(label)
-            accuracy.add(value=acc, weight=weight)
+            accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight))
+            loss = np.mean(np.array(loss))
+            acc = np.mean(np.array(acc))
             train_losses.append(loss)
             train_accs.append(acc)
             print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %

benchmark/fluid/vgg.py

Lines changed: 8 additions & 4 deletions
@@ -169,6 +169,7 @@ def test(exe):

     iters, num_samples, start_time = 0, 0, time.time()
     accuracy = fluid.average.WeightedAverage()
+    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
     for pass_id in range(args.pass_num):
         accuracy.reset()
         train_accs = []
@@ -184,14 +185,17 @@ def test(exe):
             y_data = np.array(map(lambda x: x[1], data)).astype("int64")
             y_data = y_data.reshape([-1, 1])

-            loss, acc, weight = exe.run(
-                fluid.default_main_program(),
+            loss, acc, weight = train_exe.run(
                 feed={"pixel": img_data,
                       "label": y_data},
-                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
-            accuracy.add(value=acc, weight=weight)
+                fetch_list=[
+                    avg_cost.name, batch_acc.name, batch_size_tensor.name
+                ])
+            accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight))
             iters += 1
             num_samples += len(y_data)
+            loss = np.mean(np.array(loss))
+            acc = np.mean(np.array(acc))
             print(
                 "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
                 (pass_id, iters, loss, acc)

cmake/external/eigen.cmake

Lines changed: 2 additions & 1 deletion
@@ -21,11 +21,12 @@ else()
     ExternalProject_Add(
         extern_eigen3
         ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
+        GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror"
        # eigen on cuda9.1 missing header of math_funtions.hpp
        # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
         GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c
         PREFIX ${EIGEN_SOURCE_DIR}
+        DOWNLOAD_NAME "eigen"
         UPDATE_COMMAND ""
         CONFIGURE_COMMAND ""
         BUILD_COMMAND ""

cmake/external/snappy.cmake

Lines changed: 0 additions & 2 deletions
@@ -47,8 +47,6 @@ ExternalProject_Add(
     -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-    BUILD_COMMAND make -j8
-    INSTALL_COMMAND make install
 )

 add_library(snappy STATIC IMPORTED GLOBAL)

cmake/external/snappystream.cmake

Lines changed: 0 additions & 2 deletions
@@ -46,8 +46,6 @@ ExternalProject_Add(
     -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
     -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-    BUILD_COMMAND make -j8
-    INSTALL_COMMAND make install
     DEPENDS snappy
 )

cmake/inference_lib.cmake

Lines changed: 20 additions & 0 deletions
@@ -70,6 +70,12 @@ copy(glog_lib
   DSTS ${dst_dir} ${dst_dir}/lib
 )

+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/boost/")
+copy(boost_lib
+  SRCS ${BOOST_INCLUDE_DIR}/boost
+  DSTS ${dst_dir}
+)
+
 if(NOT PROTOBUF_FOUND)
     set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
     copy(protobuf_lib
@@ -92,6 +98,14 @@ elseif (WITH_MKLML)
     )
 endif()

+if(WITH_MKLDNN)
+    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mkldnn")
+    copy(mkldnn_lib
+      SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
+      DSTS ${dst_dir} ${dst_dir}/lib
+    )
+endif()
+
 if(NOT MOBILE_INFERENCE AND NOT RPI)
     set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy")
     copy(snappy_lib
@@ -142,4 +156,10 @@ copy(string_lib
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
 )

+set(module "pybind")
+copy(pybind_lib
+  SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h
+  DSTS ${dst_dir}/${module}
+)
+
 add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep})

doc/fluid/design/concepts/functions_operators_layers.md

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ template <typename T>
 class FCOp : public OperatorBase {
  public:
   void Run(...) {
-    add(mul(Input<T>("X"), Input<T>("W")), Input<T>("b");
+    add(mul(Input<T>("X"), Input<T>("W")), Input<T>("b"));
   }
 };
 REGISTER_OP(FCOp, "fc");

doc/fluid/design/dist_train/async_update.md

Lines changed: 18 additions & 15 deletions
@@ -4,34 +4,37 @@

 For the typical synchronous distributed training, some significant steps are as follows:

-1. A Trainer will compute the gradients and SEND them to the Parameter Server(PServer) nodes.
-1. After the PServer node received gradients came from all the Trainers, It will aggregate the
+1. A trainer process will compute the gradients and **send** them to the parameter server (PS) nodes.
+1. After the PS node received gradients came from all the Trainers, It will aggregate the
 gradient variables for the same parameter into one gradient variable and then apply the aggregated
 gradient to the respective parameter, finally using an optimize algorithms(SGD, Monument...)
 to update the parameters.
-1. The Trainer would wait for the PServers finished the optimize stage, and GET the parameters from PServer,
+1. The Trainer would wait for the PS finished the optimize stage, and GET the parameters from PS,
 so all the Trainers would get the same parameters.

-In the synchronously distributed training, there should be a `Barrier` to synchronise the
-parameters after the optimizing stage. The performance of a distributed training job would
-depend on the slowest node if there were hundreds or thousands of training nodes in a
-Job, the performance of synchronously distributed training might be very poor because of
-the slow node. So this design doc would introduce an approach to implement
-*asynchronously* distributed training in PaddlePaddle Fluid.
+In Synchronous Distributed Training, there is a **barrier** on each PS to wait until all trainers processes
+have completed running current mini-batch. After that, all trainers can continue to run the next
+mini-batch. So, we can find that the overall performance of Synchronous Distributed Training depends
+on the slowest node.
+
+In Asynchronous Distributed Training, we don't need to wait for a global mini-bach, the optimizer on
+the PS will run immediately when the gradient is uploaded to the PS from one trainer. This mode would
+train such models that achieve scaling, better throughput. In this design doc, we will introduce how to
+implement the Asynchronous Distributed Training base on PaddlePaddle Fluid.

 ## Design

 <img src="./src/async_update.png" width="600"/>

-As the figure above, we describe a global view of asynchronously update process and use
+As the figure above, we describe a global view of the asynchronous update process and use
 the parameter `w1` as an example to introduce the steps:
 1. For each gradient variables, they may distribute on different GPU card and aggregate
 them while they are all calculated.
-1. Split the gradient variable into multiple blocks according to the number of PServer
+1. Split the gradient variable into multiple blocks according to the number of PS
 instances and then send them.
-1. PServer would run an `Optimize Block` using a specified optimize algorithm to update
+1. PS would run an `Optimize Block` using a specified optimize algorithm to update
 the specified parameter.
-1. The trainer will fetch latest parameter from PServer before running forward Op which depends
+1. The trainer will fetch the latest parameter from PS before running forward Op which depends
 on the specified parameter.
 1. Broadcast the received variable into multiple GPU cards and continue to run the next
 mini-batch.
@@ -40,8 +43,8 @@ mini-batch.

 - For the multiple devices distributed training, we need to aggregate the gradient
 variables which placed on different devices firstly and then schedule a `SendVars` Operator to
-send the gradient variables to the multiple PServer instances.
-- Schedule `FetchVars` operator to fetch the latest parameter from PServer before running
+send the gradient variables to the multiple PS instances.
+- Schedule `FetchVars` operator to fetch the latest parameter from PS before running
 the forward ops.
 - There could be a large number of gradient variables to be sent, so we need to use another
 thread pool(IO Threadpool) whose a number of the schedulable threads is larger than the
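
The reworded design above comes down to removing the global barrier: the PS applies a gradient as soon as any one trainer uploads it, and a trainer simply fetches whatever parameter version is current before its next forward op. A toy Python sketch of that control flow (not the Fluid API; the queue, the SGD step, and the trainer/PS helpers are illustrative stand-ins):

    import queue
    import threading

    import numpy as np

    grad_queue = queue.Queue()            # gradients uploaded by trainers
    params = {"w1": np.zeros(4)}          # parameter store held by the PS
    lock = threading.Lock()
    LR = 0.01

    def ps_optimize_loop():
        # PS side: apply each incoming gradient immediately, no barrier.
        while True:
            name, grad = grad_queue.get()
            with lock:
                params[name] -= LR * grad        # plain SGD as the optimize block

    def trainer_step(batch_grad):
        # Trainer side: send the gradient, then fetch the latest parameter.
        grad_queue.put(("w1", batch_grad))       # send (sharding across PS omitted)
        with lock:
            return params["w1"].copy()           # fetch before the next forward op

    threading.Thread(target=ps_optimize_loop, daemon=True).start()
    w1 = trainer_step(np.ones(4))                # one asynchronous update of w1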

paddle/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -24,6 +24,6 @@ if(NOT WITH_FLUID_ONLY)
 endif()

 add_subdirectory(testing)
-if(NOT MOBILE_INFERENCE AND NOT RPI)
+if(NOT MOBILE_INFERENCE AND NOT RPI AND NOT WITH_C_API)
   add_subdirectory(fluid)
 endif()
