Skip to content

Commit bc12c2c

Browse files
committed
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into port_python3_syntax
2 parents 5656f64 + 445ca3d commit bc12c2c

File tree

78 files changed

+2971
-481
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

78 files changed

+2971
-481
lines changed

.travis.yml

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,6 @@ script:
2727
# 43min timeout
2828
paddle/scripts/paddle_docker_build.sh ${JOB}
2929
if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
30-
- |
31-
if [[ "$JOB" != "doc" ]]; then exit 0; fi;
32-
# For document only
33-
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
34-
if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
35-
export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
36-
export DOCS_DIR=`pwd`
37-
cd ..
38-
curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
3930
notifications:
4031
email:
4132
on_success: change

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better d
6565
option(WITH_ANAKIN "Compile with Anakin library" OFF)
6666
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
6767
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
68+
option(WITH_INFERENCE "Compile fluid inference library" ON)
6869
option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
6970
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
7071

@@ -175,6 +176,7 @@ include(external/any) # download libn::any
175176
include(external/eigen) # download eigen3
176177
include(external/pybind11) # download pybind11
177178
include(external/cares)
179+
include(external/cub)
178180

179181
if(WITH_DISTRIBUTE)
180182
if(WITH_GRPC)

cmake/external/cub.cmake

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Fetch NVIDIA CUB (header-only CUDA primitives library) as an external
# project and expose it as the `cub` target for the rest of the build.
# CUB is only needed for GPU builds, so bail out early otherwise.
if(NOT WITH_GPU)
    return()
endif()

include(ExternalProject)

set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub)
# CUB is header-only: the checkout itself is the include directory.
set(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR}/src/extern_cub)

include_directories(${CUB_INCLUDE_DIR})

# No configure/build/install steps — we only need the sources checked out.
ExternalProject_Add(
    extern_cub
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY    "https://github.com/NVlabs/cub.git"
    GIT_TAG           "v1.8.0"
    PREFIX            ${CUB_SOURCE_DIR}
    UPDATE_COMMAND    ""
    CONFIGURE_COMMAND ""
    BUILD_COMMAND     ""
    INSTALL_COMMAND   ""
    TEST_COMMAND      ""
)

if(CMAKE_VERSION VERSION_LESS "3.3.0")
    # INTERFACE libraries cannot carry dependencies before CMake 3.3, so
    # compile a dummy translation unit to get a real (static) target.
    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cub_dummy.c)
    file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
    add_library(cub STATIC ${dummyfile})
else()
    add_library(cub INTERFACE)
endif()

# Make sure the sources are downloaded before anything that uses `cub`.
add_dependencies(cub extern_cub)

# NOTE: `externl_project_dependencies` (sic) is the project-wide list name
# used by every external/*.cmake module; keep the spelling consistent.
list(APPEND externl_project_dependencies cub)

cmake/generic.cmake

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,10 @@ function(cc_test TARGET_NAME)
264264
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
265265
if (${cc_test_SERIAL})
266266
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
267+
268+
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
267269
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
270+
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
268271
endif()
269272
endif()
270273
endfunction(cc_test)
@@ -329,7 +332,10 @@ function(nv_test TARGET_NAME)
329332
add_test(${TARGET_NAME} ${TARGET_NAME})
330333
if (nv_test_SERIAL)
331334
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
335+
336+
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
332337
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
338+
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
333339
endif()
334340
endif()
335341
endfunction(nv_test)
@@ -577,7 +583,9 @@ function(py_test TARGET_NAME)
577583
set(multiValueArgs SRCS DEPS ARGS ENVS)
578584
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
579585
add_test(NAME ${TARGET_NAME}
580-
COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
586+
COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
587+
FLAGS_cpu_deterministic=true
588+
PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
581589
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
582590
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
583591
endif()

doc/fluid/howto/optimization/timeline_cn.md

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,27 @@
11
# 如何使用timeline工具做性能分析
22

3-
1. 在训练的主循环外加上`with profiler.profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。
3+
1. 在训练的主循环外加上`profiler.start_profiler(...)`和`profiler.stop_profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。
44

55
**提示:**
66
请不要在timeline记录信息时运行太多次迭代,因为timeline中的记录数量和迭代次数是成正比的。
77

88
```python
9-
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
10-
for pass_id in range(pass_num):
11-
for batch_id, data in enumerate(train_reader()):
12-
exe.run(fluid.default_main_program(),
13-
feed=feeder.feed(data),
14-
fetch_list=[])
9+
for pass_id in range(pass_num):
10+
for batch_id, data in enumerate(train_reader()):
11+
if pass_id == 0 and batch_id == 5:
12+
profiler.start_profiler("All")
13+
elif pass_id == 0 and batch_id == 10:
14+
profiler.stop_profiler("total", "/tmp/profile")
15+
exe.run(fluid.default_main_program(),
16+
feed=feeder.feed(data),
17+
fetch_list=[])
1518
...
1619
```
1720

1821
1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`,这个程序默认会生成一个`/tmp/timeline`文件,你也可以用命令行参数来修改这个路径,请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)
22+
```python
23+
python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
24+
```
1925

2026
1. 打开chrome浏览器,访问<chrome://tracing/>,用`load`按钮来加载生成的`timeline`文件。
2127

doc/fluid/howto/optimization/timeline_en.md

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,28 @@
11
# how to use timeline tool to do profile
22

3-
1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
3+
1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` to the main training loop. After running, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information, because the profile record grows with the number of batches.
44

55
```python
6-
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
7-
for pass_id in range(pass_num):
8-
for batch_id, data in enumerate(train_reader()):
9-
exe.run(fluid.default_main_program(),
10-
feed=feeder.feed(data),
11-
fetch_list=[],
12-
use_program_cache=True)
6+
for pass_id in range(pass_num):
7+
for batch_id, data in enumerate(train_reader()):
8+
if pass_id == 0 and batch_id == 5:
9+
profiler.start_profiler("All")
10+
elif pass_id == 0 and batch_id == 10:
11+
profiler.stop_profiler("total", "/tmp/profile")
12+
exe.run(fluid.default_main_program(),
13+
feed=feeder.feed(data),
14+
fetch_list=[])
1315
...
1416
```
1517

1618
1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`, it will generate another
1719
file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at
1820
[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
1921

22+
```python
23+
python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
24+
```
25+
2026
1. Open chrome and visit <chrome://tracing/>, use `load` button to load the generated `timeline` file.
2127

2228
![chrome tracing](./tracing.jpeg)

doc/survey/op_fusion_design.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Operator fusion
2+
Fusing multiple operators together is an important method to optimize the program execution, particularly for GPU or other specialized accelerators. An obvious benefit is to avoid the overhead of saving the intermediate result back into global memory.
3+
4+
There are generally two ways to fuse operators: fusing directly connected operators, and fusing operators that are not directly connected. The first method is mainly used by [NNVM Compiler](https://github.com/dmlc/tvm/) and [XLA](https://www.tensorflow.org/performance/xla/). The second method is mainly used by Dynet and TensorFlow Fold to do auto-batching. The principle of operator fusion is to combine multiple operations into one according to some rules; for example, `Y = X * W` and `Z = Y + B` can be fused to `Z = X * W + B`, and `Y1 = X1 * W` and `Y2 = X2 * W` can be fused to `[Y1;Y2] = [X1;X2] * W`. In order to get a short-term profit, we decided to try to manually specify these rules.
5+
6+
## Challenge
7+
The challenge of fusing operators is:
8+
- how to make the rules.
9+
- how to implement these rules efficiently.
10+
11+
### How to make the rules?
12+
13+
The problem of determining the best single location for a fusion operator is an NP-hard combinatorial problem. After analyzing the operators of the DL model, we found that there are two groups of operators that can be fused explicitly: one is simple and adjacent operations, for example, `tmp = x + y` and `z = Relu(tmp)`; the other is operators that have the same function, for example, a series of `SGD` or `Momentum` operators. They usually appear in the model in large numbers. So we should think about how to fuse them separately first.
14+
15+
### How to implement these rules efficiently?
16+
#### How to fuse the adjacent operations efficiently?
17+
Here we use a template function to represent the fused operations. The pros of using a template function are that it is simple and efficient, and the cons are that it is not easy to expand, and it can only be used to express some simple operations. So taking into account our current needs, the template function is more appropriate.
18+
19+
#### How to fuse the operators that have the same function efficiently?
20+
We take the SGD operator as an example: the training model may have hundreds of parameters and, correspondingly, the same number of SGD operators. The expression (`w = w - lr*w_g`) of those operators is the same, so during training the executor will execute this expression hundreds of times on the CPU or other specialized accelerators. If we can fuse them and make the addresses of all `w` and all `w_g` contiguous respectively, we only need to execute once. For some accelerators, the time of launching a kernel is not negligible, so the total time of hundreds of kernel launches and executions may be larger than that of launching and executing only once. There are usually many operators similar to `SGD` in a DL model, such as `AllReduce` and `FC`.

paddle/fluid/API.spec

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,7 @@ paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=Non
336336
paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
337337
paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
338338
paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
339+
paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
339340
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
340341
paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
341342
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)

paddle/fluid/CMakeLists.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,7 @@ add_subdirectory(operators)
55
add_subdirectory(pybind)
66
add_subdirectory(string)
77
add_subdirectory(recordio)
8-
# NOTE: please add subdirectory inference at last.
9-
add_subdirectory(inference)
8+
if(WITH_INFERENCE)
9+
# NOTE: please add subdirectory inference at last.
10+
add_subdirectory(inference)
11+
endif()

paddle/fluid/framework/details/all_reduce_op_handle.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "paddle/fluid/framework/details/container_cast.h"
1818
#include "paddle/fluid/framework/details/reduce_and_gather.h"
1919
#include "paddle/fluid/framework/details/variable_visitor.h"
20+
#include "paddle/fluid/platform/profiler.h"
2021

2122
namespace paddle {
2223
namespace framework {
@@ -45,6 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
4546
#endif
4647

4748
void AllReduceOpHandle::RunImpl() {
49+
platform::RecordEvent r("all_reduce", nullptr);
4850
if (NoDummyInputSize() == 1) {
4951
return; // No need to all reduce when GPU count = 1;
5052
} else {

0 commit comments

Comments
 (0)