Skip to content

Commit e8954a3

Browse files
committed
merge develop
2 parents 32a9e05 + 7970ab9 commit e8954a3

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

52 files changed

+891
-405
lines changed

.travis.yml

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,6 @@ script:
2727
# 43min timeout
2828
paddle/scripts/paddle_docker_build.sh ${JOB}
2929
if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
30-
- |
31-
if [[ "$JOB" != "doc" ]]; then exit 0; fi;
32-
# For document only
33-
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
34-
if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
35-
export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
36-
export DOCS_DIR=`pwd`
37-
cd ..
38-
curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
3930
notifications:
4031
email:
4132
on_success: change

CMakeLists.txt

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,14 @@ include(external/snappy) # download snappy
200200
include(external/snappystream)
201201
include(external/threadpool)
202202

203+
if(WITH_GPU)
204+
include(cuda)
205+
include(tensorrt)
206+
include(external/anakin)
207+
else()
208+
set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
209+
endif()
210+
203211
include(cudnn) # set cudnn libraries, must before configure
204212
include(cupti)
205213
include(configure) # add paddle env configuration
@@ -228,14 +236,6 @@ set(EXTERNAL_LIBS
228236
${PYTHON_LIBRARIES}
229237
)
230238

231-
if(WITH_GPU)
232-
include(cuda)
233-
include(tensorrt)
234-
include(external/anakin)
235-
else()
236-
set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
237-
endif()
238-
239239
if(WITH_AMD_GPU)
240240
find_package(HIP)
241241
include(hip)

cmake/cudnn.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
2121
${CUDNN_ROOT}/lib64
2222
${CUDNN_ROOT}/lib
2323
${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
24+
${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
2425
$ENV{CUDNN_ROOT}
2526
$ENV{CUDNN_ROOT}/lib64
2627
$ENV{CUDNN_ROOT}/lib

cmake/generic.cmake

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,10 @@ function(cc_test TARGET_NAME)
264264
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
265265
if (${cc_test_SERIAL})
266266
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
267+
268+
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
267269
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
270+
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
268271
endif()
269272
endif()
270273
endfunction(cc_test)
@@ -329,7 +332,10 @@ function(nv_test TARGET_NAME)
329332
add_test(${TARGET_NAME} ${TARGET_NAME})
330333
if (nv_test_SERIAL)
331334
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
335+
336+
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
332337
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
338+
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
333339
endif()
334340
endif()
335341
endfunction(nv_test)
@@ -577,7 +583,9 @@ function(py_test TARGET_NAME)
577583
set(multiValueArgs SRCS DEPS ARGS ENVS)
578584
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
579585
add_test(NAME ${TARGET_NAME}
580-
COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
586+
COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
587+
FLAGS_cpu_deterministic=true
588+
PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
581589
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
582590
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
583591
endif()

doc/fluid/howto/optimization/timeline_cn.md

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,27 @@
11
# 如何使用timeline工具做性能分析
22

3-
1. 在训练的主循环外加上`with profiler.profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。
3+
1. 在训练的主循环外加上`profiler.start_profiler(...)`和`profiler.stop_profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。
44

55
**提示:**
66
请不要在timeline记录信息时运行太多次迭代,因为timeline中的记录数量和迭代次数是成正比的。
77

88
```python
9-
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
10-
for pass_id in range(pass_num):
11-
for batch_id, data in enumerate(train_reader()):
12-
exe.run(fluid.default_main_program(),
13-
feed=feeder.feed(data),
14-
fetch_list=[])
9+
for pass_id in range(pass_num):
10+
for batch_id, data in enumerate(train_reader()):
11+
if pass_id == 0 and batch_id == 5:
12+
profiler.start_profiler("All")
13+
elif pass_id == 0 and batch_id == 10:
14+
profiler.stop_profiler("total", "/tmp/profile")
15+
exe.run(fluid.default_main_program(),
16+
feed=feeder.feed(data),
17+
fetch_list=[])
1518
...
1619
```
1720

1821
1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`,这个程序默认会生成一个`/tmp/timeline`文件,你也可以用命令行参数来修改这个路径,请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)
22+
```python
23+
python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
24+
```
1925

2026
1. 打开chrome浏览器,访问<chrome://tracing/>,用`load`按钮来加载生成的`timeline`文件。
2127

doc/fluid/howto/optimization/timeline_en.md

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,28 @@
11
# how to use timeline tool to do profile
22

3-
1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
3+
1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` to the main training loop. After running, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information, because the profile record grows with the batch number.
44

55
```python
6-
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
7-
for pass_id in range(pass_num):
8-
for batch_id, data in enumerate(train_reader()):
9-
exe.run(fluid.default_main_program(),
10-
feed=feeder.feed(data),
11-
fetch_list=[],
12-
use_program_cache=True)
6+
for pass_id in range(pass_num):
7+
for batch_id, data in enumerate(train_reader()):
8+
if pass_id == 0 and batch_id == 5:
9+
profiler.start_profiler("All")
10+
elif pass_id == 0 and batch_id == 10:
11+
profiler.stop_profiler("total", "/tmp/profile")
12+
exe.run(fluid.default_main_program(),
13+
feed=feeder.feed(data),
14+
fetch_list=[])
1315
...
1416
```
1517

1618
1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`, it will generate another
1719
file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at
1820
[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
1921

22+
```python
23+
python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
24+
```
25+
2026
1. Open chrome and visit <chrome://tracing/>, use `load` button to load the generated `timeline` file.
2127

2228
![chrome tracing](./tracing.jpeg)

paddle/fluid/framework/details/all_reduce_op_handle.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "paddle/fluid/framework/details/container_cast.h"
1818
#include "paddle/fluid/framework/details/reduce_and_gather.h"
1919
#include "paddle/fluid/framework/details/variable_visitor.h"
20+
#include "paddle/fluid/platform/profiler.h"
2021

2122
namespace paddle {
2223
namespace framework {
@@ -45,6 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
4546
#endif
4647

4748
void AllReduceOpHandle::RunImpl() {
49+
platform::RecordEvent r("all_reduce", nullptr);
4850
if (NoDummyInputSize() == 1) {
4951
return; // No need to all reduce when GPU count = 1;
5052
} else {

paddle/fluid/framework/details/build_strategy.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,26 @@ namespace framework {
2121
namespace details {
2222

2323
struct BuildStrategy {
24+
// ParallelExecutor supports two modes of ReduceStrategy, kAllReduce and
25+
// kReduce, for CPU and GPU. If you use kAllReduce, different threads
26+
// optimize their parameters separately. If you use kReduce, the optimizations
27+
// of parameters are distributed to different threads.
28+
// For example, a model has 100 parameters and is running with four threads,
29+
// if you choose kAllReduce, every thread is to optimize 100 parameters
30+
// separately, if you choose kReduce, every thread is to optimize 25
31+
// parameters.
32+
// Note in particular that if you use kReduce during CPU training,
33+
// all the parameters are shared between different threads. This feature
34+
// saves memory.
35+
// FIXME(zcd): The results of the two modes (kAllReduce and kReduce) may
36+
// not be equal for GPU, because summing in different orders may produce
37+
// different results; for example, the result of `a+b+c+d` may differ
38+
// from the result of `c+a+b+d`.
39+
// For GPU, both kAllReduce and kReduce are implemented with NCCL,
40+
// so their results may not be equal.
41+
// For CPU, if you want to fix the summing order so that kAllReduce and
42+
// kReduce produce identical results, you can add
43+
// `FLAGS_cpu_deterministic=true` to the environment.
2444
enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 };
2545

2646
enum class GradientScaleStrategy {

paddle/fluid/framework/details/reduce_op_handle.cc

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,18 @@
1616
#include "paddle/fluid/framework/details/container_cast.h"
1717
#include "paddle/fluid/framework/details/reduce_and_gather.h"
1818
#include "paddle/fluid/framework/details/variable_visitor.h"
19+
#include "paddle/fluid/platform/profiler.h"
20+
21+
DEFINE_bool(
22+
cpu_deterministic, false,
23+
"Whether to make the result of computation deterministic in CPU side.");
1924

2025
namespace paddle {
2126
namespace framework {
2227
namespace details {
2328

2429
void ReduceOpHandle::RunImpl() {
30+
platform::RecordEvent r("reduce", nullptr);
2531
if (places_.size() == 1) return;
2632
// the input and output may have dummy var.
2733
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
@@ -89,11 +95,33 @@ void ReduceOpHandle::RunImpl() {
8995
} else {
9096
std::vector<const LoDTensor *> lod_tensors =
9197
GetInputValues<LoDTensor>(in_var_handles, var_scopes);
98+
9299
if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) {
93100
this->RunAndRecordEvent([&] {
94-
ReduceLoDTensor func(lod_tensors,
95-
out_var->GetMutable<framework::LoDTensor>());
96-
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
101+
// FIXME(zcd): The order of summing is important,
102+
// especially when the type of data is float or double.
103+
// For example, the result of `a+b+c+d` may be different
104+
// with the result of `c+a+b+d`, so the summing order should be fixed.
105+
if (!FLAGS_cpu_deterministic) {
106+
ReduceLoDTensor func(lod_tensors,
107+
out_var->GetMutable<framework::LoDTensor>());
108+
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
109+
} else {
110+
// We sum lod_tensors to reduce_sum_trg which is in local_scopes_0
111+
// here, but it doesn't mean reduce_sum_trg must be in local_scopes_0.
112+
auto &reduce_sum_trg = *this->local_scopes_[0]
113+
->FindVar(kLocalExecScopeName)
114+
->Get<Scope *>()
115+
->FindVar(out_var_handle->name_)
116+
->GetMutable<framework::LoDTensor>();
117+
ReduceLoDTensor func(lod_tensors, &reduce_sum_trg);
118+
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
119+
120+
auto trg = out_var->GetMutable<framework::LoDTensor>();
121+
if (reduce_sum_trg.data<void>() != trg->data<void>()) {
122+
TensorCopy(reduce_sum_trg, platform::CPUPlace(), trg);
123+
}
124+
}
97125
});
98126
} else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
99127
#ifdef PADDLE_WITH_CUDA

paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <string>
1818
#include <vector>
1919
#include "paddle/fluid/framework/executor.h"
20+
#include "paddle/fluid/platform/profiler.h"
2021

2122
namespace paddle {
2223
namespace framework {
@@ -62,6 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
6263
eptr = std::current_exception();
6364
}
6465

66+
platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
6567
drop_scope_counter_ += 1;
6668
if (!fetch_tensors.empty() ||
6769
drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {

0 commit comments

Comments
 (0)