Commit 3bf3e77

Merge remote-tracking branch 'ups/develop' into refine/op/gru
2 parents: 8360685 + 16b65c5

233 files changed: +4155 additions, -1374 deletions


.travis.yml

Lines changed: 0 additions & 9 deletions
@@ -27,15 +27,6 @@ script:
       # 43min timeout
       paddle/scripts/paddle_docker_build.sh ${JOB}
       if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
-  - |
-      if [[ "$JOB" != "doc" ]]; then exit 0; fi;
-      # For document only
-      if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
-      if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
-      export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
-      export DOCS_DIR=`pwd`
-      cd ..
-      curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
 notifications:
   email:
     on_success: change

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
@@ -65,13 +65,15 @@ option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better d
 option(WITH_ANAKIN "Compile with Anakin library" OFF)
 option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
+option(WITH_INFERENCE "Compile fluid inference library" ON)
 option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
 option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})

 # PY_VERSION
 if(NOT PY_VERSION)
   set(PY_VERSION 2.7)
 endif()
+set(PYBIND11_PYTHON_VERSION ${PY_VERSION})

 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -158,6 +160,7 @@ endif()
 ########################################################################################

 include(external/mklml)     # download mklml package
+include(external/xbyak)     # download xbyak package
 include(external/libxsmm)   # download, build, install libxsmm
 include(external/zlib)      # download, build, install zlib
 include(external/gflags)    # download, build, install gflags
@@ -174,6 +177,7 @@ include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
+include(external/cub)

 if(WITH_DISTRIBUTE)
   if(WITH_GRPC)

cmake/external/cub.cmake

Lines changed: 35 additions & 0 deletions
New file:

if(NOT WITH_GPU)
  return()
endif()

include(ExternalProject)

set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub)
set(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR}/src/extern_cub)

include_directories(${CUB_INCLUDE_DIR})

ExternalProject_Add(
  extern_cub
  ${EXTERNAL_PROJECT_LOG_ARGS}
  GIT_REPOSITORY    "https://github.com/NVlabs/cub.git"
  GIT_TAG           "v1.8.0"
  PREFIX            ${CUB_SOURCE_DIR}
  UPDATE_COMMAND    ""
  CONFIGURE_COMMAND ""
  BUILD_COMMAND     ""
  INSTALL_COMMAND   ""
  TEST_COMMAND      ""
)

if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cub_dummy.c)
  file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
  add_library(cub STATIC ${dummyfile})
else()
  add_library(cub INTERFACE)
endif()

add_dependencies(cub extern_cub)

list(APPEND external_project_dependencies cub)

cmake/external/xbyak.cmake

Lines changed: 58 additions & 0 deletions
New file:

# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set(WITH_XBYAK ON)
if(WIN32 OR APPLE)
  set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE)
  return()
endif()

include(ExternalProject)

set(XBYAK_PROJECT      extern_xbyak)
set(XBYAK_PREFIX_DIR   ${THIRD_PARTY_PATH}/xbyak)
set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak)
set(XBYAK_INC_DIR      ${XBYAK_INSTALL_ROOT}/include)

include_directories(${XBYAK_INC_DIR})
include_directories(${XBYAK_INC_DIR}/xbyak)

add_definitions(-DPADDLE_WITH_XBYAK)

# xbyak options
add_definitions(-DXBYAK64)
add_definitions(-DXBYAK_NO_OP_NAMES)

ExternalProject_Add(
  ${XBYAK_PROJECT}
  ${EXTERNAL_PROJECT_LOG_ARGS}
  DEPENDS          ""
  GIT_REPOSITORY   "https://github.com/herumi/xbyak.git"
  GIT_TAG          "v5.661"  # Jul 26th
  PREFIX           ${XBYAK_PREFIX_DIR}
  UPDATE_COMMAND   ""
  CMAKE_ARGS       -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
  CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT}
)

if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/xbyak_dummy.c)
  file(WRITE ${dummyfile} "const char *dummy_xbyak = \"${dummyfile}\";")
  add_library(xbyak STATIC ${dummyfile})
else()
  add_library(xbyak INTERFACE)
endif()

add_dependencies(xbyak ${XBYAK_PROJECT})
list(APPEND external_project_dependencies xbyak)

cmake/generic.cmake

Lines changed: 9 additions & 1 deletion
@@ -264,7 +264,10 @@ function(cc_test TARGET_NAME)
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     if (${cc_test_SERIAL})
         set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
+
+        set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
         set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+        set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
     endif()
   endif()
 endfunction(cc_test)
@@ -329,7 +332,10 @@ function(nv_test TARGET_NAME)
     add_test(${TARGET_NAME} ${TARGET_NAME})
     if (nv_test_SERIAL)
         set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
+
+        set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
         set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+        set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
     endif()
   endif()
 endfunction(nv_test)
@@ -577,7 +583,9 @@ function(py_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS ARGS ENVS)
     cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
-             COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
+             COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
+             FLAGS_cpu_deterministic=true
+             PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
              ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()

doc/survey/op_fusion_design.md

Lines changed: 20 additions & 0 deletions
New file:

# Operator fusion

Fusing multiple operators together is an important way to optimize program execution, particularly on GPUs and other specialized accelerators. An obvious benefit is avoiding the overhead of writing intermediate results back to global memory.

There are generally two ways to fuse operators: fusing directly connected operators and fusing operators that are not directly connected. The first approach is mainly used by the [NNVM Compiler](https://github.com/dmlc/tvm/) and [XLA](https://www.tensorflow.org/performance/xla/). The second is mainly used by DyNet and TensorFlow Fold for auto-batching. The principle of operator fusion is to combine multiple operations into one according to a set of rules; for example, `Y = X * W` and `Z = Y + B` can be fused into `Z = X * W + B`, and `Y1 = X1 * W` and `Y2 = X2 * W` can be fused into `[Y1;Y2] = [X1;X2] * W`. To get a short-term benefit, we decided to specify these rules manually.

## Challenge

The challenges of fusing operators are:
- how to define the rules;
- how to implement these rules efficiently.

### How to make the rules?

Determining the best placement of a fusion operator is an NP-hard combinatorial problem. After analyzing the operators of typical DL models, we found two groups of operators that can be fused explicitly: one is simple, adjacent operations, for example `tmp = x + y` followed by `z = Relu(tmp)`; the other is operators that perform the same function, for example a series of `SGD` or `Momentum` operators. Both groups usually appear in a model in large numbers, so we should first consider how to fuse each group separately.

### How to implement these rules efficiently?

#### How to fuse the adjacent operations efficiently?

Here we use a template function to represent the fused operations. The pros of a template function are that it is simple and efficient; the cons are that it is hard to extend and can only express simple operations. Given our current needs, a template function is the more appropriate choice, as sketched below.
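For illustration only (the functor and kernel names below are hypothetical, not the framework's actual API), a fused add-plus-ReLU can be expressed as a small template functor plugged into a generic element-wise kernel, so the intermediate sum never leaves registers:

```cpp
#include <algorithm>
#include <cstddef>

// Hypothetical fused functor: computes Relu(x + y) in one step, so the
// intermediate sum is never written back to global memory.
template <typename T>
struct AddReluFunctor {
  T operator()(T x, T y) const { return std::max(x + y, static_cast<T>(0)); }
};

// Generic element-wise kernel parameterized by the fused functor.
template <typename T, typename Functor>
void ElementwiseKernel(const T* x, const T* y, T* z, std::size_t n, Functor f) {
  for (std::size_t i = 0; i < n; ++i) {
    z[i] = f(x[i], y[i]);  // one read of x and y, one write of z per element
  }
}

// Usage: ElementwiseKernel(x, y, z, n, AddReluFunctor<float>{});
```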
#### How to fuse the operators that have the same function efficiently?

Take the `SGD` operator as an example: a training model may have hundreds of parameters and, correspondingly, the same number of `SGD` operators. These operators all evaluate the same expression (`w = w - lr * w_g`), so during training the executor runs this expression hundreds of times on the CPU or another accelerator. If we fuse them and make the addresses of all `w` and all `w_g` contiguous, we only need to execute it once. On some accelerators the kernel-launch time is not negligible, so launching and executing a kernel hundreds of times can cost far more than launching and executing it once. There are usually many operators similar to `SGD` in a DL model, such as `AllReduce` and `FC`.
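A minimal sketch of this idea, assuming the parameters `w` and gradients `w_g` have already been packed into contiguous buffers (the function names are illustrative, not the framework's real kernels):

```cpp
#include <cstddef>

// Unfused: one call (and, on an accelerator, one kernel launch) per parameter.
void SGDUpdate(float* w, const float* w_g, std::size_t n, float lr) {
  for (std::size_t i = 0; i < n; ++i) w[i] -= lr * w_g[i];
}

// Fused: if all parameters and all gradients are laid out contiguously,
// a single call updates every parameter, replacing hundreds of launches.
void FusedSGDUpdate(float* packed_w, const float* packed_w_g,
                    std::size_t total_n, float lr) {
  for (std::size_t i = 0; i < total_n; ++i) {
    packed_w[i] -= lr * packed_w_g[i];
  }
}
```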

paddle/fluid/API.spec

Lines changed: 1 addition & 0 deletions
@@ -336,6 +336,7 @@ paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=Non
 paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
 paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)

paddle/fluid/CMakeLists.txt

Lines changed: 4 additions & 2 deletions
@@ -5,5 +5,7 @@ add_subdirectory(operators)
 add_subdirectory(pybind)
 add_subdirectory(string)
 add_subdirectory(recordio)
-# NOTE: please add subdirectory inference at last.
-add_subdirectory(inference)
+if(WITH_INFERENCE)
+  # NOTE: please add subdirectory inference at last.
+  add_subdirectory(inference)
+endif()

paddle/fluid/framework/details/build_strategy.h

Lines changed: 20 additions & 0 deletions
@@ -21,6 +21,26 @@ namespace framework {
 namespace details {

 struct BuildStrategy {
+  // ParallelExecutor supports two ReduceStrategy modes, kAllReduce and
+  // kReduce, for both CPU and GPU. With kAllReduce, every thread optimizes
+  // all of the parameters independently. With kReduce, the parameter
+  // optimization work is distributed across the threads.
+  // For example, if a model has 100 parameters and runs with four threads,
+  // under kAllReduce each thread optimizes all 100 parameters, while under
+  // kReduce each thread optimizes 25 of them.
+  // Of particular note: if you use kReduce for CPU training, all parameters
+  // are shared between the threads, which saves memory.
+  // FIXME(zcd): The results of the two modes (kAllReduce and kReduce) may
+  // not be equal on GPU, because summing in a different order may give a
+  // different result; for example, `a+b+c+d` may differ from `c+a+b+d`.
+  // On GPU, both kAllReduce and kReduce are implemented with NCCL, so their
+  // results may not be equal.
+  // On CPU, if you want to fix the order of summation so that kAllReduce
+  // and kReduce produce identical results, set `FLAGS_cpu_deterministic=true`
+  // in the environment.
  enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 };

  enum class GradientScaleStrategy {
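The work split described in the comment above can be illustrated with a standalone sketch (plain C++, not framework code), using the comment's example of 100 parameters and four threads:

```cpp
#include <cstdio>

int main() {
  const int num_params = 100;
  const int num_threads = 4;

  // kAllReduce: gradients are all-reduced, then every thread updates every
  // parameter on its own copy.
  for (int t = 0; t < num_threads; ++t) {
    std::printf("kAllReduce: thread %d updates %d parameters\n", t, num_params);
  }

  // kReduce: each thread owns a shard of the parameters, reduces the
  // gradients for that shard, and updates only those parameters.
  for (int t = 0; t < num_threads; ++t) {
    int shard_begin = t * num_params / num_threads;
    int shard_end = (t + 1) * num_params / num_threads;
    std::printf("kReduce: thread %d updates parameters [%d, %d)\n", t,
                shard_begin, shard_end);
  }
  return 0;
}
```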

paddle/fluid/framework/details/multi_devices_graph_builder.cc

Lines changed: 8 additions & 6 deletions
@@ -275,7 +275,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
     if (strategy_.gradient_scale_ !=
         BuildStrategy::GradientScaleStrategy::kCustomized) {
       // TODO(paddle-dev): Why is there no input for this op_handle?
-      CreateScaleLossGradOp(&result);
+      auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
+      CreateScaleLossGradOp(&result, loss_grad_name);
     }
     // This assumes the backward generating code will ensure IsScaleLossOp
     // is true only for the op that scale the final scalar loss.
@@ -535,7 +536,8 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph,
   return got == sharded_var_device.end() ? -1 : got->second;
 }

-void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const {
+void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
+    ir::Graph *result, const std::string &loss_grad_name) const {
   for (size_t i = 0; i < places_.size(); ++i) {
     // Insert ScaleCost OpHandle
 #ifdef PADDLE_WITH_CUDA
@@ -558,10 +560,10 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const {
     // loss->pending_ops_.emplace_back(op_handle);
     // op_handle->inputs_.emplace_back(loss);

-    CreateOpOutput(result, op_handle,
-                   result->CreateEmptyNode(GradVarName(loss_var_name_),
-                                           ir::Node::Type::kVariable),
-                   places_[i], i);
+    CreateOpOutput(
+        result, op_handle,
+        result->CreateEmptyNode(loss_grad_name, ir::Node::Type::kVariable),
+        places_[i], i);
   }
 }

0 commit comments