Commit cdb315e

Merge branch 'develop' into docrefine
2 parents: 032ea9c + 0fff666

541 files changed (+27791, −7064 lines)


CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,8 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
                        "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
 message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
                        "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
+message(STATUS "AR tools: ${CMAKE_AR}")
+
 if(WIN32)
   set(CMAKE_SUPPRESS_REGENERATION ON)
   set(CMAKE_STATIC_LIBRARY_PREFIX lib)

Dockerfile

Lines changed: 3 additions & 2 deletions
@@ -75,8 +75,9 @@ RUN curl -s -q https://glide.sh/get | sh
 # and its size is only one-third of the official one.
 # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
 # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
-RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
-    tar -xz -C /usr/local && \
+
+RUN wget -q https://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \
+    tar -zxf TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz -C /usr/local && \
     cp -rf /usr/local/TensorRT/include /usr && \
     cp -rf /usr/local/TensorRT/lib /usr
 

benchmark/fluid/fluid_benchmark.py

Lines changed: 0 additions & 1 deletion
@@ -179,7 +179,6 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
     else:
         build_strategy.reduce_strategy = fluid.BuildStrategy(
         ).ReduceStrategy.AllReduce
-    build_strategy.fuse_broadcast_op = args.fuse_broadcast_op
 
     avg_loss = train_args[0]
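
For readers unfamiliar with the code this deletion touches: the removed line set `fuse_broadcast_op` on the same `BuildStrategy` object that `train_parallel()` uses to pick an all-reduce strategy. A minimal sketch of that pattern, using only the names visible in the diff above (the exact `BuildStrategy` attributes may differ between Fluid versions; this is not the benchmark script itself):

    import paddle.fluid as fluid

    # Hedged sketch of the surrounding pattern in train_parallel().
    build_strategy = fluid.BuildStrategy()
    # Aggregate gradients across devices with all-reduce, as in the diff.
    build_strategy.reduce_strategy = fluid.BuildStrategy().ReduceStrategy.AllReduce
    # The commit removes the line below, so fuse_broadcast_op is no longer
    # set from the command-line arguments here.
    # build_strategy.fuse_broadcast_op = args.fuse_broadcast_op
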

cmake/external/boost.cmake

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ set(BOOST_PROJECT "extern_boost")
 # So we use 1.41.0 here.
 set(BOOST_VER "1.41.0")
 set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
-set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
+set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
 
 MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
 

cmake/external/grpc.cmake

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ ExternalProject_Add(
 # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
 # checkout and clean other dirs under third_party
 # 4. remove .git, and package the directory.
-URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz"
+URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
 URL_MD5 "1f268a2aff6759839dccd256adcc91cf"
 PREFIX ${GRPC_SOURCES_DIR}
 UPDATE_COMMAND ""

cmake/external/mkldnn.cmake

Lines changed: 14 additions & 6 deletions
@@ -31,9 +31,17 @@ IF(APPLE)
 return()
 ENDIF()
 
-MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
+# Introduce variables:
+# * CMAKE_INSTALL_LIBDIR
+INCLUDE(GNUInstallDirs)
+SET(LIBDIR "lib")
+if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$")
+  SET(LIBDIR "lib64")
+endif()
+
+MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/l${LIBDIR} to runtime path")
 SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/${LIBDIR}")
 
 INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
 
@@ -58,7 +66,7 @@ ExternalProject_Add(
 ${EXTERNAL_PROJECT_LOG_ARGS}
 DEPENDS ${MKLDNN_DEPENDS}
 GIT_REPOSITORY "https://github.com/intel/mkl-dnn.git"
-GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a"
+GIT_TAG "863ff6e7042cec7d2e29897fe9f0872e0888b0fc"
 PREFIX ${MKLDNN_SOURCES_DIR}
 UPDATE_COMMAND ""
 CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@@ -79,9 +87,9 @@ ExternalProject_Add(
 -DMKLROOT:PATH=${MKLML_ROOT}
 )
 if(WIN32)
-    SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE)
+    SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE)
 else(WIN32)
-    SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
+    SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
 endif(WIN32)
 
 ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL)
@@ -101,7 +109,7 @@ ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
 # copy the real so.0 lib to install dir
 # it can be directly contained in wheel or capi
 if(WIN32)
-    SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/lib/mkldnn.dll)
+    SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll)
 else(WIN32)
     SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
     ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}

cmake/external/mklml.cmake

Lines changed: 2 additions & 2 deletions
@@ -34,7 +34,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
 SET(TIME_VERSION "2019.0.1.20181227")
 IF(WIN32)
     SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
-    SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
+    SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
     SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
     SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
     SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
@@ -43,7 +43,7 @@ ELSE()
     #TODO(intel-huying):
     #  Now enable Erf function in mklml library temporarily, it will be updated as offical version later.
     SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
-    SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
+    SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
     SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
     SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
     SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)

cmake/operators.cmake

Lines changed: 1 addition & 1 deletion
@@ -110,7 +110,7 @@ function(op_library TARGET)
     # Define operators that don't need pybind here.
     foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
         "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
-        "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op")
+        "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op")
         if ("${TARGET}" STREQUAL "${manual_pybind_op}")
             set(pybind_flag 1)
         endif()

paddle/contrib/float16/README.md

Lines changed: 8 additions & 8 deletions
@@ -5,13 +5,13 @@ Kexin Zhao <[email protected]>
 ## Introduction
 Deep learning is usually a two-stage work: training and inference. The training stage estimates model parameters (weights) from data. The inference stage loads the weights and uses them to interpret inputs. Typically, weights are 32-bit float values (float32). Some new devices, including NVIDIA Volta GPUs, support higher speed computation using 16-bit float values (float16).
 
-This article explains our efforts with PaddlePaddle to train using float32 and to inference using float16. We describe a [*transpiler*](https://github.com/PaddlePaddle/Paddle/blob/a4d3de0071e1f3912230c3ab3f9ac74cf06b093a/doc/fluid/design/motivation/fluid_compiler.md), which converts a PaddlePaddle Fluid model, which, to be precise, should be called a [Fluid *program*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), into the inference program, and converts the weights from float32 into float16.
+This article explains our efforts with PaddlePaddle to train using float32 and to inference using float16. We describe a [*transpiler*](https://github.com/PaddlePaddle/Paddle/blob/a4d3de0071e1f3912230c3ab3f9ac74cf06b093a/doc/fluid/design/motivation/fluid_compiler.md), which converts a PaddlePaddle Fluid model, which, to be precise, should be called a [Fluid *program*](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/program.md), into the inference program, and converts the weights from float32 into float16.
 
 
 ## What is float16?
 float16 (or FP16) is a half-precision floating-point format that uses 16 bits in memory to represent a value. The advantage over 32-bit single-precision floating-point format (commonly known as float or float32 data type) is that it requires half the storage and bandwidth at the expense of precision and range. Fortunately, DNN inference has a high tolerance for the loss of precision and range when using float16 to represent the weights, and the inference accuracy will only be minimally affected in most cases, which gives us the opportunity to use float16 data type to speed up the inference.
 
-Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type.
+Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type.
 
 ## Why float16?
 The trend in today's deep learning community is to use bigger and deeper model, which translates to larger memory footprint, higher computation demands, and as a result higher energy consumption on computing devices. The advantages of float16 over float32 are correspondingly three-fold:
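
A side note on the storage/precision trade-off described in the "What is float16?" paragraph above; this is a stand-alone NumPy illustration, not part of the commit:

    import numpy as np

    # float32 stores each value in 4 bytes, float16 in 2 bytes.
    w32 = np.random.rand(1000).astype(np.float32)
    w16 = w32.astype(np.float16)
    print(w32.nbytes, w16.nbytes)   # 4000 vs. 2000 bytes: half the storage
    # The price is precision: float16 keeps roughly 3 decimal digits.
    print(np.abs(w32 - w16.astype(np.float32)).max())  # small but non-zero error
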
@@ -24,12 +24,12 @@ The trend in today's deep learning community is to use bigger and deeper model,
 
 ## Fluid implementation of float16 inference
 ### Overview
-Fluid use [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of computation graph to describe a neural network model and the optimization procedure. Fluid program is a python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block.
+Fluid use [Program](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of computation graph to describe a neural network model and the optimization procedure. Fluid program is a python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block.
 
 ### Basic requirement
 When an executor runs an operator, it uses a kernel to perform computations on tensors contained in the input variables, and then writes the results to the tensors in the output variables. Each operator has multiple kernels for different combinations of data types, devices, and library types, respectively. The operator will select the appropriate kernel to run based on, among other things, the data type of the input tensors. By default, every Fluid operator has a kernel for float data type that takes float inputs and generates float outputs.
 
-If we provide float input to the first operator in a program, then each operator will use float kernel to compute float output and send it as input to the next operator to trigger its float kernel. This chain effect will make the program run in float mode and gives us a final output of float data type.
+If we provide float input to the first operator in a program, then each operator will use float kernel to compute float output and send it as input to the next operator to trigger its float kernel. This chain effect will make the program run in float mode and gives us a final output of float data type.
 
 The same principle applies if we want a program to run in float16 mode. We provide input variable of the float16 data type to the first operator, and every subsequent operator will invoke the float16 kernel until we get the final output in float16. So the preliminary requirements for float16 inference are to add float16 kernels to operators that are needed in a specific kind of neural networks. Our current focus is on Convolutional Neural Networks (CNN) and hence we have added float16 kernels to the following operators: convolution, pooling, GEMM, elementwise addition, batch norm, dropout, various activations including relu and tanh, and softmax.
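
The "chain effect" of dtype-driven kernel selection described in the Basic requirement section can be mimicked with plain NumPy (an illustration only; Fluid's real kernel dispatch happens inside its C++ operators):

    import numpy as np

    x = np.ones((2, 3), dtype=np.float16)   # float16 input to the first "operator"
    w = np.ones((3, 4), dtype=np.float16)
    y = x @ w                               # matmul output stays float16
    z = np.maximum(y, 0)                    # a relu-like op also stays float16
    print(y.dtype, z.dtype)                 # float16 float16 -- the dtype propagates
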

@@ -75,7 +75,7 @@ In this scenario, we already have a float32 inference program and some associate
 
 We can then run various inference experiments in float16 mode and save the float16 program and weights on disk for future deployment. To enhance the code usability, we maintain a consistent API so that user can use the same float32 input data to run inference program in either float32 and float16 mode and obtain output data both of float32 data type. Consequently, we need to add cast operators in the float16 inference program for conversions between the float16 tensor and float32 tensor.
 
-The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference).
+The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference).
 
 ### Experiment results
 Simply running the following commands to reproduce the experiment results presented in this section:
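
The consistent float32-in/float32-out API that the cast operators enable boils down to the following pattern (a NumPy sketch under the assumption that only the internal compute runs in half precision; the actual transpiler inserts cast operators into the Fluid program rather than calling astype):

    import numpy as np

    def infer_fp16(x_fp32, w_fp32):
        # cast op at the program entrance: float32 -> float16
        x = x_fp32.astype(np.float16)
        w = w_fp32.astype(np.float16)
        y = x @ w                        # internal kernels run in float16
        # cast op at the exit: float16 -> float32, so callers see the same
        # output dtype as the float32 program
        return y.astype(np.float32)

    x = np.random.rand(8, 16).astype(np.float32)
    w = np.random.rand(16, 4).astype(np.float32)
    print(infer_fp16(x, w).dtype)        # float32
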
@@ -113,7 +113,7 @@ We repeat the test ten times and get the following results:
 | #10 | 62.53% | 62.48% |
 | average| 62.63% | 62.62% |
 
-We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over ten tests.
+We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over ten tests.
 
 #### Performance benchmark
 Currently, Fluid only supports float16 inference on NVIDIA GPUs. There is no motivation to support float16 inference on non-ARM CPUs where float16 is not natively supported, and float16 calculation will only be slower than its float32 counterpart.
@@ -132,7 +132,7 @@ Average inference time for one mini-batch on Vgg16 model tested on ImageNet data
 |float16| 3.32 | 4.11 | 5.88 | 9.41 | 16.54 | 30.47 | 60.23 |
 |Speedup| 4.22 | 2.36  | 3.91 | 3.00 | 3.26  | 2.77 | 2.97 |
 
-We can see that float16 inference provides **2x ~ 4x** speedup on different batch sizes.
+We can see that float16 inference provides **2x ~ 4x** speedup on different batch sizes.
 
 Convolution operation is ususally the computational bottleneck of CNN, so we also check the average time spent on the Fluid convolution operators for one mini-batch as follows:
 
@@ -162,7 +162,7 @@ We find that the speedup provided by float16 inference starts relatively small a
 
 We also did the same benchmark on a single NVIDIA GeForce GTX 1080 Ti GPU that does not support Tensor Core. The results show that for Vgg16, float16 inference provides consistent small speedup (around 1.15x) for all mini-batch sizes, while for Resnet50, float16 inference is slower than its float32 counterpart in small batch sizes (mb = 1 and 2) and then delivers around 1.15x speedup for all larger batch sizes. By comparing the benchmarks on 1080 Ti and V100, we find that Tensor Core, which is specialized for float16 computations, is a critical component of high performance float16 inference.
 
-Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/contrib/float16/float16_benchmark.md) for complete benchmark results.
+Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/float16/float16_benchmark.md) for complete benchmark results.
 
 ### Summary
 1. Fluid is now able to run inference in float16 mode via a float16 transpiler. We currently support CNN programs, including Vgg and Resnet, to run in float16 inference mode.
