Commit 3c3df1f

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into develop
2 parents 00b11c2 + 25262ed commit 3c3df1f

File tree: 11 files changed (+101 / -38 lines)

CMakeLists.txt
Lines changed: 1 addition & 0 deletions

@@ -72,6 +72,7 @@ option(WITH_INFERENCE "Compile fluid inference library" ON)
 option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF)
 option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
 option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
+option(WITH_FAST_MATH "Make use of fast math library" OFF)
 
 # PY_VERSION
 if(NOT PY_VERSION)

cmake/cuda.cmake
Lines changed: 4 additions & 1 deletion

@@ -175,7 +175,10 @@ list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
   list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
 endif(NOT WIN32)
 
-list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
+if(WITH_FAST_MATH)
+  # Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
+  list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
+endif()
 # in cuda9, suppress cuda warning on eigen
 list(APPEND CUDA_NVCC_FLAGS "-w")
 # Set :expt-relaxed-constexpr to suppress Eigen warnings
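Note: --use_fast_math trades accuracy for speed by mapping single-precision math calls to hardware intrinsics (and enabling flush-to-zero and fast division/sqrt). A minimal CUDA sketch of the trade-off; the kernel and values are illustrative, not part of this commit:

    #include <cstdio>

    // Under --use_fast_math, nvcc compiles sinf() as the __sinf() intrinsic,
    // which is faster but less accurate than the precise library version.
    __global__ void fast_vs_precise(float x, float* out) {
      out[0] = sinf(x);    // precise by default; becomes __sinf(x) with --use_fast_math
      out[1] = __sinf(x);  // always the fast hardware intrinsic
    }

    int main() {
      float* out;
      cudaMallocManaged(&out, 2 * sizeof(float));
      fast_vs_precise<<<1, 1>>>(1.5f, out);
      cudaDeviceSynchronize();
      printf("sinf: %f  __sinf: %f\n", out[0], out[1]);
      cudaFree(out);
      return 0;
    }

Built without the flag, the two outputs differ slightly; with the flag, both take the intrinsic path.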

cmake/external/eigen.cmake
Lines changed: 8 additions & 0 deletions

@@ -3,6 +3,14 @@ INCLUDE(ExternalProject)
 SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
 SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
 INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
+if(NOT WITH_FAST_MATH)
+  # EIGEN_FAST_MATH: https://eigen.tuxfamily.org/dox/TopicPreprocessorDirectives.html
+  # enables some optimizations which might affect the accuracy of the result.
+  # It currently enables the SSE vectorization of sin() and cos(),
+  # and speeds up sqrt() for single precision.
+  # Defined to 1 by default. Define it to 0 to disable.
+  add_definitions(-DEIGEN_FAST_MATH=0)
+endif()
 
 if(WITH_AMD_GPU)
   ExternalProject_Add(
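Note: EIGEN_FAST_MATH must be defined before any Eigen header is included, which is why the build injects it via add_definitions. A minimal host-side sketch of pinning the precise path (illustrative, not Paddle code):

    // Define EIGEN_FAST_MATH before any Eigen header; 0 forces the precise
    // sin()/cos() paths, which is what this build does when WITH_FAST_MATH is off.
    #define EIGEN_FAST_MATH 0
    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::ArrayXf x = Eigen::ArrayXf::LinSpaced(4, 0.f, 3.f);
      // With EIGEN_FAST_MATH=1 (Eigen's default) this sin() may use the faster
      // SSE-vectorized approximation; with 0 it calls the precise libm version.
      std::cout << x.sin() << std::endl;
      return 0;
    }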

cmake/flags.cmake
Lines changed: 2 additions & 0 deletions

@@ -157,6 +157,8 @@ if (APPLE)
     # On Mac OS X build fat binaries with x86_64 architectures by default.
     set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
   endif()
+  # On Mac OS X the 'register' storage class specifier is deprecated and causes a warning error on latest clang 10.0
+  set (COMMON_FLAGS -Wno-deprecated-register)
 endif(APPLE)
 
 if(LINUX)
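Note: the suppressed warning comes from pre-C++17 code that still uses the register keyword, as in this contrived sketch (not from the Paddle sources). Recent clang emits -Wdeprecated-register for it, which -Werror builds turn into an error:

    // Compiled as C++11/14, clang warns here:
    //   warning: 'register' storage class specifier is deprecated
    int sum(const int* v, int n) {
      register int s = 0;  // 'register' is deprecated since C++11, removed in C++17
      for (int i = 0; i < n; ++i) s += v[i];
      return s;
    }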

paddle/fluid/framework/rw_lock.h
Lines changed: 1 addition & 0 deletions

@@ -46,6 +46,7 @@ struct RWLock {
  private:
   pthread_rwlock_t lock_;
 };
+// TODO(paddle-dev): Support RWLock for WIN32 for correctness.
 #else
 // https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
 // In windows, rw_lock seems like a hack. Use empty object and do nothing.
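Note: one possible shape for that TODO is wrapping the native Win32 SRWLOCK primitive. The sketch below is an assumption on our part, not part of this commit; the commit itself leaves the WIN32 path as a no-op:

    #include <windows.h>  // SRWLOCK, available since Windows Vista

    // Hypothetical WIN32 RWLock along the lines of the pthread version above.
    struct RWLock {
      RWLock() { InitializeSRWLock(&lock_); }
      void RDLock() { AcquireSRWLockShared(&lock_); }
      void WRLock() { AcquireSRWLockExclusive(&lock_); }
      // NOTE: unlike pthread_rwlock_unlock, SRWLOCK has separate release
      // functions for shared and exclusive acquisition, so the caller must
      // pair each lock with its matching unlock.
      void RDUnlock() { ReleaseSRWLockShared(&lock_); }
      void WRUnlock() { ReleaseSRWLockExclusive(&lock_); }

     private:
      SRWLOCK lock_;
    };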

paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
Lines changed: 3 additions & 0 deletions

@@ -27,6 +27,9 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->device = 0;
   cfg->enable_ir_optim = true;
   cfg->specify_input_name = true;
+#ifdef PADDLE_WITH_MKLDNN
+  cfg->_use_mkldnn = true;
+#endif
 }
 
 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {

paddle/fluid/operators/top_k_op.cu
Lines changed: 64 additions & 27 deletions

@@ -256,36 +256,65 @@ __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
  * 3. go to the second step, until one thread's topk value is null;
  * 4. go to the first step, until the topk value is obtained.
  */
+
 template <typename T, int MaxLength, int BlockSize>
 __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
-                             const T* src, int lds, int dim, int k) {
+                             const T* src, int lds, int dim, int k,
+                             int grid_dim, int num) {
   __shared__ Pair<T> sh_topk[BlockSize];
   __shared__ int maxid[BlockSize / 2];
   const int tid = threadIdx.x;
   const int warp = threadIdx.x / 32;
-  output += blockIdx.x * output_stride;
-  indices += blockIdx.x * k;
 
-  Pair<T> topk[MaxLength];
-  int beam = MaxLength;
-  Pair<T> max;
-  bool is_empty = false;
-  bool firststep = true;
+  const int bid = blockIdx.x;
+  for (int i = bid; i < num; i += grid_dim) {
+    output += i * output_stride;
+    indices += i * k;
+
+    Pair<T> topk[MaxLength];
+    int beam = MaxLength;
+    Pair<T> max;
+    bool is_empty = false;
+    bool firststep = true;
+
+    for (int k = 0; k < MaxLength; k++) {
+      topk[k].set(-INFINITY, -1);
+    }
+    while (k) {
+      ThreadGetTopK<T, MaxLength, BlockSize>(
+          topk, &beam, k, src + i * lds, &firststep, &is_empty, &max, dim, tid);
 
-  for (int k = 0; k < MaxLength; k++) {
-    topk[k].set(-INFINITY, -1);
+      sh_topk[tid] = topk[0];
+      BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output,
+                                           &indices, &beam, &k, tid, warp);
+    }
   }
-  while (k) {
-    ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
-                                           src + blockIdx.x * lds, &firststep,
-                                           &is_empty, &max, dim, tid);
-
-    sh_topk[tid] = topk[0];
-    BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output,
-                                         &indices, &beam, &k, tid, warp);
+}
+
+inline static int GetDesiredBlockDim(int dim) {
+  if (dim > 128) {
+    return 256;
+  } else if (dim > 64) {
+    return 128;
+  } else if (dim > 32) {
+    return 64;
+  } else {
+    return 32;
   }
 }
 
+#define FIXED_BLOCK_DIM_BASE(dim, ...) \
+  case (dim): {                        \
+    constexpr auto kBlockDim = (dim);  \
+    __VA_ARGS__;                       \
+  } break
+
+#define FIXED_BLOCK_DIM(...)                \
+  FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
+
 template <typename T>
 class TopkOpCUDAKernel : public framework::OpKernel<T> {
  public:

@@ -310,18 +339,26 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
     // NOTE: pass lds and dim same to input width.
     // NOTE: old matrix implementation of stride is different to eigen.
     // TODO(typhoonzero): refine this kernel.
-    dim3 threads(256, 1);
-    dim3 grid(input_height, 1);
-
-    KeMatrixTopK<T, 5, 256><<<
-        grid, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                              ctx.device_context())
-                              .stream()>>>(
-        output_data, output->dims()[1], indices_data, input_data, input_width,
-        input_width, static_cast<int>(k));
+    const int kMaxHeight = 2048;
+    int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
+    auto& dev_ctx = ctx.cuda_device_context();
+
+    switch (GetDesiredBlockDim(input_width)) {
+      FIXED_BLOCK_DIM(
+          KeMatrixTopK<T, 5,
+                       kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
+              output_data, output->dims()[1], indices_data, input_data,
+              input_width, input_width, static_cast<int>(k), gridx,
+              input_height));
+      default:
+        PADDLE_THROW("Error");
+    }
   }
 };
 
+#undef FIXED_BLOCK_DIM_BASE
+#undef FIXED_BLOCK_DIM
+
 }  // namespace operators
 }  // namespace paddle
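Note: two reusable patterns appear in this file. The grid-stride loop lets a grid capped at kMaxHeight = 2048 blocks cover any number of rows, and the FIXED_BLOCK_DIM switch turns GetDesiredBlockDim's runtime answer into a compile-time template parameter. A self-contained sketch of both, using a row-sum kernel as a stand-in (illustrative, not Paddle code):

    // Grid-stride loop: each block processes rows bid, bid + gridDim.x, ...,
    // so a capped grid still covers every row.
    template <int BlockSize>
    __global__ void RowSum(const float* in, float* out, int width, int num_rows) {
      __shared__ float buf[BlockSize];
      for (int row = blockIdx.x; row < num_rows; row += gridDim.x) {
        float s = 0.f;
        for (int j = threadIdx.x; j < width; j += BlockSize) s += in[row * width + j];
        buf[threadIdx.x] = s;
        __syncthreads();
        // Tree reduction in shared memory (BlockSize is a power of two).
        for (int stride = BlockSize / 2; stride > 0; stride /= 2) {
          if (threadIdx.x < stride) buf[threadIdx.x] += buf[threadIdx.x + stride];
          __syncthreads();
        }
        if (threadIdx.x == 0) out[row] = buf[0];
        __syncthreads();  // keep buf intact until everyone is done with this row
      }
    }

    // Compile-time block-size dispatch, mirroring the FIXED_BLOCK_DIM idea.
    void LaunchRowSum(const float* in, float* out, int width, int num_rows) {
      const int kMaxBlocks = 2048;
      int grid = num_rows < kMaxBlocks ? num_rows : kMaxBlocks;
      int dim = width > 128 ? 256 : width > 64 ? 128 : width > 32 ? 64 : 32;
      switch (dim) {
        case 256: RowSum<256><<<grid, 256>>>(in, out, width, num_rows); break;
        case 128: RowSum<128><<<grid, 128>>>(in, out, width, num_rows); break;
        case 64:  RowSum<64><<<grid, 64>>>(in, out, width, num_rows);   break;
        default:  RowSum<32><<<grid, 32>>>(in, out, width, num_rows);   break;
      }
    }

The macro in the commit just generates the case labels of such a switch, so one launch expression can be written once and instantiated for each supported block size.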

paddle/fluid/operators/while_op.cc
Lines changed: 6 additions & 4 deletions

@@ -224,10 +224,12 @@ class WhileGradOp : public framework::OperatorBase {
       if (cur_scope_iter == step_scopes->rbegin()) {
         auto *var = (*cur_scope_iter)->FindVar(inside_grad_name);
         PADDLE_ENFORCE_NOT_NULL(var, "Can not find var %s", inside_grad_name);
-        PADDLE_ENFORCE(var->IsType<framework::LoDTensorArray>() ||
-                           var->IsType<LoDTensor>(),
-                       "Currently the type of var only can be LoDTensorArray "
-                       "or LoDTensor.");
+        PADDLE_ENFORCE(
+            var->IsType<framework::LoDTensorArray>() ||
+                var->IsType<LoDTensor>(),
+            "Currently the type of var can only be LoDTensorArray "
+            "or LoDTensor, but the received var[%s] is %s.",
+            inside_grad_name, var->Type().name());
 
         if (var->IsType<LoDTensor>()) {
           auto &inside_tensor = var->Get<framework::LoDTensor>();

paddle/fluid/platform/gpu_info.cc
Lines changed: 5 additions & 2 deletions

@@ -20,8 +20,11 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 
 DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
-              "Default use 92% of GPU memory for PaddlePaddle,"
-              "reserve the rest for page tables, etc");
+              "Allocate a chunk of gpu memory that is this fraction of the "
+              "total gpu memory size. Future memory usage will be allocated "
+              "from the chunk. If the chunk doesn't have enough gpu memory, "
+              "additional chunks of the same size will be requested from gpu "
+              "until the gpu has no memory left for another chunk.");
 
 namespace paddle {
 namespace platform {
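Note: the new flag text describes chunked growth: reserve fraction × total memory up front, then request equally sized chunks as demand grows. A host-side sketch of that arithmetic (illustrative names; not the Paddle allocator):

    #include <cuda_runtime.h>
    #include <vector>

    // Illustrative chunk pool: size each chunk as fraction * total GPU memory,
    // and acquire more chunks as demand outgrows what is already reserved.
    class ChunkPool {
     public:
      explicit ChunkPool(double fraction) {
        size_t free_bytes = 0, total_bytes = 0;
        cudaMemGetInfo(&free_bytes, &total_bytes);
        chunk_size_ = static_cast<size_t>(fraction * total_bytes);
      }

      // Returns false once the GPU cannot supply another full chunk.
      bool GrowByOneChunk() {
        void* p = nullptr;
        if (cudaMalloc(&p, chunk_size_) != cudaSuccess) return false;
        chunks_.push_back(p);
        return true;
      }

      ~ChunkPool() {
        for (void* p : chunks_) cudaFree(p);
      }

     private:
      size_t chunk_size_ = 0;
      std::vector<void*> chunks_;
    };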

paddle/scripts/paddle_build.sh
Lines changed: 3 additions & 4 deletions

@@ -598,7 +598,7 @@ EOF
 EOF
 
     if [[ ${WITH_GPU} == "ON" ]]; then
-        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} &&"
+        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} || true"
     else
         NCCL_DEPS=""
     fi

@@ -614,9 +614,8 @@ EOF
     cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
     ADD python/dist/*.whl /
     # run paddle version to install python packages first
-    RUN apt-get update &&\
-        ${NCCL_DEPS}\
-        apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \
+    RUN apt-get update && ${NCCL_DEPS}
+    RUN apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \
         pip install /*.whl; apt-get install -f -y && \
         apt-get clean -y && \
         rm -f /*.whl && \
