Skip to content

Commit 32ca9fe

Browse files
committed
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into change_manylinux1_Docker
2 parents ba84a6b + d7873e1 commit 32ca9fe

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

44 files changed

+593
-222
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
55
python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
66
python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
77
*.DS_Store
8+
*.vs
89
build/
910
build_doc/
1011
*.user
@@ -15,6 +16,7 @@ build_doc/
1516
.cproject
1617
.pydevproject
1718
.settings/
19+
CMakeSettings.json
1820
Makefile
1921
.test_env/
2022
third_party/

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,11 +204,12 @@ include(external/snappy) # download snappy
204204
include(external/snappystream)
205205
include(external/threadpool)
206206

207-
set(WITH_ANAKIN OFF CACHE STRING "Disable Anakin first, will add it later." FORCE)
208207
if(WITH_GPU)
209208
include(cuda)
210209
include(tensorrt)
211210
include(external/anakin)
211+
elseif()
212+
set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
212213
endif()
213214

214215
include(cudnn) # set cudnn libraries, must before configure

cmake/configure.cmake

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@ if(NOT CMAKE_CROSSCOMPILING)
5656
set(SIMD_FLAG ${SSE3_FLAG})
5757
endif()
5858
endif()
59+
if(UNIX AND NOT APPLE)
60+
# exclude Apple from the Unix-like OS family
61+
set(LINUX TRUE)
62+
endif(UNIX AND NOT APPLE)
5963

6064
if(NOT WITH_GOLANG)
6165
add_definitions(-DPADDLE_WITHOUT_GOLANG)
@@ -104,6 +108,10 @@ if(WITH_GPU)
104108
if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
105109
message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile")
106110
endif()
111+
set(ENV{CUDNN_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR})
112+
set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY})
113+
message(STATUS "cudnn include header is ${CUDNN_INCLUDE_DIR}/cudnn.h")
114+
message(STATUS "cudnn library is ${CUDNN_LIBRARY}")
107115
endif()
108116
elseif(WITH_AMD_GPU)
109117
add_definitions(-DPADDLE_WITH_HIP)

cmake/external/anakin.cmake

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,8 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
3535
ExternalProject_Add(
3636
extern_anakin
3737
${EXTERNAL_PROJECT_LOG_ARGS}
38-
# TODO(luotao): use PaddlePaddle/Anakin later
39-
GIT_REPOSITORY "https://github.com/luotao1/Anakin"
40-
GIT_TAG "3957ae9263eaa0b1986758dac60a88852afb09be"
38+
GIT_REPOSITORY "https://github.com/PaddlePaddle/Anakin"
39+
GIT_TAG "04256ba78fa3da0beb74e8036c8efd68c12824d6"
4140
PREFIX ${ANAKIN_SOURCE_DIR}
4241
UPDATE_COMMAND ""
4342
CMAKE_ARGS -DUSE_GPU_PLACE=YES

paddle/fluid/API.spec

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,10 +155,11 @@ paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale',
155155
paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
156156
paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
157157
paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
158-
paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
159-
paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
158+
paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
159+
paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
160160
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
161161
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
162+
paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
162163
paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
163164
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
164165
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))

paddle/fluid/inference/api/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ cc_library(paddle_inference_tensorrt_subgraph_engine
6060
inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
6161
endif()
6262

63-
if (WITH_ANAKIN) # only needed in CI
63+
if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
6464
# compile the libinference_anakin_api.a and anakin.so.
6565
nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
6666
#nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin)

paddle/fluid/operators/activation_op.cu

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ namespace plat = paddle::platform;
2626
act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
2727
ops::grad_functor<float>>, \
2828
ops::ActivationGradKernel<plat::CUDADeviceContext, \
29-
ops::grad_functor<double>>);
29+
ops::grad_functor<double>>, \
30+
ops::ActivationGradKernel<plat::CUDADeviceContext, \
31+
ops::grad_functor<plat::float16>>);
3032

3133
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);

paddle/fluid/operators/activation_op.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -333,8 +333,7 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
333333
template <typename Device, typename X, typename Out, typename dOut,
334334
typename dX>
335335
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
336-
const Out out_conj = Eigen::numext::conj(out);
337-
dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
336+
dx.device(d) = static_cast<T>(0.5) * dout / out;
338337
}
339338
};
340339

@@ -740,7 +739,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
740739
typename dX>
741740
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
742741
dx.device(d) = dout * static_cast<T>(factor) *
743-
x.pow(static_cast<T>(factor - static_cast<T>(1)));
742+
x.pow(static_cast<T>(factor) - static_cast<T>(1));
744743
}
745744
};
746745

@@ -863,10 +862,11 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
863862
template <typename Device, typename X, typename Out, typename dOut,
864863
typename dX>
865864
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
865+
T b = static_cast<T>(beta);
866866
auto temp1 = static_cast<T>(1) /
867-
(static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
868-
auto temp2 = temp1 * (static_cast<T>(1) - (beta * out));
869-
dx.device(d) = dout * ((beta * out) + temp2);
867+
(static_cast<T>(1) + (static_cast<T>(-b) * x).exp());
868+
auto temp2 = temp1 * (static_cast<T>(1) - (b * out));
869+
dx.device(d) = dout * ((b * out) + temp2);
870870
}
871871
};
872872

paddle/fluid/operators/assign_value_op.cu.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
1313
limitations under the License. */
1414

1515
#include "paddle/fluid/operators/assign_value_op.h"
16+
#include "paddle/fluid/platform/float16.h"
1617

1718
namespace ops = paddle::operators;
19+
namespace plat = paddle::platform;
1820
REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel<int>,
19-
ops::AssignValueKernel<float>);
21+
ops::AssignValueKernel<float>,
22+
ops::AssignValueKernel<plat::float16>);

paddle/fluid/operators/conv_cudnn_op.cu.cc

Lines changed: 36 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,27 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
3939
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
4040
static_cast<size_t>(1024) * 1024 * 1024;
4141

42+
template <typename T, typename DeviceContext>
43+
// bool EnableFp16(const T& dummy, const DeviceContext& dev_ctx,
44+
bool EnableFp16(const DeviceContext& dev_ctx,
45+
cudnnConvolutionDescriptor_t cudnn_conv_desc) {
46+
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
47+
// Tensor core is supported since the volta GPU and
48+
// is only enabled when input and filter data are float16
49+
if (dev_ctx.GetComputeCapability() >= 70 &&
50+
std::type_index(typeid(T)) ==
51+
std::type_index(typeid(platform::float16))) {
52+
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
53+
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
54+
return true;
55+
} else {
56+
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
57+
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
58+
}
59+
#endif
60+
return false;
61+
}
62+
4263
template <typename T>
4364
class CUDNNConvOpKernel : public framework::OpKernel<T> {
4465
public:
@@ -128,27 +149,14 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
128149
cudnnConvolutionFwdAlgo_t algo;
129150
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
130151
auto handle = dev_ctx.cudnn_handle();
131-
132-
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
133-
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
134-
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
135-
workspace_size_limit, &algo));
136-
137-
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
138-
// Tensor core is supported since the volta GPU and
139-
// is only enabled when input and filter data are float16
140-
if (dev_ctx.GetComputeCapability() >= 70 &&
141-
std::type_index(typeid(T)) ==
142-
std::type_index(typeid(platform::float16))) {
143-
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
144-
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
145-
// Currently tensor core is only enabled using this algo
152+
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
146153
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
147154
} else {
148-
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
149-
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
155+
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
156+
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
157+
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
158+
workspace_size_limit, &algo));
150159
}
151-
#endif
152160

153161
// get workspace size able to allocate
154162
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
@@ -288,6 +296,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
288296
} else {
289297
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
290298
}
299+
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
300+
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
301+
}
291302

292303
CUDNN_ENFORCE(
293304
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
@@ -307,6 +318,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
307318
} else {
308319
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
309320
}
321+
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
322+
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
323+
}
310324

311325
CUDNN_ENFORCE(
312326
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
@@ -362,12 +376,14 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
362376
paddle::operators::CUDNNConvOpKernel<plat::float16>);
363377
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
364378
paddle::operators::CUDNNConvGradOpKernel<float>,
365-
paddle::operators::CUDNNConvGradOpKernel<double>);
379+
paddle::operators::CUDNNConvGradOpKernel<double>,
380+
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
366381

367382
REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
368383
paddle::operators::CUDNNConvOpKernel<float>,
369384
paddle::operators::CUDNNConvOpKernel<double>,
370385
paddle::operators::CUDNNConvOpKernel<plat::float16>);
371386
REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
372387
paddle::operators::CUDNNConvGradOpKernel<float>,
373-
paddle::operators::CUDNNConvGradOpKernel<double>);
388+
paddle::operators::CUDNNConvGradOpKernel<double>,
389+
paddle::operators::CUDNNConvGradOpKernel<plat::float16>)

0 commit comments

Comments
 (0)