Skip to content

Commit 216c621

Browse files
authored
Merge branch 'release/0.15.0' into 015_for_dqn_local
2 parents 84f3291 + ef9029d commit 216c621

33 files changed

+121
-338
lines changed

cmake/configure.cmake

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,10 +103,12 @@ if(WITH_GPU)
103103
endif()
104104
if(WITH_ANAKIN)
105105
if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
106-
message(FATAL_ERROR "Anakin needs CUDA >= 8.0 to compile")
106+
message(WARNING "Anakin needs CUDA >= 8.0 to compile. Force WITH_ANAKIN=OFF")
107+
set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDA >= 8.0." FORCE)
107108
endif()
108109
if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
109-
message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile")
110+
message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF")
111+
set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE)
110112
endif()
111113
set(ENV{CUDNN_INCLUDE_DIR} "${CUDNN_INCLUDE_DIR}/")
112114
get_filename_component(CUDNN_LIBRARY_DIR ${CUDNN_LIBRARY} DIRECTORY)

paddle/fluid/inference/api/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,13 @@ endif()
6262

6363
if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
6464
# compile the libinference_anakin_api.a and anakin.so.
65-
nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
66-
#nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin)
65+
cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
66+
cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
6767
function(anakin_target target_name)
6868
target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
6969
endfunction()
7070
anakin_target(inference_anakin_api)
71-
#anakin_target(inference_anakin_api_shared)
71+
anakin_target(inference_anakin_api_shared)
7272
if (WITH_TESTING)
7373
cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
7474
ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin

paddle/fluid/operators/activation_op.cu

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,6 @@ namespace plat = paddle::platform;
2626
act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
2727
ops::grad_functor<float>>, \
2828
ops::ActivationGradKernel<plat::CUDADeviceContext, \
29-
ops::grad_functor<double>>, \
30-
ops::ActivationGradKernel<plat::CUDADeviceContext, \
31-
ops::grad_functor<plat::float16>>);
29+
ops::grad_functor<double>>);
3230

3331
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);

paddle/fluid/operators/activation_op.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,8 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
333333
template <typename Device, typename X, typename Out, typename dOut,
334334
typename dX>
335335
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
336-
dx.device(d) = static_cast<T>(0.5) * dout / out;
336+
const Out out_conj = Eigen::numext::conj(out);
337+
dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
337338
}
338339
};
339340

@@ -739,7 +740,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
739740
typename dX>
740741
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
741742
dx.device(d) = dout * static_cast<T>(factor) *
742-
x.pow(static_cast<T>(factor) - static_cast<T>(1));
743+
x.pow(static_cast<T>(factor - static_cast<T>(1)));
743744
}
744745
};
745746

@@ -862,11 +863,10 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
862863
template <typename Device, typename X, typename Out, typename dOut,
863864
typename dX>
864865
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
865-
T b = static_cast<T>(beta);
866866
auto temp1 = static_cast<T>(1) /
867-
(static_cast<T>(1) + (static_cast<T>(-b) * x).exp());
868-
auto temp2 = temp1 * (static_cast<T>(1) - (b * out));
869-
dx.device(d) = dout * ((b * out) + temp2);
867+
(static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
868+
auto temp2 = temp1 * (static_cast<T>(1) - (beta * out));
869+
dx.device(d) = dout * ((beta * out) + temp2);
870870
}
871871
};
872872

paddle/fluid/operators/assign_value_op.cu.cc

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,7 @@ See the License for the specific language governing permissions and
1313
limitations under the License. */
1414

1515
#include "paddle/fluid/operators/assign_value_op.h"
16-
#include "paddle/fluid/platform/float16.h"
1716

1817
namespace ops = paddle::operators;
19-
namespace plat = paddle::platform;
2018
REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel<int>,
21-
ops::AssignValueKernel<float>,
22-
ops::AssignValueKernel<plat::float16>);
19+
ops::AssignValueKernel<float>);

paddle/fluid/operators/conv_cudnn_op.cu.cc

Lines changed: 20 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -39,27 +39,6 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
3939
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
4040
static_cast<size_t>(1024) * 1024 * 1024;
4141

42-
template <typename T, typename DeviceContext>
43-
// bool EnableFp16(const T& dummy, const DeviceContext& dev_ctx,
44-
bool EnableFp16(const DeviceContext& dev_ctx,
45-
cudnnConvolutionDescriptor_t cudnn_conv_desc) {
46-
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
47-
// Tensor core is supported since the volta GPU and
48-
// is only enabled when input and filter data are float16
49-
if (dev_ctx.GetComputeCapability() >= 70 &&
50-
std::type_index(typeid(T)) ==
51-
std::type_index(typeid(platform::float16))) {
52-
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
53-
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
54-
return true;
55-
} else {
56-
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
57-
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
58-
}
59-
#endif
60-
return false;
61-
}
62-
6342
template <typename T>
6443
class CUDNNConvOpKernel : public framework::OpKernel<T> {
6544
public:
@@ -149,14 +128,27 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
149128
cudnnConvolutionFwdAlgo_t algo;
150129
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
151130
auto handle = dev_ctx.cudnn_handle();
152-
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
131+
132+
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
133+
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
134+
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
135+
workspace_size_limit, &algo));
136+
137+
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
138+
// Tensor core is supported since the volta GPU and
139+
// is only enabled when input and filter data are float16
140+
if (dev_ctx.GetComputeCapability() >= 70 &&
141+
std::type_index(typeid(T)) ==
142+
std::type_index(typeid(platform::float16))) {
143+
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
144+
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
145+
// Currently tensor core is only enabled using this algo
153146
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
154147
} else {
155-
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
156-
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
157-
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
158-
workspace_size_limit, &algo));
148+
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
149+
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
159150
}
151+
#endif
160152

161153
// get workspace size able to allocate
162154
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
@@ -296,9 +288,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
296288
} else {
297289
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
298290
}
299-
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
300-
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
301-
}
302291

303292
CUDNN_ENFORCE(
304293
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
@@ -318,9 +307,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
318307
} else {
319308
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
320309
}
321-
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
322-
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
323-
}
324310

325311
CUDNN_ENFORCE(
326312
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
@@ -376,14 +362,12 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
376362
paddle::operators::CUDNNConvOpKernel<plat::float16>);
377363
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
378364
paddle::operators::CUDNNConvGradOpKernel<float>,
379-
paddle::operators::CUDNNConvGradOpKernel<double>,
380-
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
365+
paddle::operators::CUDNNConvGradOpKernel<double>);
381366

382367
REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
383368
paddle::operators::CUDNNConvOpKernel<float>,
384369
paddle::operators::CUDNNConvOpKernel<double>,
385370
paddle::operators::CUDNNConvOpKernel<plat::float16>);
386371
REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
387372
paddle::operators::CUDNNConvGradOpKernel<float>,
388-
paddle::operators::CUDNNConvGradOpKernel<double>,
389-
paddle::operators::CUDNNConvGradOpKernel<plat::float16>)
373+
paddle::operators::CUDNNConvGradOpKernel<double>);

paddle/fluid/operators/cross_entropy_op.cu

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,12 @@ See the License for the specific language governing permissions and
1313
limitations under the License. */
1414

1515
#include "paddle/fluid/operators/cross_entropy_op.h"
16-
#include "paddle/fluid/platform/float16.h"
1716

1817
namespace ops = paddle::operators;
19-
namespace plat = paddle::platform;
2018
using CUDACtx = paddle::platform::CUDADeviceContext;
2119
REGISTER_OP_CUDA_KERNEL(cross_entropy,
2220
ops::CrossEntropyOpKernel<CUDACtx, float>,
23-
ops::CrossEntropyOpKernel<CUDACtx, double>,
24-
ops::CrossEntropyOpKernel<CUDACtx, plat::float16>);
25-
REGISTER_OP_CUDA_KERNEL(
26-
cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
27-
ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
28-
ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
21+
ops::CrossEntropyOpKernel<CUDACtx, double>);
22+
REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
23+
ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
24+
ops::CrossEntropyGradientOpKernel<CUDACtx, double>);

paddle/fluid/operators/elementwise_add_op.cu

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,5 +30,4 @@ REGISTER_OP_CUDA_KERNEL(
3030
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
3131
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
3232
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
33-
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>,
34-
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, plat::float16>);
33+
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);

paddle/fluid/operators/elementwise_div_op.cu

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,24 +14,19 @@ limitations under the License. */
1414

1515
#define EIGEN_USE_GPU
1616
#include "paddle/fluid/operators/elementwise_div_op.h"
17-
#include "paddle/fluid/platform/float16.h"
1817

1918
namespace ops = paddle::operators;
20-
namespace plat = paddle::platform;
2119

2220
REGISTER_OP_CUDA_KERNEL(
2321
elementwise_div,
2422
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
2523
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
2624
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
27-
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>,
28-
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
29-
plat::float16>);
25+
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
3026
REGISTER_OP_CUDA_KERNEL(
3127
elementwise_div_grad,
3228
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
3329
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
3430
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
35-
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
3631
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
37-
plat::float16>);
32+
int64_t>);

paddle/fluid/operators/elementwise_mul_op.cu

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,25 +14,19 @@ limitations under the License. */
1414

1515
#define EIGEN_USE_GPU
1616
#include "paddle/fluid/operators/elementwise_mul_op.h"
17-
#include "paddle/fluid/platform/float16.h"
1817

1918
namespace ops = paddle::operators;
20-
namespace plat = paddle::platform;
2119

2220
REGISTER_OP_CUDA_KERNEL(
2321
elementwise_mul,
2422
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>,
2523
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>,
2624
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>,
27-
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>,
28-
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext,
29-
plat::float16>);
25+
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>);
3026
REGISTER_OP_CUDA_KERNEL(
3127
elementwise_mul_grad,
3228
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>,
3329
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>,
3430
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>,
35-
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
36-
plat::float16>,
3731
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
3832
int64_t>);

0 commit comments

Comments
 (0)