PaddlePaddle
diff --git a/‎paddle/fluid/operators/activation_op.cu
Lines changed: 1 addition & 3 deletions b/‎paddle/fluid/operators/activation_op.cu
Lines changed: 1 addition & 3 deletions
diff --git a/‎paddle/fluid/operators/activation_op.h
Lines changed: 6 additions & 6 deletions b/‎paddle/fluid/operators/activation_op.h
Lines changed: 6 additions & 6 deletions
diff --git a/‎paddle/fluid/operators/assign_value_op.cu.cc
Lines changed: 1 addition & 4 deletions b/‎paddle/fluid/operators/assign_value_op.cu.cc
Lines changed: 1 addition & 4 deletions
diff --git a/‎paddle/fluid/operators/conv_cudnn_op.cu.cc
Lines changed: 20 additions & 36 deletions b/‎paddle/fluid/operators/conv_cudnn_op.cu.cc
Lines changed: 20 additions & 36 deletions
diff --git a/‎paddle/fluid/operators/cross_entropy_op.cu
Lines changed: 4 additions & 8 deletions b/‎paddle/fluid/operators/cross_entropy_op.cu
Lines changed: 4 additions & 8 deletions
diff --git a/‎paddle/fluid/operators/elementwise_add_op.cu
Lines changed: 1 addition & 2 deletions b/‎paddle/fluid/operators/elementwise_add_op.cu
Lines changed: 1 addition & 2 deletions
diff --git a/‎paddle/fluid/operators/elementwise_div_op.cu
Lines changed: 2 additions & 7 deletions b/‎paddle/fluid/operators/elementwise_div_op.cu
Lines changed: 2 additions & 7 deletions
diff --git a/‎paddle/fluid/operators/elementwise_mul_op.cu
Lines changed: 1 addition & 7 deletions b/‎paddle/fluid/operators/elementwise_mul_op.cu
Lines changed: 1 addition & 7 deletions
diff --git a/‎paddle/fluid/operators/elementwise_op_function.h
Lines changed: 2 additions & 2 deletions b/‎paddle/fluid/operators/elementwise_op_function.h
Lines changed: 2 additions & 2 deletions
diff --git a/‎paddle/fluid/operators/elementwise_sub_op.cu
Lines changed: 1 addition & 7 deletions b/‎paddle/fluid/operators/elementwise_sub_op.cu
Lines changed: 1 addition & 7 deletions
@@ -26,8 +26,6 @@ namespace plat = paddle::platform;
       act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,   \
                                                  ops::grad_functor<float>>, \
       ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
-                                ops::grad_functor<double>>,                 \
-      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
-                                ops::grad_functor<plat::float16>>);
+                                ops::grad_functor<double>>);
 
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
@@ -333,7 +333,8 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = static_cast<T>(0.5) * dout / out;
+    const Out out_conj = Eigen::numext::conj(out);
+    dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
   }
 };
 
@@ -739,7 +740,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = dout * static_cast<T>(factor) *
-                   x.pow(static_cast<T>(factor) - static_cast<T>(1));
+                   x.pow(static_cast<T>(factor - static_cast<T>(1)));
   }
 };
 
@@ -862,11 +863,10 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    T b = static_cast<T>(beta);
     auto temp1 = static_cast<T>(1) /
-                 (static_cast<T>(1) + (static_cast<T>(-b) * x).exp());
-    auto temp2 = temp1 * (static_cast<T>(1) - (b * out));
-    dx.device(d) = dout * ((b * out) + temp2);
+                 (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
+    auto temp2 = temp1 * (static_cast<T>(1) - (beta * out));
+    dx.device(d) = dout * ((beta * out) + temp2);
   }
 };
 
 
@@ -13,10 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/assign_value_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel<int>,
-                        ops::AssignValueKernel<float>,
-                        ops::AssignValueKernel<plat::float16>);
+                        ops::AssignValueKernel<float>);
@@ -39,27 +39,6 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
     static_cast<size_t>(1024) * 1024 * 1024;
 
-template <typename T, typename DeviceContext>
-// bool EnableFp16(const T& dummy, const DeviceContext& dev_ctx,
-bool EnableFp16(const DeviceContext& dev_ctx,
-                cudnnConvolutionDescriptor_t cudnn_conv_desc) {
-#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
-  // Tensor core is supported since the volta GPU and
-  // is only enabled when input and filter data are float16
-  if (dev_ctx.GetComputeCapability() >= 70 &&
-      std::type_index(typeid(T)) ==
-          std::type_index(typeid(platform::float16))) {
-    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-        cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
-    return true;
-  } else {
-    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-        cudnn_conv_desc, CUDNN_DEFAULT_MATH));
-  }
-#endif
-  return false;
-}
-
 template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
  public:
@@ -149,14 +128,27 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionFwdAlgo_t algo;
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
-    if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
+
+    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+        cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+        workspace_size_limit, &algo));
+
+#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
+    // Tensor core is supported since the volta GPU and
+    // is only enabled when input and filter data are float16
+    if (dev_ctx.GetComputeCapability() >= 70 &&
+        std::type_index(typeid(T)) ==
+            std::type_index(typeid(platform::float16))) {
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
+      // Currently tensor core is only enabled using this algo
       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
     } else {
-      PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-          cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &algo));
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_DEFAULT_MATH));
     }
+#endif
 
     // get workspace size able to allocate
     CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
@@ -296,9 +288,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       } else {
         data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
       }
-      if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
-        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
-      }
 
       CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
@@ -318,9 +307,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       } else {
         filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
       }
-      if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
-        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
-      }
 
       CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
@@ -376,14 +362,12 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>,
-                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
+                   paddle::operators::CUDNNConvGradOpKernel<double>);
 
 REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<float>,
                    paddle::operators::CUDNNConvOpKernel<double>,
                    paddle::operators::CUDNNConvOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>,
-                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>)
+                   paddle::operators::CUDNNConvGradOpKernel<double>);
@@ -13,16 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cross_entropy_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 using CUDACtx = paddle::platform::CUDADeviceContext;
 REGISTER_OP_CUDA_KERNEL(cross_entropy,
                         ops::CrossEntropyOpKernel<CUDACtx, float>,
-                        ops::CrossEntropyOpKernel<CUDACtx, double>,
-                        ops::CrossEntropyOpKernel<CUDACtx, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
-    ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
-    ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
+                        ops::CrossEntropyOpKernel<CUDACtx, double>);
+REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
+                        ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
+                        ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
@@ -30,5 +30,4 @@ REGISTER_OP_CUDA_KERNEL(
     ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
     ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
     ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, plat::float16>);
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);
@@ -14,24 +14,19 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_div_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
     elementwise_div,
     ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
-                              plat::float16>);
+    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     elementwise_div_grad,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
-                                  plat::float16>);
+                                  int64_t>);
@@ -14,25 +14,19 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_mul_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
     elementwise_mul,
     ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext,
-                              plat::float16>);
+    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     elementwise_mul_grad,
     ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
-                                  plat::float16>,
     ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
                                   int64_t>);
@@ -350,7 +350,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
   int j = blockIdx.x;
   int i = threadIdx.x;
   int tid = threadIdx.x;
-  T val(0);
+  T val = 0;
 
   do {
     int x_offset = i * w + j;
@@ -418,7 +418,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
   int tid = threadIdx.x;
   int j = blockIdx.x;
 
-  T val(0);
+  T val = 0;
   int ttid = tid;
 
   while (true) {
 
@@ -14,25 +14,19 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_sub_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
     elementwise_sub,
     ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
-                              plat::float16>);
+    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     elementwise_sub_grad,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
-                                  plat::float16>,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
                                   int64_t>);