PaddlePaddle
diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc
Lines changed: 18 additions & 12 deletions
diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h
Lines changed: 8 additions & 3 deletions
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
Lines changed: 3 additions & 1 deletion
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
Lines changed: 2 additions & 3 deletions
diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc
Lines changed: 12 additions & 9 deletions
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
Lines changed: 4 additions & 1 deletion
diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu
Lines changed: 9 additions & 4 deletions
diff --git a/paddle/fluid/operators/elementwise_add_op.cu b/paddle/fluid/operators/elementwise_add_op.cu
Lines changed: 2 additions & 1 deletion
diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h
Lines changed: 2 additions & 2 deletions
diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu
Lines changed: 16 additions & 6 deletions
@@ -57,10 +57,10 @@ ThreadPool::ThreadPool(int num_threads) : running_(true) {
 ThreadPool::~ThreadPool() {
   {
     // notify all threads to stop running
-    std::lock_guard<std::mutex> l(mutex_);
+    std::unique_lock<std::mutex> l(mutex_);
     running_ = false;
-    scheduled_.notify_all();
   }
+  scheduled_.notify_all();
 
   for (auto& t : threads_) {
     t->join();
@@ -70,19 +70,25 @@ ThreadPool::~ThreadPool() {
 
 void ThreadPool::TaskLoop() {
   while (true) {
-    std::unique_lock<std::mutex> lock(mutex_);
+    Task task;
 
-    scheduled_.wait(
-        lock, [this] { return !this->tasks_.empty() || !this->running_; });
+    {
+      std::unique_lock<std::mutex> lock(mutex_);
+      scheduled_.wait(
+          lock, [this] { return !this->tasks_.empty() || !this->running_; });
 
-    if (!running_ || tasks_.empty()) {
-      return;
-    }
+      if (!running_ && tasks_.empty()) {
+        return;
+      }
+
+      if (tasks_.empty()) {
+        PADDLE_THROW("This thread has no task to Run");
+      }
 
-    // pop a task from the task queue
-    auto task = std::move(tasks_.front());
-    tasks_.pop();
-    lock.unlock();
+      // pop a task from the task queue
+      task = std::move(tasks_.front());
+      tasks_.pop();
+    }
 
     // run the task
     task();
 
@@ -58,7 +58,7 @@ class ThreadPool {
   ~ThreadPool();
 
   // Run pushes a function to the task queue and returns a std::future
-  // object.  To wait for the completion of the task, call
+  // object. To wait for the completion of the task, call
   // std::future::wait().
   template <typename Callback>
   std::future<void> Run(Callback fn) {
@@ -69,7 +69,6 @@ class ThreadPool {
   template <typename Callback>
   std::future<std::unique_ptr<platform::EnforceNotMet>> RunAndGetException(
       Callback fn) {
-    std::unique_lock<std::mutex> lock(mutex_);
     Task task([fn]() -> std::unique_ptr<platform::EnforceNotMet> {
       try {
         fn();
@@ -84,7 +83,13 @@ class ThreadPool {
       return nullptr;
     });
     std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
-    tasks_.push(std::move(task));
+    {
+      std::unique_lock<std::mutex> lock(mutex_);
+      if (!running_) {
+        PADDLE_THROW("enqueue on stopped ThreadPool");
+      }
+      tasks_.push(std::move(task));
+    }
     scheduled_.notify_one();
     return f;
   }
 
@@ -26,6 +26,8 @@ namespace plat = paddle::platform;
       act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,   \
                                                  ops::grad_functor<float>>, \
       ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
-                                ops::grad_functor<double>>);
+                                ops::grad_functor<double>>,                 \
+      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
+                                ops::grad_functor<plat::float16>>);
 
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
@@ -333,8 +333,7 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    const Out out_conj = Eigen::numext::conj(out);
-    dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
+    dx.device(d) = static_cast<T>(0.5) * dout / out;
   }
 };
 
@@ -740,7 +739,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = dout * static_cast<T>(factor) *
-                   x.pow(static_cast<T>(factor - static_cast<T>(1)));
+                   x.pow(static_cast<T>(factor) - static_cast<T>(1));
   }
 };
 
 
@@ -219,8 +219,8 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
 
     d_x->mutable_data<T>(ctx.GetPlace());
-    d_scale->mutable_data<T>(ctx.GetPlace());
-    d_bias->mutable_data<T>(ctx.GetPlace());
+    d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
 
     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     if ((N * H * W * D) == 1) {
@@ -272,19 +272,21 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
 
     const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
     const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
-    const void *saved_mean_data = saved_mean->template data<T>();
-    const void *saved_var_data = saved_var->template data<T>();
+    const void *saved_mean_data =
+        saved_mean->template data<BatchNormParamType<T>>();
+    const void *saved_var_data =
+        saved_var->template data<BatchNormParamType<T>>();
 
     CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
         dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
         CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
         CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
         data_desc_, d_y->template data<T>(), data_desc_,
         d_x->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
-        scale->template data<T>(),
-        d_scale->template mutable_data<T>(ctx.GetPlace()),
-        d_bias->template mutable_data<T>(ctx.GetPlace()), epsilon,
-        saved_mean_data, saved_var_data));
+        scale->template data<BatchNormParamType<T>>(),
+        d_scale->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
+        d_bias->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
+        epsilon, saved_mean_data, saved_var_data));
 
     // clean when exit.
     CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
@@ -304,4 +306,5 @@ REGISTER_OP_CUDA_KERNEL(
     ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
     batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
-    ops::BatchNormGradKernel<plat::CUDADeviceContext, double>);
+    ops::BatchNormGradKernel<plat::CUDADeviceContext, double>,
+    ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
@@ -143,9 +143,11 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
       // Currently tensor core is only enabled using this algo
       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+      VLOG(5) << "use cudnn_tensor_op_math";
     } else {
       CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
           cudnn_conv_desc, CUDNN_DEFAULT_MATH));
+      VLOG(5) << "NOT use cudnn_tensor_op_math";
     }
 #endif
 
@@ -361,7 +363,8 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>);
+                   paddle::operators::CUDNNConvGradOpKernel<double>,
+                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
 
 REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<float>,
 
@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cross_entropy_op.h"
+#include "paddle/fluid/platform/float16.h"
 
+namespace plat = paddle::platform;
 namespace ops = paddle::operators;
 using CUDACtx = paddle::platform::CUDADeviceContext;
 REGISTER_OP_CUDA_KERNEL(cross_entropy,
                         ops::CrossEntropyOpKernel<CUDACtx, float>,
-                        ops::CrossEntropyOpKernel<CUDACtx, double>);
-REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
-                        ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
-                        ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
+                        ops::CrossEntropyOpKernel<CUDACtx, double>,
+                        ops::CrossEntropyOpKernel<CUDACtx, plat::float16>);
+
+REGISTER_OP_CUDA_KERNEL(
+    cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
+    ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
+    ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
@@ -30,4 +30,5 @@ REGISTER_OP_CUDA_KERNEL(
     ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
     ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
     ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>,
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, plat::float16>);
@@ -365,7 +365,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
   int j = blockIdx.x;
   int i = threadIdx.x;
   int tid = threadIdx.x;
-  T val = 0;
+  T val(0);
 
   do {
     int x_offset = i * w + j;
@@ -433,7 +433,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
   int tid = threadIdx.x;
   int j = blockIdx.x;
 
-  T val = 0;
+  T val(0);
   int ttid = tid;
 
   while (true) {
 
@@ -21,6 +21,16 @@ namespace operators {
 namespace math {
 
 namespace {
+
+__device__ __forceinline__ float real_log(float x) { return logf(x); }
+
+__device__ __forceinline__ double real_log(double x) { return log(x); }
+
+__device__ __forceinline__ platform::float16 real_log(
+    const platform::float16& val) {
+  return static_cast<platform::float16>(hlog(static_cast<half>(val)));
+}
+
 template <typename T>
 __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
                                    const int N, const int D,
@@ -29,21 +39,21 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
        i += blockDim.x * gridDim.x) {
     PADDLE_ASSERT(label[i] >= 0 && label[i] < D || label[i] == ignore_index);
     Y[i] = ignore_index == label[i]
-               ? 0
-               : -math::TolerableValue<T>()(log(X[i * D + label[i]]));
+               ? static_cast<T>(0)
+               : -math::TolerableValue<T>()(real_log(X[i * D + label[i]]));
   }
 }
 
 template <typename T>
 __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
                                        const int class_num) {
   int tid = threadIdx.x;
-  T val = 0;
+  T val(0);
 
   int idx = blockIdx.x * class_num + tid;
   int end = blockIdx.x * class_num + class_num;
   for (; idx < end; idx += blockDim.x) {
-    val += math::TolerableValue<T>()(std::log(X[idx])) * label[idx];
+    val += math::TolerableValue<T>()(real_log(X[idx])) * label[idx];
   }
 
   val = paddle::platform::reduceSum(val, tid, blockDim.x);
@@ -53,8 +63,6 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
 }
 }  // namespace
 
-using Tensor = framework::Tensor;
-
 template <typename T>
 class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
  public:
@@ -89,6 +97,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
 
 template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
 template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
+template class CrossEntropyFunctor<platform::CUDADeviceContext,
+                                   platform::float16>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle