Skip to content

Commit 25e070e

Browse files
committed
Merge remote-tracking branch 'ups/develop' into fea/jit/vadd
2 parents cb4083b + ea8984c commit 25e070e

35 files changed

+800
-981
lines changed

paddle/fluid/framework/threadpool.cc

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -57,10 +57,10 @@ ThreadPool::ThreadPool(int num_threads) : running_(true) {
5757
ThreadPool::~ThreadPool() {
5858
{
5959
// notify all threads to stop running
60-
std::lock_guard<std::mutex> l(mutex_);
60+
std::unique_lock<std::mutex> l(mutex_);
6161
running_ = false;
62-
scheduled_.notify_all();
6362
}
63+
scheduled_.notify_all();
6464

6565
for (auto& t : threads_) {
6666
t->join();
@@ -70,19 +70,25 @@ ThreadPool::~ThreadPool() {
7070

7171
void ThreadPool::TaskLoop() {
7272
while (true) {
73-
std::unique_lock<std::mutex> lock(mutex_);
73+
Task task;
7474

75-
scheduled_.wait(
76-
lock, [this] { return !this->tasks_.empty() || !this->running_; });
75+
{
76+
std::unique_lock<std::mutex> lock(mutex_);
77+
scheduled_.wait(
78+
lock, [this] { return !this->tasks_.empty() || !this->running_; });
7779

78-
if (!running_ || tasks_.empty()) {
79-
return;
80-
}
80+
if (!running_ && tasks_.empty()) {
81+
return;
82+
}
83+
84+
if (tasks_.empty()) {
85+
PADDLE_THROW("This thread has no task to Run");
86+
}
8187

82-
// pop a task from the task queue
83-
auto task = std::move(tasks_.front());
84-
tasks_.pop();
85-
lock.unlock();
88+
// pop a task from the task queue
89+
task = std::move(tasks_.front());
90+
tasks_.pop();
91+
}
8692

8793
// run the task
8894
task();

paddle/fluid/framework/threadpool.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ class ThreadPool {
5858
~ThreadPool();
5959

6060
// Run pushes a function to the task queue and returns a std::future
61-
// object. To wait for the completion of the task, call
61+
// object. To wait for the completion of the task, call
6262
// std::future::wait().
6363
template <typename Callback>
6464
std::future<void> Run(Callback fn) {
@@ -69,7 +69,6 @@ class ThreadPool {
6969
template <typename Callback>
7070
std::future<std::unique_ptr<platform::EnforceNotMet>> RunAndGetException(
7171
Callback fn) {
72-
std::unique_lock<std::mutex> lock(mutex_);
7372
Task task([fn]() -> std::unique_ptr<platform::EnforceNotMet> {
7473
try {
7574
fn();
@@ -84,7 +83,13 @@ class ThreadPool {
8483
return nullptr;
8584
});
8685
std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
87-
tasks_.push(std::move(task));
86+
{
87+
std::unique_lock<std::mutex> lock(mutex_);
88+
if (!running_) {
89+
PADDLE_THROW("enqueue on stopped ThreadPool");
90+
}
91+
tasks_.push(std::move(task));
92+
}
8893
scheduled_.notify_one();
8994
return f;
9095
}

paddle/fluid/operators/activation_op.cu

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ namespace plat = paddle::platform;
2626
act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
2727
ops::grad_functor<float>>, \
2828
ops::ActivationGradKernel<plat::CUDADeviceContext, \
29-
ops::grad_functor<double>>);
29+
ops::grad_functor<double>>, \
30+
ops::ActivationGradKernel<plat::CUDADeviceContext, \
31+
ops::grad_functor<plat::float16>>);
3032

3133
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);

paddle/fluid/operators/activation_op.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -333,8 +333,7 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
333333
template <typename Device, typename X, typename Out, typename dOut,
334334
typename dX>
335335
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
336-
const Out out_conj = Eigen::numext::conj(out);
337-
dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
336+
dx.device(d) = static_cast<T>(0.5) * dout / out;
338337
}
339338
};
340339

@@ -740,7 +739,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
740739
typename dX>
741740
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
742741
dx.device(d) = dout * static_cast<T>(factor) *
743-
x.pow(static_cast<T>(factor - static_cast<T>(1)));
742+
x.pow(static_cast<T>(factor) - static_cast<T>(1));
744743
}
745744
};
746745

paddle/fluid/operators/batch_norm_op.cu.cc

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -219,8 +219,8 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
219219
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
220220

221221
d_x->mutable_data<T>(ctx.GetPlace());
222-
d_scale->mutable_data<T>(ctx.GetPlace());
223-
d_bias->mutable_data<T>(ctx.GetPlace());
222+
d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
223+
d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
224224

225225
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
226226
if ((N * H * W * D) == 1) {
@@ -272,19 +272,21 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
272272

273273
const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
274274
const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
275-
const void *saved_mean_data = saved_mean->template data<T>();
276-
const void *saved_var_data = saved_var->template data<T>();
275+
const void *saved_mean_data =
276+
saved_mean->template data<BatchNormParamType<T>>();
277+
const void *saved_var_data =
278+
saved_var->template data<BatchNormParamType<T>>();
277279

278280
CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
279281
dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
280282
CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
281283
CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
282284
data_desc_, d_y->template data<T>(), data_desc_,
283285
d_x->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
284-
scale->template data<T>(),
285-
d_scale->template mutable_data<T>(ctx.GetPlace()),
286-
d_bias->template mutable_data<T>(ctx.GetPlace()), epsilon,
287-
saved_mean_data, saved_var_data));
286+
scale->template data<BatchNormParamType<T>>(),
287+
d_scale->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
288+
d_bias->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
289+
epsilon, saved_mean_data, saved_var_data));
288290

289291
// clean when exit.
290292
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
@@ -304,4 +306,5 @@ REGISTER_OP_CUDA_KERNEL(
304306
ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
305307
REGISTER_OP_CUDA_KERNEL(
306308
batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
307-
ops::BatchNormGradKernel<plat::CUDADeviceContext, double>);
309+
ops::BatchNormGradKernel<plat::CUDADeviceContext, double>,
310+
ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);

paddle/fluid/operators/conv_cudnn_op.cu.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,9 +143,11 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
143143
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
144144
// Currently tensor core is only enabled using this algo
145145
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
146+
VLOG(5) << "use cudnn_tensor_op_math";
146147
} else {
147148
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
148149
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
150+
VLOG(5) << "NOT use cudnn_tensor_op_math";
149151
}
150152
#endif
151153

@@ -361,7 +363,8 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
361363
paddle::operators::CUDNNConvOpKernel<plat::float16>);
362364
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
363365
paddle::operators::CUDNNConvGradOpKernel<float>,
364-
paddle::operators::CUDNNConvGradOpKernel<double>);
366+
paddle::operators::CUDNNConvGradOpKernel<double>,
367+
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
365368

366369
REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
367370
paddle::operators::CUDNNConvOpKernel<float>,

paddle/fluid/operators/cross_entropy_op.cu

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
1313
limitations under the License. */
1414

1515
#include "paddle/fluid/operators/cross_entropy_op.h"
16+
#include "paddle/fluid/platform/float16.h"
1617

18+
namespace plat = paddle::platform;
1719
namespace ops = paddle::operators;
1820
using CUDACtx = paddle::platform::CUDADeviceContext;
1921
REGISTER_OP_CUDA_KERNEL(cross_entropy,
2022
ops::CrossEntropyOpKernel<CUDACtx, float>,
21-
ops::CrossEntropyOpKernel<CUDACtx, double>);
22-
REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
23-
ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
24-
ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
23+
ops::CrossEntropyOpKernel<CUDACtx, double>,
24+
ops::CrossEntropyOpKernel<CUDACtx, plat::float16>);
25+
26+
REGISTER_OP_CUDA_KERNEL(
27+
cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
28+
ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
29+
ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);

paddle/fluid/operators/elementwise_add_op.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,5 @@ REGISTER_OP_CUDA_KERNEL(
3030
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
3131
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
3232
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
33-
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);
33+
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>,
34+
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, plat::float16>);

paddle/fluid/operators/elementwise_op_function.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
365365
int j = blockIdx.x;
366366
int i = threadIdx.x;
367367
int tid = threadIdx.x;
368-
T val = 0;
368+
T val(0);
369369

370370
do {
371371
int x_offset = i * w + j;
@@ -433,7 +433,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
433433
int tid = threadIdx.x;
434434
int j = blockIdx.x;
435435

436-
T val = 0;
436+
T val(0);
437437
int ttid = tid;
438438

439439
while (true) {

paddle/fluid/operators/math/cross_entropy.cu

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,16 @@ namespace operators {
2121
namespace math {
2222

2323
namespace {
24+
25+
__device__ __forceinline__ float real_log(float x) { return logf(x); }
26+
27+
__device__ __forceinline__ double real_log(double x) { return log(x); }
28+
29+
__device__ __forceinline__ platform::float16 real_log(
30+
const platform::float16& val) {
31+
return static_cast<platform::float16>(hlog(static_cast<half>(val)));
32+
}
33+
2434
template <typename T>
2535
__global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
2636
const int N, const int D,
@@ -29,21 +39,21 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
2939
i += blockDim.x * gridDim.x) {
3040
PADDLE_ASSERT(label[i] >= 0 && label[i] < D || label[i] == ignore_index);
3141
Y[i] = ignore_index == label[i]
32-
? 0
33-
: -math::TolerableValue<T>()(log(X[i * D + label[i]]));
42+
? static_cast<T>(0)
43+
: -math::TolerableValue<T>()(real_log(X[i * D + label[i]]));
3444
}
3545
}
3646

3747
template <typename T>
3848
__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
3949
const int class_num) {
4050
int tid = threadIdx.x;
41-
T val = 0;
51+
T val(0);
4252

4353
int idx = blockIdx.x * class_num + tid;
4454
int end = blockIdx.x * class_num + class_num;
4555
for (; idx < end; idx += blockDim.x) {
46-
val += math::TolerableValue<T>()(std::log(X[idx])) * label[idx];
56+
val += math::TolerableValue<T>()(real_log(X[idx])) * label[idx];
4757
}
4858

4959
val = paddle::platform::reduceSum(val, tid, blockDim.x);
@@ -53,8 +63,6 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
5363
}
5464
} // namespace
5565

56-
using Tensor = framework::Tensor;
57-
5866
template <typename T>
5967
class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
6068
public:
@@ -89,6 +97,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
8997

9098
template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
9199
template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
100+
template class CrossEntropyFunctor<platform::CUDADeviceContext,
101+
platform::float16>;
92102
} // namespace math
93103
} // namespace operators
94104
} // namespace paddle

0 commit comments

Comments
 (0)