
Commit 5271c32

Merge pull request #9223 from kexinzhao/dropout_fp16
Add float16 support to dropout operator
2 parents: 832deee + 509c839

File tree: 5 files changed (+184, -38 lines)


paddle/fluid/operators/dropout_op.cc (3 additions, 6 deletions)

@@ -35,7 +35,6 @@ class DropoutOp : public framework::OperatorWithKernel {
   }
 };
 
-template <typename AttrType>
 class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
  DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
@@ -73,7 +72,6 @@ are set equal to their corresponding inputs.
   }
 };
 
-template <typename AttrType>
 class DropoutOpGrad : public framework::OperatorWithKernel {
  public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -103,11 +101,10 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker<float>, dropout_grad,
-            ops::DropoutOpGrad<float>);
+REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad,
+            ops::DropoutOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    dropout,
-    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float, float>);
+    dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     dropout_grad,
     ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>);
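Note: with AttrType removed, the CPU kernel is parameterized only by device context and element type, so supporting another element type would only mean listing one more instantiation in the registration macro (the CUDA registration later in this commit does exactly that for float16). A hypothetical sketch, not part of this patch, assuming a double kernel were also wanted on CPU:

// Hypothetical: a second element type added to the same registration.
REGISTER_OP_CPU_KERNEL(
    dropout,
    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>,
    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, double>);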

paddle/fluid/operators/dropout_op.cu (14 additions, 13 deletions)

@@ -18,17 +18,18 @@ limitations under the License. */
 #include <thrust/random.h>
 #include <thrust/transform.h>
 #include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
 
-template <typename T, typename AttrType>
+template <typename T>
 __global__ void RandomGenerator(const size_t n, const int seed,
-                                const AttrType dropout_prob, const T* src,
+                                const float dropout_prob, const T* src,
                                 T* mask_data, T* dst) {
   thrust::minstd_rand rng;
   rng.seed(seed);
-  thrust::uniform_real_distribution<AttrType> dist(0, 1);
+  thrust::uniform_real_distribution<float> dist(0, 1);
 
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
   for (; idx < n; idx += blockDim.x * gridDim.x) {
@@ -44,14 +45,14 @@ __global__ void RandomGenerator(const size_t n, const int seed,
 // It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT.
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
-template <typename Place, typename T, typename AttrType>
+template <typename Place, typename T>
 class GPUDropoutKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<Tensor>("X");
     auto* y = context.Output<Tensor>("Out");
     y->mutable_data<T>(context.GetPlace());
-    AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
+    float dropout_prob = context.Attr<float>("dropout_prob");
 
     auto X = EigenMatrix<T>::Reshape(*x, 1);
     auto Y = EigenMatrix<T>::Reshape(*y, 1);
@@ -70,11 +71,11 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
 
       int threads = 512;
       int grid = (x->numel() + threads - 1) / threads;
-      RandomGenerator<T, AttrType><<<grid, threads, 0,
-                                     context.cuda_device_context().stream()>>>(
+      RandomGenerator<
+          T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
          size, seed, dropout_prob, x_data, mask_data, y_data);
     } else {
-      Y.device(place) = X * (1.0f - dropout_prob);
+      Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
     }
   }
 };
@@ -83,9 +84,9 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
-    dropout,
-    ops::GPUDropoutKernel<paddle::platform::CUDADeviceContext, float, float>);
-REGISTER_OP_CUDA_KERNEL(
-    dropout_grad,
-    ops::DropoutGradKernel<paddle::platform::CUDADeviceContext, float>);
+    dropout, ops::GPUDropoutKernel<plat::CUDADeviceContext, float>,
+    ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(dropout_grad,
+                        ops::DropoutGradKernel<plat::CUDADeviceContext, float>);
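The hunks above change RandomGenerator's signature and launch but do not show its loop body. As a rough standalone sketch of the per-element masking such a kernel performs (illustrative only, not the exact Paddle code; the name DropoutMaskSketch is made up), note that for T = float16 the final multiply relies on the device-side operator* added to float16.h in this same commit, just as the is_test branch relies on casting (1 - dropout_prob) to T:

#include <thrust/random.h>

// Sketch: zero each element with probability dropout_prob and record a 0/1 mask.
template <typename T>
__global__ void DropoutMaskSketch(const size_t n, const int seed,
                                  const float dropout_prob, const T* src,
                                  T* mask_data, T* dst) {
  thrust::minstd_rand rng;
  rng.seed(seed);
  thrust::uniform_real_distribution<float> dist(0, 1);

  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  for (; idx < n; idx += blockDim.x * gridDim.x) {
    rng.discard(idx);  // decorrelate threads that share the same seed
    T mask = (dist(rng) < dropout_prob) ? static_cast<T>(0) : static_cast<T>(1);
    mask_data[idx] = mask;
    dst[idx] = mask * src[idx];  // needs operator* for T = float16 in device code
  }
}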

paddle/fluid/operators/dropout_op.h (1 addition, 1 deletion)

@@ -25,7 +25,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
-template <typename DeviceContext, typename T, typename AttrType>
+template <typename DeviceContext, typename T>
 class CPUDropoutKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {

paddle/fluid/platform/float16.h (133 additions, 18 deletions)

@@ -483,8 +483,123 @@ DEVICE inline bool operator>=(const half& a, const half& b) {
 
 #endif  // PADDLE_CUDA_FP16
 
-// Arithmetic operators on ARMv8.2-A CPU
-#if defined(PADDLE_WITH_NATIVE_FP16)
+// Arithmetic operators for float16 on GPU
+#if defined(PADDLE_CUDA_FP16)
+HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hadd(half(a), half(b)));
+#else
+  return float16(float(a) + float(b));
+#endif
+}
+
+HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hsub(half(a), half(b)));
+#else
+  return float16(float(a) - float(b));
+#endif
+}
+
+HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hmul(half(a), half(b)));
+#else
+  return float16(float(a) * float(b));
+#endif
+}
+
+HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+  // TODO(kexinzhao): check which cuda version starts to support __hdiv
+  float num = __half2float(half(a));
+  float denom = __half2float(half(b));
+  return float16(num / denom);
+#else
+  return float16(float(a) / float(b));
+#endif
+}
+
+HOSTDEVICE inline float16 operator-(const float16& a) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hneg(half(a)));
+#else
+  float16 res;
+  res.x = a.x ^ 0x8000;
+  return res;
+#endif
+}
+
+HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
+  a = a + b;
+  return a;
+}
+
+HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
+  a = a - b;
+  return a;
+}
+
+HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
+  a = a * b;
+  return a;
+}
+
+HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
+  a = a / b;
+  return a;
+}
+
+HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __heq(half(a), half(b));
+#else
+  return float(a) == float(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hne(half(a), half(b));
+#else
+  return float(a) != float(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hlt(half(a), half(b));
+#else
+  return float(a) < float(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hle(half(a), half(b));
+#else
+  return float(a) <= float(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hgt(half(a), half(b));
+#else
+  return float(a) > float(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hge(half(a), half(b));
+#else
+  return float(a) >= float(b);
+#endif
+}
+
+// Arithmetic operators for float16 on ARMv8.2-A CPU
+#elif defined(PADDLE_WITH_NATIVE_FP16)
 HOST inline float16 operator+(const float16& a, const float16& b) {
   float16 res;
   asm volatile(
@@ -668,71 +783,71 @@ HOST inline bool operator>=(const float16& a, const float16& b) {
   return (res & 0xffff) != 0;
 }
 
-// Arithmetic operators, software emulated on other CPU
+// Arithmetic operators for float16, software emulated on other CPU
 #else
-HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
+HOST inline float16 operator+(const float16& a, const float16& b) {
   return float16(float(a) + float(b));
 }
 
-HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
+HOST inline float16 operator-(const float16& a, const float16& b) {
   return float16(float(a) - float(b));
 }
 
-HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
+HOST inline float16 operator*(const float16& a, const float16& b) {
   return float16(float(a) * float(b));
 }
 
-HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
+HOST inline float16 operator/(const float16& a, const float16& b) {
   return float16(float(a) / float(b));
 }
 
-HOSTDEVICE inline float16 operator-(const float16& a) {
+HOST inline float16 operator-(const float16& a) {
   float16 res;
   res.x = a.x ^ 0x8000;
   return res;
 }
 
-HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
+HOST inline float16& operator+=(float16& a, const float16& b) {
   a = float16(float(a) + float(b));
   return a;
 }
 
-HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
+HOST inline float16& operator-=(float16& a, const float16& b) {
   a = float16(float(a) - float(b));
   return a;
 }
 
-HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
+HOST inline float16& operator*=(float16& a, const float16& b) {
  a = float16(float(a) * float(b));
  return a;
 }
 
-HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
+HOST inline float16& operator/=(float16& a, const float16& b) {
  a = float16(float(a) / float(b));
  return a;
 }
 
-HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
+HOST inline bool operator==(const float16& a, const float16& b) {
  return float(a) == float(b);
 }
 
-HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
+HOST inline bool operator!=(const float16& a, const float16& b) {
  return float(a) != float(b);
 }
 
-HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
+HOST inline bool operator<(const float16& a, const float16& b) {
  return float(a) < float(b);
 }
 
-HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
+HOST inline bool operator<=(const float16& a, const float16& b) {
  return float(a) <= float(b);
 }
 
-HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
+HOST inline bool operator>(const float16& a, const float16& b) {
  return float(a) > float(b);
 }
 
-HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
+HOST inline bool operator>=(const float16& a, const float16& b) {
  return float(a) >= float(b);
 }
 #endif
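A minimal usage sketch for the operators added above (illustrative only; scaled_abs is a made-up helper, and this assumes the header is on the include path). On GPUs with compute capability >= 5.3 the arithmetic lowers to the __h* half intrinsics; on older devices and on the host it falls back to float conversion:

#include "paddle/fluid/platform/float16.h"

using paddle::platform::float16;

HOSTDEVICE inline float16 scaled_abs(float16 x, float16 scale) {
  float16 y = x * scale;                // operator*: __hmul on sm_53+, float fallback otherwise
  y += x;                               // compound assignment defined via operator+
  return (y < float16(0.0f)) ? -y : y;  // comparison plus unary minus (sign-bit flip in the fallback)
}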

python/paddle/fluid/tests/unittests/test_dropout_op.py (33 additions, 0 deletions)

@@ -14,6 +14,7 @@
 
 import unittest
 import numpy as np
+import paddle.fluid.core as core
 from op_test import OpTest
 
 
@@ -82,5 +83,37 @@ def test_check_output(self):
         self.check_output()
 
 
+class TestFP16DropoutOp(OpTest):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.init_test_case()
+
+        x = np.random.random(self.input_size).astype("float16")
+        out = x * (1.0 - self.prob)
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.attrs = {
+            'dropout_prob': self.prob,
+            'fix_seed': self.fix_seed,
+            'is_test': True
+        }
+        self.outputs = {'Out': out}
+
+    def init_test_case(self):
+        self.input_size = [32, 64]
+        self.prob = 0.35
+        self.fix_seed = True
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda() and core.op_support_gpu("dropout"):
+            self.check_output_with_place(core.CUDAPlace(0), atol=1e-3)
+
+
+class TestFP16DropoutOp2(TestFP16DropoutOp):
+    def init_test_case(self):
+        self.input_size = [32, 64, 3]
+        self.prob = 0.75
+        self.fix_seed = False
+
+
 if __name__ == '__main__':
     unittest.main()
