
Commit 194f3dc

Support fp16 in GPU impl of fused_elemwise_activation_op. (#20636) (#20655)
* Support fp16 in fused_elemwise_activation_op.
* Fix unit testing in CPU-only mode.
Parent: ddcb81d

File tree: 3 files changed, 89 additions and 14 deletions


paddle/fluid/operators/fused/fused_elemwise_activation_op.cu

Lines changed: 6 additions & 2 deletions
@@ -20,11 +20,15 @@ REGISTER_OP_CUDA_KERNEL(
     ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
                                        float>,
     ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
-                                       double>);
+                                       double>,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
+                                       paddle::platform::float16>);

 REGISTER_OP_CUDA_KERNEL(
     fused_elemwise_activation_grad,
     ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
                                            float>,
     ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
-                                           double>);
+                                           double>,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
+                                           paddle::platform::float16>);
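Registering the float16 kernels above is only half of the change: the kernel's functor templates must also instantiate when T is paddle::platform::float16, which (unlike float and double) does not pick up integer or bool values through implicit conversions. That is why the functors.h hunk below rewrites expressions such as "return 1;" and "x * (x > 0)" with explicit static_cast<T> forms. Here is a minimal, self-contained sketch of the issue using a toy half-like type; it is an illustration only, not Paddle's float16.

// Toy illustration only: "toy_half" stands in for a half-precision type with
// explicit-only conversions. It is NOT paddle::platform::float16.
#include <iostream>

struct toy_half {
  float v;  // a real fp16 type stores 16 bits; a float keeps the toy simple
  toy_half() = default;
  explicit toy_half(float f) : v(f) {}
  explicit operator float() const { return v; }
};

inline toy_half operator*(toy_half a, toy_half b) {
  return toy_half(static_cast<float>(a) * static_cast<float>(b));
}
inline bool operator>(toy_half a, toy_half b) {
  return static_cast<float>(a) > static_cast<float>(b);
}

// Mirrors the ReluFunctor change: the result of the comparison has to be
// mapped to T explicitly, because bool/int never convert to toy_half
// implicitly.
template <typename T>
T Relu(T x) {
  // return x * (x > T(0));  // fails to compile for T = toy_half
  return x * (x > static_cast<T>(0) ? static_cast<T>(1) : static_cast<T>(0));
}

int main() {
  std::cout << Relu(3.0f) << " " << Relu(0.5) << "\n";            // 3 0.5
  std::cout << static_cast<float>(Relu(toy_half(1.5f))) << "\n";  // 1.5
  return 0;
}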

paddle/fluid/operators/math/functors.h

Lines changed: 19 additions & 9 deletions
@@ -14,6 +14,8 @@ limitations under the License. */

 #pragma once

+#include "paddle/fluid/operators/math.h"
+
 namespace paddle {
 namespace operators {
 namespace math {
@@ -40,8 +42,8 @@ struct AddFunctor {

 template <typename T>
 struct AddGradFunctor {
-  inline HOSTDEVICE T Dx(T x, T y) { return 1; }
-  inline HOSTDEVICE T Dy(T x, T y) { return 1; }
+  inline HOSTDEVICE T Dx(T x, T y) { return static_cast<T>(1.); }
+  inline HOSTDEVICE T Dy(T x, T y) { return static_cast<T>(1.); }
 };

 template <typename T>
@@ -68,14 +70,22 @@ struct ScaleGradFunctor {

 template <typename T>
 struct ReluFunctor {
-  inline HOSTDEVICE T operator()(T x) { return x * (x > 0); }
+  inline HOSTDEVICE T operator()(T x) {
+    return x * (x > static_cast<T>(0) ? static_cast<T>(1) : static_cast<T>(0));
+  }
 };

 template <typename T>
 struct ReluGradFunctor {
-  inline HOSTDEVICE T UseX(T x) { return x > 0 ? 1 : 0; }
-  inline HOSTDEVICE T UseOut(T out) { return out > 0 ? 1 : 0; }
-  inline HOSTDEVICE T UseXAndOut(T x, T out) { return out > 0 ? 1 : 0; }
+  inline HOSTDEVICE T UseX(T x) {
+    return x > static_cast<T>(0) ? static_cast<T>(1) : static_cast<T>(0);
+  }
+  inline HOSTDEVICE T UseOut(T out) {
+    return out > static_cast<T>(0) ? static_cast<T>(1) : static_cast<T>(0);
+  }
+  inline HOSTDEVICE T UseXAndOut(T x, T out) {
+    return out > static_cast<T>(0) ? static_cast<T>(1) : static_cast<T>(0);
+  }
 };

 template <typename T>
@@ -84,9 +94,9 @@ struct TanhFunctor {
   const T kMax = static_cast<T>(13);
   inline HOSTDEVICE T operator()(T x) {
     // y = 2 / (1 + e^-2x) - 1
-    T t0 = 2 * x;
+    T t0 = static_cast<T>(2) * x;
     T t1 = (t0 < kMin) ? kMin : ((t0 > kMax) ? kMax : t0);
-    return static_cast<T>(2) / (static_cast<T>(1) + std::exp(-t1)) -
+    return static_cast<T>(2) / (static_cast<T>(1) + real_exp(-t1)) -
            static_cast<T>(1);
   }
 };
@@ -107,7 +117,7 @@ struct SigmoidFunctor {
   inline HOSTDEVICE T operator()(T x) {
     // y = 1 / (1 + e^-x)
     T tmp = (x < kMin) ? kMin : ((x > kMax) ? kMax : x);
-    return static_cast<T>(1) / (static_cast<T>(1) + std::exp(-tmp));
+    return static_cast<T>(1) / (static_cast<T>(1) + real_exp(-tmp));
   }
 };

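The tanh and sigmoid functors also switch from std::exp to real_exp, which comes from the newly included paddle/fluid/operators/math.h; std::exp has no overload for paddle::platform::float16, so an overload set that dispatches on the element type is needed. That header is not part of this diff, so the following is only a sketch of the kind of overloads it presumably provides (widen fp16 to float, exponentiate, narrow back), not a copy of the actual file.

// Hypothetical sketch of a real_exp overload set; the real
// paddle/fluid/operators/math.h may differ in detail.
#include <math.h>

#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/hostdevice.h"

namespace paddle {
namespace operators {

inline HOSTDEVICE float real_exp(float x) { return ::expf(x); }
inline HOSTDEVICE double real_exp(double x) { return ::exp(x); }

inline HOSTDEVICE platform::float16 real_exp(const platform::float16& x) {
  // No native half-precision exp is assumed here: compute in float and
  // convert the result back to float16.
  return static_cast<platform::float16>(::expf(static_cast<float>(x)));
}

}  // namespace operators
}  // namespace paddle

With overloads like these, TanhFunctor and SigmoidFunctor instantiate for float, double, and float16 without any change to their call sites.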

python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py

Lines changed: 64 additions & 3 deletions
@@ -33,17 +33,24 @@
 # TestFusedElementwiseActivationOp_channelwise_add


-def create_test_class(test_case, callback, attrs):
+def create_test_class(test_case,
+                      callback,
+                      attrs,
+                      dtype=np.float32,
+                      grad_chek=True):
     class TestFusedElementwiseActivationOp_base(OpTest):
         def setUp(self):
             self.op_type = "fused_elemwise_activation"
-            self.dtype = np.float32
+            self.dtype = dtype
             self.axis = -1

             self.init_input()
             self.init_output()
             self.init_attr()

+            self.out = self.out.astype(self.dtype)
+            self.intermediate_out = self.intermediate_out.astype(self.dtype)
+
             self.inputs = {
                 'X': OpTest.np_dtype_to_fluid_dtype(self.x),
                 'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
@@ -71,16 +78,25 @@ def init_attr(self):
                     self.attrs[key] = attrs[key]

         def test_check_output(self):
-            self.check_output()
+            if self.dtype == np.float16 and core.is_compiled_with_cuda():
+                place = core.CUDAPlace(0)
+                if core.is_float16_supported(place):
+                    self.check_output_with_place(place, atol=1e-3)
+            else:
+                self.check_output()

         # FIXME(zcd): the intermediate_out_grad is not checked.
         def test_check_grad_normal(self):
+            if not grad_chek:
+                return
             if self.attrs["save_intermediate_out"]:
                 self.check_grad(['X', 'Y'], ['Out'], max_relative_error=0.005)
             else:
                 self.check_grad(['X', 'Y'], ['Out'], max_relative_error=0.005)

         def test_check_grad_ingore_x(self):
+            if not grad_chek:
+                return
             if self.attrs["save_intermediate_out"]:
                 self.check_grad(
                     ['Y'], ['Out'],
@@ -93,6 +109,8 @@ def test_check_grad_ingore_x(self):
                     no_grad_set=set("X"))

         def test_check_grad_ingore_y(self):
+            if not grad_chek:
+                return
             if self.attrs["save_intermediate_out"]:
                 self.check_grad(
                     ['X'], ['Out'],
@@ -307,11 +325,29 @@ def mul_scale_func(x, y, x_bcast, y_bcast, scale, mode=0):
             'functor_list': ["scale", "elementwise_add"],
             'save_intermediate_out': save_intermediate_out,
         })
+        create_test_class(
+            'scale_add_fp16' + suffix,
+            scale_add_func, {
+                'scale': scale,
+                'functor_list': ["scale", "elementwise_add"],
+                'save_intermediate_out': save_intermediate_out,
+            },
+            dtype=np.float16,
+            grad_chek=False)
         create_test_class('add_scale' + suffix, add_scale_func, {
             'scale': scale,
             'functor_list': ["elementwise_add", "scale"],
             'save_intermediate_out': save_intermediate_out,
         })
+        create_test_class(
+            'add_scale_fp16' + suffix,
+            add_scale_func, {
+                'scale': scale,
+                'functor_list': ["elementwise_add", "scale"],
+                'save_intermediate_out': save_intermediate_out,
+            },
+            dtype=np.float16,
+            grad_chek=False)
         create_test_class('add_relu' + suffix, add_relu_func, {
             'functor_list': ["elementwise_add", "relu"],
             'save_intermediate_out': save_intermediate_out,
@@ -320,11 +356,36 @@ def mul_scale_func(x, y, x_bcast, y_bcast, scale, mode=0):
             'functor_list': ["relu", "elementwise_add"],
             'save_intermediate_out': save_intermediate_out,
         })
+        create_test_class(
+            'add_relu_fp16' + suffix,
+            add_relu_func, {
+                'functor_list': ["elementwise_add", "relu"],
+                'save_intermediate_out': save_intermediate_out,
+            },
+            dtype=np.float16,
+            grad_chek=False)
+        create_test_class(
+            'relu_add_fp16' + suffix,
+            relu_add_func, {
+                'functor_list': ["relu", "elementwise_add"],
+                'save_intermediate_out': save_intermediate_out,
+            },
+            dtype=np.float16,
+            grad_chek=False)
         create_test_class('mul_scale' + suffix, mul_scale_func, {
             'scale': scale,
             'functor_list': ["elementwise_mul", "scale"],
             'save_intermediate_out': save_intermediate_out,
         })
+        create_test_class(
+            'mul_scale' + suffix,
+            mul_scale_func, {
+                'scale': scale,
+                'functor_list': ["elementwise_mul", "scale"],
+                'save_intermediate_out': save_intermediate_out,
+            },
+            dtype=np.float16,
+            grad_chek=False)

 if __name__ == '__main__':
     unittest.main()
