fix gpu test, clean code and add cmake

kexinzhao · kexinzhao · commit 41bd1f9115c4 · 2017-11-28T23:14:59.000-08:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -56,6 +56,7 @@ option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
+option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
@@ -24,6 +24,11 @@ if(WITH_DOUBLE)
     add_definitions(-DPADDLE_TYPE_DOUBLE)
 endif(WITH_DOUBLE)
 
+if(WITH_ARM_FP16)
+    add_definitions(-DPADDLE_ARM_FP16)
+    add_definitions("-march=armv8.2-a+fp16+simd")
+endif(WITH_ARM_FP16)
+
 if(WITH_TESTING)
     add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)
diff --git a/paddle/math/float16.h b/paddle/math/float16.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include <cstdint>
+#include <stdint.h>
 
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
@@ -71,6 +71,7 @@ struct PADDLE_ALIGN(2) float16 {
 public:
   uint16_t x;
 
+  // Constructors
   HOSTDEVICE inline float16() : x(0) {}
 
   HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
@@ -89,8 +90,7 @@ struct PADDLE_ALIGN(2) float16 {
 
 #ifdef PADDLE_WITH_NATIVE_FP16
   // __fp16 is a native half precision data type for arm cpu,
-  // float16_t is an alias for __fp16 in arm_fp16.h,
-  // which is included in arm_neon.h.
+  // float16_t is an alias for __fp16.
   HOSTDEVICE inline explicit float16(const float16_t& h) {
     x = *reinterpret_cast<const uint16_t*>(&h);
   }
@@ -141,6 +141,7 @@ struct PADDLE_ALIGN(2) float16 {
     return *this;
   }
 
+// Assignment operators
 #ifdef PADDLE_CUDA_FP16
   HOSTDEVICE inline float16& operator=(const half& rhs) {
 #if CUDA_VERSION >= 9000
@@ -219,6 +220,7 @@ struct PADDLE_ALIGN(2) float16 {
     return *this;
   }
 
+// Conversion operators
 #ifdef PADDLE_CUDA_FP16
   HOSTDEVICE inline explicit operator half() const {
 #if CUDA_VERSION >= 9000
@@ -353,27 +355,54 @@ struct PADDLE_ALIGN(2) float16 {
 // CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are
 // for users to write similar CUDA code in CUDA 7.5 and 8.0 as in
 // CUDA 9.0 regarding the half data type.
-#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && \
-    __CUDA_ARCH__ >= 530 && CUDA_VERSION < 9000
+#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000
+
 DEVICE inline half operator+(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hadd(a, b);
+#else
+  float res = float(float16(a)) + float(float16(b));
+  return half(float16(res));
+#endif
 }
 
 DEVICE inline half operator-(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hsub(a, b);
+#else
+  float res = float(float16(a)) - float(float16(b));
+  return half(float16(res));
+#endif
 }
 
 DEVICE inline half operator*(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hmul(a, b);
+#else
+  float res = float(float16(a)) * float(float16(b));
+  return half(float16(res));
+#endif
 }
 
 DEVICE inline half operator/(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
   float num = __half2float(a);
   float denom = __half2float(b);
   return __float2half(num / denom);
+#else
+  float res = float(float16(a)) / float(float16(b));
+  return half(float16(res));
+#endif
 }
 
-DEVICE inline half operator-(const half& a) { return __hneg(a); }
+DEVICE inline half operator-(const half& a) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hneg(a);
+#else
+  float res = -float(float16(a));
+  return half(float16(res));
+#endif
+}
 
 DEVICE inline half& operator+=(half& a, const half& b) {
   a = a + b;
@@ -396,99 +425,57 @@ DEVICE inline half& operator/=(half& a, const half& b) {
 }
 
 DEVICE inline bool operator==(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __heq(a, b);
+#else
+  return float(float16(a)) == float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator!=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hne(a, b);
+#else
+  return float(float16(a)) != float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator<(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hlt(a, b);
+#else
+  return float(float16(a)) < float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator<=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hle(a, b);
+#else
+  return float(float16(a)) <= float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator>(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hgt(a, b);
+#else
+  return float(float16(a)) > float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator>=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hge(a, b);
+#else
+  return float(float16(a)) >= float(float16(b));
+#endif
 }
 
-/*
-DEVICE inline float16 operator+(const float16& a, const float16& b) {
-  return float16(__hadd(half(a), half(b)));
-}
-
-DEVICE inline float16 operator-(const float16& a, const float16& b) {
-  return float16(__hsub(half(a), half(b)));
-}
-
-DEVICE inline float16 operator*(const float16& a, const float16& b) {
-  return float16(__hmul(half(a), half(b)));
-}
-
-DEVICE inline float16 operator/(const float16& a, const float16& b) {
-  float num = __half2float(half(a));
-  float denom = __half2float(half(b));
-  return float16(num / denom);
-}
-
-DEVICE inline float16 operator-(const float16& a) {
-  return float16(__hneg(half(a)));
-}
-
-DEVICE inline float16& operator+=(float16& a, const float16& b) {
-  a = a + b;
-  return a;
-}
-
-DEVICE inline float16& operator-=(float16& a, const float16& b) {
-  a = a - b;
-  return a;
-}
-
-DEVICE inline float16& operator*=(float16& a, const float16& b) {
-  a = a * b;
-  return a;
-}
-
-DEVICE inline float16& operator/=(float16& a, const float16& b) {
-  a = a / b;
-  return a;
-}
-
-DEVICE inline bool operator==(const float16& a, const float16& b) {
-  return __heq(half(a), half(b));
-}
-
-DEVICE inline bool operator!=(const float16& a, const float16& b) {
-  return __hne(half(a), half(b));
-}
-
-DEVICE inline bool operator<(const float16& a, const float16& b) {
-  return __hlt(half(a), half(b));
-}
-
-DEVICE inline bool operator<=(const float16& a, const float16& b) {
-  return __hle(half(a), half(b));
-}
-
-DEVICE inline bool operator>(const float16& a, const float16& b) {
-  return __hgt(half(a), half(b));
-}
-
-DEVICE inline bool operator>=(const float16& a, const float16& b) {
-  return __hge(half(a), half(b));
-}
-*/
+#endif  // PADDLE_CUDA_FP16
 
 // Arithmetic operators on ARMv8.2-A CPU
-#elif defined(PADDLE_WITH_NATIVE_FP16)
+#if defined(PADDLE_WITH_NATIVE_FP16)
 HOST inline float16 operator+(const float16& a, const float16& b) {
   float16 res;
   asm volatile(
@@ -681,88 +668,6 @@ HOST inline bool operator>=(const float16& a, const float16& b) {
   return (res & 0xffff) != 0;
 }
 
-/*
-HOST inline float16 operator+(const float16& a, const float16& b) {
-  return float16(vaddh_f16(float16_t(a), float16_t(b)));
-}
-
-HOST inline float16 operator-(const float16& a, const float16& b) {
-  return float16(vsubh_f16(float16_t(a), float16_t(b)));
-}
-
-HOST inline float16 operator*(const float16& a, const float16& b) {
-  return float16(vmulh_f16(float16_t(a), float16_t(b)));
-}
-
-HOST inline float16 operator/(const float16& a, const float16& b) {
-  return float16(vdivh_f16(float16_t(a), float16_t(b)));
-}
-
-HOST inline float16 operator-(const float16& a) {
-  return float16(vnegh_f16(float16_t(a)));
-}
-
-HOST inline float16& operator+=(float16& a, const float16& b) {
-  a = a + b;
-  return a;
-}
-
-HOST inline float16& operator-=(float16& a, const float16& b) {
-  a = a - b;
-  return a;
-}
-
-HOST inline float16& operator*=(float16& a, const float16& b) {
-  a = a * b;
-  return a;
-}
-
-HOST inline float16& operator/=(float16& a, const float16& b) {
-  a = a / b;
-  return a;
-}
-
-HOST inline bool operator==(const float16& a, const float16& b) {
-  return static_cast<bool>(vceqh_f16(float16_t(a), float16_t(b)));
-}
-
-HOST inline bool operator!=(const float16& a, const float16& b) {
-  return !(a == b);
-}
-
-HOST inline bool operator<(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast<bool>(vclth_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) < float(b);
-#endif  // PADDLE_NEON_64
-}
-
-HOST inline bool operator<=(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast<bool>(vcleh_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) <= float(b);
-#endif  // PADDLE_NEON_64
-}
-
-HOST inline bool operator>(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast<bool>(vcgth_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) > float(b);
-#endif  // PADDLE_NEON_64
-}
-
-HOST inline bool operator>=(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast<bool>(vcgeh_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) >= float(b);
-#endif  // PADDLE_NEON_64
-}
-*/
-
 // Arithmetic operators, software emulated on other CPU
 #else
 HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp
@@ -54,14 +54,6 @@ TEST(float16, conversion_cpu) {
   EXPECT_EQ(float16(true).x, 0x3c00);
   EXPECT_EQ(float16(false).x, 0x0000);
 
-  // Implicit conversion to and from Eigen::half
-  /*
-  Eigen::half tmp = float16(1.0f);
-  float16 v_conv = tmp;
-  EXPECT_EQ(tmp.x, 0x3c00);
-  EXPECT_EQ(v_conv.x, 0x3c00);
-  */
-
   // Default constructor
   float16 v_def;
   EXPECT_EQ(v_def.x, 0x0000);
diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu