@@ -20,7 +20,7 @@ limitations under the License. */
#include <istream>
#include <ostream>

- #include <cuda.h>
+ #define USE_EIGEN

#ifdef USE_EIGEN // delete this #if macro
#include "Eigen/src/Core/arch/CUDA/Half.h"
@@ -100,8 +100,6 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h);
struct PADDLE_ALIGN(2) float16 {
uint16_t x;

- // explicit for different types, implicit for half and Eigen::half
-
PADDLE_HOSTDEVICE inline float16() {}

PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
@@ -120,7 +118,8 @@ struct PADDLE_ALIGN(2) float16 {
PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {}
#endif // USE_EIGEN

- #ifdef PADDLE_NEON
+ #if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \
+     defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16)
// __fp16 is a native half precision data type for arm cpu,
// float16_t is an alias for __fp16 in arm_fp16.h,
// which is included in arm_neon.h.
@@ -208,7 +207,8 @@ struct PADDLE_ALIGN(2) float16 {
}
#endif // USE_EIGEN

- #ifdef PADDLE_NEON
+ #if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \
+     defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16)
PADDLE_HOSTDEVICE inline float16& operator=(const float16_t* rhs) {
x = *reinterpret_cast<const uint16_t*>(rhs);
return *this;
@@ -302,7 +302,8 @@ struct PADDLE_ALIGN(2) float16 {
}
#endif // USE_EIGEN

- #ifdef PADDLE_NEON
+ #if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \
+     defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16)
// check whether it works or not
PADDLE_HOSTDEVICE inline operator float16_t() const {
float16 h = *this;
@@ -371,7 +372,6 @@ __device__ inline float16 operator*(const float16& a, const float16& b) {
__device__ inline float16 operator/(const float16& a, const float16& b) {
// TODO(kexinzhao): check the cuda version that starts to support __hdiv
- // instinsic
float num = __half2float(half(a));
float denom = __half2float(half(b));
return float16(num / denom);
@@ -595,7 +595,7 @@ constexpr int32_t minD = minC - subC - 1;
PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) {
#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
half tmp = __float2half(f);
- return *reinterpret_cast<float16*>(&(tmp));
+ return *reinterpret_cast<float16*>(&tmp);

#elif defined(PADDLE_NEON_64) // test on RPI
float16 res;
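A minimal usage sketch of the conversion helpers declared in this file (not part of the patch itself); it assumes the header is included as paddle/math/float16.h, that the symbols live in namespace paddle, and that a host-only build takes the software fallback path:

#include <cstdio>
#include "paddle/math/float16.h"  // assumed include path for this header

int main() {
  // Round-trip a float through the 16-bit type; 1.5f is exactly representable in fp16.
  paddle::float16 h = paddle::float_to_half_rn(1.5f);  // round-to-nearest conversion
  float back = paddle::half_to_float(h);               // should recover 1.5f exactly
  std::printf("bits = 0x%04x, value = %f\n", static_cast<unsigned>(h.x), back);
  return 0;
}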