PaddlePaddle
diff --git a/‎paddle/cinn/backends/hip/codegen_hip_dev.cc
Lines changed: 4 additions & 1 deletion b/‎paddle/cinn/backends/hip/codegen_hip_dev.cc
Lines changed: 4 additions & 1 deletion
diff --git a/‎paddle/cinn/common/float16.h
Lines changed: 70 additions & 31 deletions b/‎paddle/cinn/common/float16.h
Lines changed: 70 additions & 31 deletions
diff --git a/‎paddle/cinn/runtime/hip/CMakeLists.txt
Lines changed: 2 additions & 1 deletion b/‎paddle/cinn/runtime/hip/CMakeLists.txt
Lines changed: 2 additions & 1 deletion
@@ -19,7 +19,10 @@ namespace backends {
 namespace hip {
 
 const std::string CodeGenHipDevice::source_header_ =  // NOLINT
-    R"(#include "cinn_hip_runtime_source.h"
+    R"(#define CINN_WITH_HIP
+     #include "float16.h"
+     using cinn::common::float16;
+     #include "cinn_hip_runtime_source.h"
 )";
 
 const std::string &CodeGenHipDevice::GetSourceHeader() {
 
@@ -40,6 +40,15 @@
 #endif  // __CUDACC__
 #endif  // CINN_WITH_CUDA
 
+#ifdef CINN_WITH_HIP
+#include <hip/hip_runtime.h>
+#if defined(__HIPCC__)
+#define __HIP_PLATFORM_AMD__
+#include <hip/hip_fp16.h>
+#define CINN_HIP_FP16
+#endif
+#endif
+
 #ifdef __cplusplus
 #ifndef _WIN32
 #define CINN_ALIGN(x) __attribute__((aligned(x)))
@@ -83,7 +92,7 @@ struct CINN_ALIGN(2) float16 {
   ~float16() = default;
 
 // Constructors
-#ifdef CINN_CUDA_FP16
+#if defined(CINN_CUDA_FP16) || defined(CINN_HIP_FP16)
   __host__ __device__ inline explicit float16(const half& h) {
 #if (CUDA_VERSION >= 9000)
     x = reinterpret_cast<__half_raw*>(const_cast<half*>(&h))->x;
@@ -129,9 +138,9 @@ struct CINN_ALIGN(2) float16 {
       : x(float16(static_cast<float>(val)).x) {}
 
 // Assignment operators
-#ifdef CINN_CUDA_FP16
+#if defined(CINN_CUDA_FP16) || defined(CINN_HIP_FP16)
   __host__ __device__ inline float16& operator=(const half& rhs) {
-#if CUDA_VERSION >= 9000
+#if CUDA_VERSION >= 9000 || defined(CINN_HIP_FP16)
     x = reinterpret_cast<__half_raw*>(const_cast<half*>(&rhs))->x;
 #else
     x = rhs.x;
@@ -196,9 +205,9 @@ struct CINN_ALIGN(2) float16 {
   }
 
 // Conversion operators
-#ifdef CINN_CUDA_FP16
+#if defined(CINN_CUDA_FP16) || defined(CINN_HIP_FP16)
   __host__ __device__ inline half to_half() const {
-#if CUDA_VERSION >= 9000
+#if CUDA_VERSION >= 9000 || defined(CINN_HIP_FP16)
     __half_raw h;
     h.x = x;
     return half(h);
@@ -211,7 +220,9 @@ struct CINN_ALIGN(2) float16 {
 #endif  // CINN_CUDA_FP16
 
   __host__ __device__ inline operator float() const {
-#if defined(CINN_CUDA_FP16) && (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)
+#if defined(CINN_CUDA_FP16) &&                              \
+        (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || \
+    defined(CINN_HIP_FP16)
     half tmp = *reinterpret_cast<const half*>(this);
     return __half2float(tmp);
 
@@ -344,9 +355,9 @@ struct CINN_ALIGN(4) float162 {
 // CUDA 9.0 regarding the half data type.
 // ROCM has built-in arithmetic operators as not defined
 // __HIP_NO_HALF_OPERATORS__
-#if defined(CINN_CUDA_FP16) && CUDA_VERSION < 9000
+#if (defined(CINN_CUDA_FP16) && CUDA_VERSION < 9000) || defined(CINN_HIP_FP16)
 __device__ inline half operator+(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || defined(CINN_HIP_FP16)
   return __hadd(a, b);
 #else
   float res = static_cast<float>(float16(a)) + static_cast<float>(float16(b));
@@ -355,7 +366,7 @@ __device__ inline half operator+(const half& a, const half& b) {
 }
 
 __device__ inline half operator-(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || defined(CINN_HIP_FP16)
   return __hsub(a, b);
 #else
   float res = static_cast<float>(float16(a)) - static_cast<float>(float16(b));
@@ -364,7 +375,7 @@ __device__ inline half operator-(const half& a, const half& b) {
 }
 
 __device__ inline half operator*(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || defined(CINN_HIP_FP16)
   return __hmul(a, b);
 #else
   float res = static_cast<float>(float16(a)) * static_cast<float>(float16(b));
@@ -373,7 +384,7 @@ __device__ inline half operator*(const half& a, const half& b) {
 }
 
 __device__ inline half operator/(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || defined(CINN_HIP_FP16)
   float num = __half2float(a);
   float denom = __half2float(b);
   return __float2half(num / denom);
@@ -384,14 +395,15 @@ __device__ inline half operator/(const half& a, const half& b) {
 }
 
 __device__ inline half operator-(const half& a) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || defined(CINN_HIP_FP16)
   return __hneg(a);
 #else
   float res = -static_cast<float>(float16(a));
   return float16(res).to_half();
 #endif
 }
 
+#ifndef CINN_WITH_HIP
 __device__ inline half& operator+=(half& a, const half& b) {  // NOLINT
   a = a + b;
   return a;
@@ -411,49 +423,50 @@ __device__ inline half& operator/=(half& a, const half& b) {  // NOLINT
   a = a / b;
   return a;
 }
+#endif
 
 __device__ inline bool operator==(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || defined(CINN_HIP_FP16)
   return __heq(a, b);
 #else
   return static_cast<float>(float16(a)) == static_cast<float>(float16(b));
 #endif
 }
 
 __device__ inline bool operator!=(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || defined(CINN_HIP_FP16)
   return __hne(a, b);
 #else
   return static_cast<float>(float16(a)) != static_cast<float>(float16(b));
 #endif
 }
 
 __device__ inline bool operator<(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || defined(CINN_HIP_FP16)
   return __hlt(a, b);
 #else
   return static_cast<float>(float16(a)) < static_cast<float>(float16(b));
 #endif
 }
 
 __device__ inline bool operator<=(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || defined(CINN_HIP_FP16)
   return __hle(a, b);
 #else
   return static_cast<float>(float16(a)) <= static_cast<float>(float16(b));
 #endif
 }
 
 __device__ inline bool operator>(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || defined(CINN_HIP_FP16)
   return __hgt(a, b);
 #else
   return static_cast<float>(float16(a)) > static_cast<float>(float16(b));
 #endif
 }
 
 __device__ inline bool operator>=(const half& a, const half& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || defined(CINN_HIP_FP16)
   return __hge(a, b);
 #else
   return static_cast<float>(float16(a)) >= static_cast<float>(float16(b));
@@ -465,7 +478,9 @@ __device__ inline bool operator>=(const half& a, const half& b) {
 // Arithmetic operators for float16 on GPU
 __host__ __device__ inline float16 operator+(const float16& a,
                                              const float16& b) {
-#if defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && \
+     __CUDA_ARCH__ >= 530) ||                             \
+    defined(CINN_HIP_FP16)
   return float16(__hadd(a.to_half(), b.to_half()));
 #else
   return float16(static_cast<float>(a) + static_cast<float>(b));
@@ -474,7 +489,9 @@ __host__ __device__ inline float16 operator+(const float16& a,
 
 __host__ __device__ inline float16 operator-(const float16& a,
                                              const float16& b) {
-#if defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && \
+     __CUDA_ARCH__ >= 530) ||                             \
+    defined(CINN_HIP_FP16)
   return float16(__hsub(a.to_half(), b.to_half()));
 #else
   return float16(static_cast<float>(a) - static_cast<float>(b));
@@ -483,7 +500,9 @@ __host__ __device__ inline float16 operator-(const float16& a,
 
 __host__ __device__ inline float16 operator*(const float16& a,
                                              const float16& b) {
-#if defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && \
+     __CUDA_ARCH__ >= 530) ||                             \
+    defined(CINN_HIP_FP16)
   return float16(__hmul(a.to_half(), b.to_half()));
 #else
   return float16(static_cast<float>(a) * static_cast<float>(b));
@@ -492,7 +511,9 @@ __host__ __device__ inline float16 operator*(const float16& a,
 
 __host__ __device__ inline float16 operator/(const float16& a,
                                              const float16& b) {
-#if defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && \
+     __CUDA_ARCH__ >= 530) ||                             \
+    defined(CINN_HIP_FP16)
   // TODO(kexinzhao): check which cuda version starts to support __hdiv
   float num = __half2float(a.to_half());
   float denom = __half2float(b.to_half());
@@ -503,7 +524,9 @@ __host__ __device__ inline float16 operator/(const float16& a,
 }
 
 __host__ __device__ inline float16 operator-(const float16& a) {
-#if defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && \
+     __CUDA_ARCH__ >= 530) ||                             \
+    defined(CINN_HIP_FP16)
   return float16(__hneg(a.to_half()));
 #else
   float16 res;
@@ -537,47 +560,59 @@ __host__ __device__ inline float16& operator/=(float16& a,          // NOLINT
 }
 
 __host__ __device__ inline bool operator==(const float16& a, const float16& b) {
-#if defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && \
+     __CUDA_ARCH__ >= 530) ||                             \
+    defined(CINN_HIP_FP16)
   return __heq(a.to_half(), b.to_half());
 #else
   return static_cast<float>(a) == static_cast<float>(b);
 #endif
 }
 
 __host__ __device__ inline bool operator!=(const float16& a, const float16& b) {
-#if defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && \
+     __CUDA_ARCH__ >= 530) ||                             \
+    defined(CINN_HIP_FP16)
   return __hne(a.to_half(), b.to_half());
 #else
   return static_cast<float>(a) != static_cast<float>(b);
 #endif
 }
 
 __host__ __device__ inline bool operator<(const float16& a, const float16& b) {
-#if defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && \
+     __CUDA_ARCH__ >= 530) ||                             \
+    defined(CINN_HIP_FP16)
   return __hlt(a.to_half(), b.to_half());
 #else
   return static_cast<float>(a) < static_cast<float>(b);
 #endif
 }
 
 __host__ __device__ inline bool operator<=(const float16& a, const float16& b) {
-#if defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && \
+     __CUDA_ARCH__ >= 530) ||                             \
+    defined(CINN_HIP_FP16)
   return __hle(a.to_half(), b.to_half());
 #else
   return static_cast<float>(a) <= static_cast<float>(b);
 #endif
 }
 
 __host__ __device__ inline bool operator>(const float16& a, const float16& b) {
-#if defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && \
+     __CUDA_ARCH__ >= 530) ||                             \
+    defined(CINN_HIP_FP16)
   return __hgt(a.to_half(), b.to_half());
 #else
   return static_cast<float>(a) > static_cast<float>(b);
 #endif
 }
 
 __host__ __device__ inline bool operator>=(const float16& a, const float16& b) {
-#if defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && \
+     __CUDA_ARCH__ >= 530) ||                             \
+    defined(CINN_HIP_FP16)
   return __hge(a.to_half(), b.to_half());
 #else
   return static_cast<float>(a) >= static_cast<float>(b);
@@ -592,7 +627,9 @@ __host__ __device__ inline float16 raw_uint16_to_float16(uint16_t a) {
 }
 
 __host__ __device__ inline bool(isnan)(const float16& a) {
-#if defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && \
+     __CUDA_ARCH__ >= 530) ||                             \
+    defined(CINN_HIP_FP16)
   return __hisnan(a.to_half());
 #else
   return (a.x & 0x7fff) > 0x7c00;
@@ -608,7 +645,9 @@ __host__ __device__ inline bool(isfinite)(const float16& a) {
 }
 
 __host__ __device__ inline float16(abs)(const float16& a) {
-#if defined(CINN_CUDA_FP16) && (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if (defined(CINN_CUDA_FP16) && defined(__CUDA_ARCH__) && \
+     __CUDA_ARCH__ >= 530) ||                             \
+    defined(CINN_HIP_FP16)
   return static_cast<float16>(__habs(a.to_half()));
 #else
   return static_cast<float16>(fabsf(static_cast<float>(a)));
 
@@ -7,4 +7,5 @@ gather_srcs(
   hip_backend_api.cc
   hip_module.cc
   hip_intrinsics.cc
-  hip_intrinsics_reduce.cc)
+  hip_intrinsics_reduce.cc
+  hip_intrinsics_float16.cc)