[AMD] Support Bf16->OCP Fp8 conversion on CDNA3 (#7469)

yiqian1 · web-flow · commit 732c0db1c89f · 2025-07-11T13:56:24.000-07:00
diff --git a/python/test/unit/language/test_conversions.py b/python/test/unit/language/test_conversions.py
@@ -7,7 +7,7 @@
 import triton
 import triton.language as tl
 
-from triton._internal_testing import is_cuda, is_hip, is_hip_cdna2, is_hip_cdna3, is_hip_cdna4
+from triton._internal_testing import is_cuda, is_hip, is_hip_cdna3, is_hip_cdna4
 
 
 def matching_int(dtype):
@@ -341,12 +341,6 @@ def test_typeconvert_downcast(src_dtype, dst_dtype, rounding, max_repr, device):
             pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on AMDGPU CDNA3")
 
     if is_hip():
-        if dst_dtype == 'float8e4nv':
-            if not rounding == 'rtne':
-                pytest.skip("float8e4nv downcast tests only supported with RTNE rounding on AMDGPU")
-            if not is_hip_cdna4() and src_dtype == 'bfloat16':
-                pytest.skip("float8e4nv downcast tests from bfloat16 only supported on AMDGPU CDNA4")
-
         if dst_dtype in ('float8e5b16', 'float8e4b8') and rounding == 'rtne' and not is_hip_cdna3():
             pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on AMDGPU CDNA3")
 
@@ -376,9 +370,6 @@ def test_typeconvert_downcast_clamping(src_dtype, dst_dtype, mode, device, round
 
         if dst_dtype in ('float8e5', 'float8e4nv') and rounding == 'rtne' and torch.cuda.get_device_capability(0) < (9, 0):
             pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on NVGPU with compute capability 9.0+")
-    elif is_hip_cdna2() or is_hip_cdna3():
-        if src_dtype == 'bfloat16' and dst_dtype == 'float8e4nv':
-            pytest.skip(f"{src_dtype} downcast to {dst_dtype} with clamping is not fully tested on AMDGPU CDNA2/3")
 
     converter = {
         tl.float8e4nv: torch.float8_e4m3fn,
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp
@@ -65,6 +65,12 @@ template <typename FPType> struct FPTypeInfo {
       if (dstTyID == TypeID::get<Float8E5M2Type>())
         return {0x0080, 0x0180, 0x0200, 0x0380};
     }
+    if constexpr (std::is_same_v<FPType, BFloat16Type>) {
+      if (dstTyID == TypeID::get<Float8E4M3FNType>())
+        return {0x3a80, 0x3b40, 0x3ba0, 0x3be0, 0x3c10, 0x3c30, 0x3c50, 0x3c70};
+      if (dstTyID == TypeID::get<Float8E5M2Type>())
+        return {0x3700, 0x37c0, 0x3820, 0x3860};
+    }
     return {};
   }
 
@@ -278,13 +284,16 @@ static Value Fp_to_Fp8E4M3FN_RTNE_oneValue(Location loc,
                                            ConversionPatternRewriter &rewriter,
                                            Value v) {
   static_assert((std::is_same_v<SrcFPType, Float32Type>) ||
-                (std::is_same_v<SrcFPType, Float16Type>));
+                (std::is_same_v<SrcFPType, Float16Type>) ||
+                (std::is_same_v<SrcFPType, BFloat16Type>));
   auto b = TritonLLVMOpBuilder(loc, rewriter);
   const llvm::fltSemantics *srcSemantic = nullptr;
   if constexpr (std::is_same_v<SrcFPType, Float32Type>)
     srcSemantic = &llvm::APFloat::IEEEsingle();
-  else
+  else if constexpr (std::is_same_v<SrcFPType, Float16Type>)
     srcSemantic = &llvm::APFloat::IEEEhalf();
+  else
+    srcSemantic = &llvm::APFloat::BFloat();
   auto srcWidth = llvm::APFloat::getSizeInBits(*srcSemantic);
   auto srcMantissaBits = llvm::APFloat::semanticsPrecision(*srcSemantic) - 1;
   auto srcExponentBits = srcWidth - srcMantissaBits - 1;
@@ -365,8 +374,10 @@ static Value Fp_to_Fp8E4M3FN_RTNE_oneValue(Location loc,
   // number(including infinity) after rounding in FP8E4M3
   if constexpr (std::is_same_v<SrcFPType, Float32Type>)
     dstMaxOfSrcType |= 0x7ffff;
-  else
+  else if constexpr (std::is_same_v<SrcFPType, Float16Type>)
     dstMaxOfSrcType |= 0x7f;
+  else
+    dstMaxOfSrcType |= 0x7;
   Value isOverflowOrInf =
       b.icmp_ugt(intVal, srcFpInfo.toLLVMIntValue(dstMaxOfSrcType));
   vFp8 =
@@ -1168,15 +1179,30 @@ static ConverterT Bf16_to_Fp8E5M2(AMD::ISAFamily isaFamily) {
   return isaFamily == AMD::ISAFamily::CDNA4 ? Bf16_to_Fp8E5M2_HW
                                             : Bf16_to_Fp8E5M2_SW;
 }
-// Bf16 -> OCP Fp8
-static SmallVector<Value> Bf16_to_Fp8E4M3FN(Location loc,
-                                            ConversionPatternRewriter &rewriter,
-                                            const SmallVector<Value> &v) {
+
+// Bf16 -> OCP Fp8 using RTNE: software path, converts v[0] and v[1] one value
+// at a time; expects exactly two inputs, like the hardware variant.
+static SmallVector<Value>
+Bf16_to_Fp8E4M3FN_RTNE_SW(Location loc, ConversionPatternRewriter &rewriter,
+                          const SmallVector<Value> &v) {
+  assert(v.size() == 2);
+  return {Fp_to_Fp8E4M3FN_RTNE_oneValue<BFloat16Type>(loc, rewriter, v[0]),
+          Fp_to_Fp8E4M3FN_RTNE_oneValue<BFloat16Type>(loc, rewriter, v[1])};
+}
+// Bf16 -> OCP Fp8 using RTNE via ROCDL::CvtScaleF32PkFp8Bf16Op (two inputs).
+static SmallVector<Value>
+Bf16_to_Fp8E4M3FN_RTNE_HW(Location loc, ConversionPatternRewriter &rewriter,
+                          const SmallVector<Value> &v) {
   assert(v.size() == 2);
   return cvtScalePkDowncastToFp8<ROCDL::CvtScaleF32PkFp8Bf16Op>(loc, rewriter,
                                                                 v[0], v[1]);
 }
 
+// Picks the Bf16 -> OCP Fp8 RTNE converter: HW on CDNA4, SW on any other ISA.
+static ConverterT Bf16_to_Fp8E4M3FN(AMD::ISAFamily isaFamily) {
+  return isaFamily == AMD::ISAFamily::CDNA4 ? Bf16_to_Fp8E4M3FN_RTNE_HW
+                                            : Bf16_to_Fp8E4M3FN_RTNE_SW;
+}
 // fp8e4m3fn to bf16
 static SmallVector<Value>
 Fp8E4M3FN_to_Bf16_SW(Location loc, ConversionPatternRewriter &rewriter,
@@ -1472,7 +1498,8 @@ struct FpToFpOpConversion
             // BF16 -> F8
             {{BF16TyID, F8E5M2TyID, RoundingMode::RTNE},
              Bf16_to_Fp8E5M2(isaFamily)},
-            {{BF16TyID, F8E4M3FNTyID, RoundingMode::RTNE}, Bf16_to_Fp8E4M3FN},
+            {{BF16TyID, F8E4M3FNTyID, RoundingMode::RTNE},
+             Bf16_to_Fp8E4M3FN(isaFamily)},
             {{BF16TyID, F8E5M2FNUZTyID, RoundingMode::RTNE},
              Bf16_to_Fp8E5M2FNUZ},
             {{BF16TyID, F8E4M3FNUZTyID, RoundingMode::RTNE},