Skip to content

Commit fcf3e3e

Browse files
authored
[AMD] Fix fp32/fp16 to OCP fp8 conversion on MI300 (#7382)
The current implementation for converting FP32 to OCP FP8 on MI300 involves two steps: FP32 → FP16 and FP16 → OCP FP8. However, this approach produces incorrect results for subnormal numbers. To fix the issue, this patch introduces a direct conversion from FP32 to OCP FP8.
1 parent 4fe73e1 commit fcf3e3e

File tree

4 files changed

+187
-52
lines changed

4 files changed

+187
-52
lines changed

python/test/unit/language/test_conversions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -344,8 +344,8 @@ def test_typeconvert_downcast(src_dtype, dst_dtype, rounding, max_repr, device):
344344
if dst_dtype == 'float8e4nv':
345345
if not rounding == 'rtne':
346346
pytest.skip("float8e4nv downcast tests only supported with RTNE rounding on AMDGPU")
347-
if not (is_hip_cdna3() and src_dtype == 'float16' or is_hip_cdna4()):
348-
pytest.skip("float8e4nv downcast tests only supported on AMDGPU CDNA3 or on CDNA4 and from float16 with RTNE rounding")
347+
if not is_hip_cdna4() and src_dtype == 'bfloat16':
348+
pytest.skip("float8e4nv downcast tests from bfloat16 only supported on AMDGPU CDNA4")
349349

350350
if dst_dtype in ('float8e5b16', 'float8e4b8') and rounding == 'rtne' and not is_hip_cdna3():
351351
pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on AMDGPU CDNA3")

python/triton_kernels/tests/test_matmul.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -257,9 +257,6 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas
257257
if split_k > 1:
258258
pytest.skip("splitK hasn't been fully tested on AMD GPU.")
259259

260-
if is_hip_cdna3() and ("float8_e4m3fn" in (weight_dtype_str, act_dtype_str)):
261-
pytest.skip("float8_e4m3fn hasn't been fully tested on AMD CDNA3 platform.")
262-
263260
if "float8_e4m3fnuz" in (weight_dtype_str, act_dtype_str) and not is_hip_cdna3():
264261
pytest.skip("float8_e4m3fnuz only tested on AMD CDNA3 Platform")
265262

python/triton_kernels/tests/test_mxfp.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
upcast_from_mxfp_torch,
2323
)
2424
from triton_kernels.testing import assert_close, assert_equal
25-
from triton_kernels.target_info import is_hip, is_hip_cdna3
25+
from triton_kernels.target_info import is_hip
2626

2727

2828
def dtype_str_to_torch(dtype_str: str) -> torch.dtype:
@@ -146,8 +146,6 @@ def test_mxfp_casting(
146146
if is_hip():
147147
if swizzle_value is not None or swizzle_scale is not None:
148148
pytest.skip("Other swizzling patterns are not supported by AMD GPU")
149-
if quant_dtype == 'float8_e4m3fn' and is_hip_cdna3():
150-
pytest.skip("float8_e4m3fn cast hasn't been fully tested on AMD CDNA3")
151149

152150
swizzle_axis = swizzle_axis if (swizzle_value or swizzle_scale) else None
153151
quant_torch_type = dtype_str_to_torch(quant_dtype)

third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp

Lines changed: 184 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,70 @@ namespace {
2323
//===----------------------------------------------------------------------===//
2424
// Data type conversion utility functions
2525
//===----------------------------------------------------------------------===//
26+
template <typename FPType> struct FPTypeInfo {
27+
FPTypeInfo(Location loc, ConversionPatternRewriter &rewriter,
28+
TritonLLVMOpBuilder &builder)
29+
: loc(loc), rewriter(rewriter), b(builder) {}
30+
IntegerType getIntType() {
31+
if constexpr (std::is_same_v<FPType, Float32Type>) {
32+
return i32_ty;
33+
}
34+
if constexpr (std::is_same_v<FPType, Float16Type> ||
35+
std::is_same_v<FPType, BFloat16Type>) {
36+
return i16_ty;
37+
}
38+
if constexpr (std::is_same_v<FPType, Float8E4M3FNType> ||
39+
std::is_same_v<FPType, Float8E5M2Type>) {
40+
return i8_ty;
41+
}
42+
return nullptr;
43+
}
44+
45+
SmallVector<float> getHalfwayPointsForDstType(TypeID dstTyID) {
46+
if constexpr (std::is_same_v<FPType, Float32Type>) {
47+
if (dstTyID == TypeID::get<Float8E4M3FNType>())
48+
return {0x3a800000, // halfway between [0/8 * 2^-6, 1/8 * 2^-6]
49+
0x3b400000, // halfway between [1/8 * 2^-6, 2/8 * 2^-6]
50+
0x3ba00000, // halfway between [2/8 * 2^-6, 3/8 * 2^-6]
51+
0x3be00000, // halfway between [3/8 * 2^-6, 4/8 * 2^-6]
52+
0x3c100000, // halfway between [4/8 * 2^-6, 5/8 * 2^-6]
53+
0x3c300000, // halfway between [5/8 * 2^-6, 6/8 * 2^-6]
54+
0x3c500000, // halfway between [6/8 * 2^-6, 7/8 * 2^-6]
55+
0x3c700000}; // halfway between [7/8 * 2^-6, 8/8 * 2^-6]
56+
if (dstTyID == TypeID::get<Float8E5M2Type>())
57+
return {0x37000000, // halfway between [0/4 * 2^(-14), 1/4 * 2^(-14)]
58+
0x37c00000, // halfway between [1/4 * 2^(-14), 2/4 * 2^(-14)]
59+
0x38200000, // halfway between [2/4 * 2^(-14), 3/4 * 2^(-14)]
60+
0x38600000}; // halfway between [3/4 * 2^(-14), 4/4 * 2^(-14)]
61+
}
62+
if constexpr (std::is_same_v<FPType, Float16Type>) {
63+
if (dstTyID == TypeID::get<Float8E4M3FNType>())
64+
return {0x1400, 0x1A00, 0x1D00, 0x1F00, 0x2080, 0x2180, 0x2280, 0x2380};
65+
if (dstTyID == TypeID::get<Float8E5M2Type>())
66+
return {0x0080, 0x0180, 0x0200, 0x0380};
67+
}
68+
return {};
69+
}
70+
71+
Value toLLVMIntValue(int32_t val) {
72+
if constexpr (std::is_same_v<FPType, Float32Type>) {
73+
return b.i32_val(val);
74+
}
75+
if constexpr (std::is_same_v<FPType, Float16Type> ||
76+
std::is_same_v<FPType, BFloat16Type>) {
77+
return b.i16_val(val);
78+
}
79+
if constexpr (std::is_same_v<FPType, Float8E4M3FNType> ||
80+
std::is_same_v<FPType, Float8E5M2Type>) {
81+
return b.i8_val(val);
82+
}
83+
return nullptr;
84+
}
85+
Location loc;
86+
ConversionPatternRewriter &rewriter;
87+
TritonLLVMOpBuilder &b;
88+
};
89+
2690
// Convert Ocp Fp8/Bf8 to Fp16/Bf16/Fp32 on CDNA4
2791
template <typename ConvertOp>
2892
static SmallVector<Value>
@@ -111,6 +175,7 @@ cvtScalePkDowncastToFp8(Location loc, ConversionPatternRewriter &rewriter,
111175
static SmallVector<Value>
112176
Fp16_to_Fp8E5M2_RTNE_SW(Location loc, ConversionPatternRewriter &rewriter,
113177
const SmallVector<Value> &v) {
178+
114179
assert(v.size() == 4);
115180
auto b = TritonLLVMOpBuilder(loc, rewriter);
116181

@@ -203,88 +268,155 @@ static Value checkIsNan(TritonLLVMOpBuilder &builder, Value v) {
203268
->getResult(0);
204269
}
205270

206-
// Fp16 -> OCP Fp8 (RTNZ)
207-
208-
// Cast FP16 to FP8E4M3FN in saturation and round-to-nearest-even mode.
271+
// Cast Fp32 or FP16 to FP8E4M3FN in saturation and round-to-nearest-even mode.
209272
// According to
210273
// https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-12-01-pdf-1,
211274
// In saturation mode, inf and out-of-range numbers are converted to the largest
212275
// normal number, i.e. ±448. NaNs are converted to NaNs.
213-
static Value
214-
Fp16_to_Fp8E4M3FN_RTNE_oneValue(Location loc,
215-
ConversionPatternRewriter &rewriter, Value v) {
276+
template <typename SrcFPType>
277+
static Value Fp_to_Fp8E4M3FN_RTNE_oneValue(Location loc,
278+
ConversionPatternRewriter &rewriter,
279+
Value v) {
280+
static_assert((std::is_same_v<SrcFPType, Float32Type>) ||
281+
(std::is_same_v<SrcFPType, Float16Type>));
216282
auto b = TritonLLVMOpBuilder(loc, rewriter);
283+
const llvm::fltSemantics *srcSemantic = nullptr;
284+
if constexpr (std::is_same_v<SrcFPType, Float32Type>)
285+
srcSemantic = &llvm::APFloat::IEEEsingle();
286+
else
287+
srcSemantic = &llvm::APFloat::IEEEhalf();
288+
auto srcWidth = llvm::APFloat::getSizeInBits(*srcSemantic);
289+
auto srcMantissaBits = llvm::APFloat::semanticsPrecision(*srcSemantic) - 1;
290+
auto srcExponentBits = srcWidth - srcMantissaBits - 1;
291+
auto srcBias = (1 << (srcExponentBits - 1)) - 1;
292+
293+
const llvm::fltSemantics &dstSemantic = llvm::APFloat::Float8E4M3FN();
294+
auto dstWidth = llvm::APFloat::getSizeInBits(dstSemantic);
295+
auto dstMantissaBits = llvm::APFloat::semanticsPrecision(dstSemantic) - 1;
296+
auto dstExponentBits = dstWidth - dstMantissaBits - 1;
297+
auto dstBias = (1 << (dstExponentBits - 1)) - 1;
298+
299+
FPTypeInfo<SrcFPType> srcFpInfo(loc, rewriter, b);
300+
FPTypeInfo<Float8E4M3FNType> dstFpInfo(loc, rewriter, b);
301+
auto srcIntType = srcFpInfo.getIntType();
217302
Value isNaN = checkIsNan(b, v);
303+
304+
uint32_t reducedMantissaBits = srcMantissaBits - dstMantissaBits;
305+
Value reducedMantissaValue = srcFpInfo.toLLVMIntValue(reducedMantissaBits);
306+
218307
// Get sign and absolute value
219-
Value vi16 = b.bitcast(v, i16_ty);
308+
Value intVal = b.bitcast(v, srcIntType);
309+
int32_t signMask = 1 << (srcWidth - 1);
220310
Value sign =
221-
b.trunc(i8_ty, b.lshr(b.and_(vi16, b.i16_val(0x8000)), b.i16_val(8)));
222-
vi16 = b.and_(vi16, b.i16_val(0x7FFF));
311+
b.trunc(i8_ty, b.lshr(b.and_(intVal, srcFpInfo.toLLVMIntValue(signMask)),
312+
srcFpInfo.toLLVMIntValue(srcWidth - 8)));
313+
314+
int32_t absoluteMask = signMask - 1;
315+
intVal = b.and_(intVal, srcFpInfo.toLLVMIntValue(absoluteMask));
223316

224317
// Rounding to nearest even
225-
constexpr uint16_t baseRoundingBias = 0x003F; // 1 << (10 - 3 - 1) - 1
318+
uint32_t baseRoundingBias = (1 << (reducedMantissaBits - 1)) - 1;
226319

227-
// S.EEEEE.MMMMMMMMMM => 0.00000.00M0000000 => 0.00000.000000000M
320+
// For Fp16, S.EEEEE.MMMMMMMMMM => 0.00000.00M0000000 => 0.00000.000000000M
321+
uint32_t mantissaLSB = 1 << reducedMantissaBits;
322+
Value mantissaLSBValue = srcFpInfo.toLLVMIntValue(mantissaLSB);
228323
Value remainingMantissaLSB =
229-
b.lshr(b.and_(vi16, b.i16_val(0x0080)), b.i16_val(7));
230-
Value roundingBias = b.add(remainingMantissaLSB, b.i16_val(baseRoundingBias));
231-
Value vFp8 = b.add(vi16, roundingBias);
324+
b.lshr(b.and_(intVal, mantissaLSBValue), reducedMantissaValue);
325+
Value roundingBias =
326+
b.add(remainingMantissaLSB, srcFpInfo.toLLVMIntValue(baseRoundingBias));
327+
Value vFp8 = b.add(intVal, roundingBias);
232328

233329
// Reduce mantissa to 3 bits
234-
vFp8 = b.and_(vFp8, b.i16_val(0xFF80)); // 0xFF80 == 1.11111.1110000000
235-
236-
// 0x2400 is the FP16 representation of 2^{-6}, which is the smallest normal
237-
// number in FP8E4M3FN. We round numbers smaller than that to 0x2400 to make
330+
// For Fp16, reduceMantissaMask == 1.11111.1110000000
331+
uint32_t reduceMantissaMask =
332+
((1 << (1 + srcExponentBits + dstMantissaBits + 1)) - 1)
333+
<< reducedMantissaBits;
334+
Value reduceMantissa = srcFpInfo.toLLVMIntValue(reduceMantissaMask);
335+
vFp8 = b.and_(vFp8, reduceMantissa);
336+
337+
// We round numbers smaller than the minimal normal number in Fp8 to make
238338
// it easier to handle subnormals
239-
vFp8 = b.umax(vFp8, b.i16_val(0x2400));
339+
auto dstSmallest = llvm::APFloat::getSmallestNormalized(dstSemantic);
340+
// Get the srcFpType representation of the minimal normal number in Fp8
341+
bool losesInfo;
342+
dstSmallest.convert(*srcSemantic, APFloat::rmNearestTiesToEven, &losesInfo);
343+
uint32_t dstMinimal =
344+
static_cast<uint32_t>(dstSmallest.bitcastToAPInt().getZExtValue());
345+
vFp8 = b.umax(vFp8, srcFpInfo.toLLVMIntValue(dstMinimal));
240346

241347
// Adjust exponent bias
242-
vFp8 = b.sub(vFp8, b.i16_val(0x2000)); // (15 - 7) << 10
348+
uint32_t expBias = (srcBias - dstBias) << srcMantissaBits;
349+
vFp8 = b.sub(vFp8, srcFpInfo.toLLVMIntValue(expBias));
243350

244351
// Shift right and truncate
245-
vFp8 = b.trunc(i8_ty, b.lshr(vFp8, b.i16_val(7))); // 10 - 3
246-
247-
// 0x5F7F == 0.10111.1101111111 is the largest possible normal
248-
// number(including infinity) after rounding in FP8
249-
//
250-
// In saturation mode, numbers larger than the max normal number(including
251-
// infinity) in FP8 after rounding will be replaced with max_E4M3, i.e. 0x7E
252-
// === 0.1111.110
253-
Value isOverflowOrInf = b.icmp_ugt(vi16, b.i16_val(0x5F7F));
254-
vFp8 = b.select(isOverflowOrInf, b.i8_val(0x7E), vFp8);
352+
vFp8 = b.trunc(i8_ty, b.lshr(vFp8, reducedMantissaValue));
353+
354+
// Any numbers larger than the max normal number(including infinity) in FP8
355+
// after rounding will cause overflow
356+
auto dstLargest = llvm::APFloat::getLargest(dstSemantic);
357+
uint32_t dstMaxPositive =
358+
static_cast<uint32_t>(dstLargest.bitcastToAPInt().getZExtValue());
359+
// Get the srcFpType representation of the maximal normal number in Fp8
360+
dstLargest.convert(*srcSemantic, APFloat::rmNearestTiesToEven, &losesInfo);
361+
uint32_t dstMaxOfSrcType =
362+
static_cast<uint32_t>(dstLargest.bitcastToAPInt().getZExtValue());
363+
364+
// For Fp16, 0x5F7F == 0.10111.1101111111 is the largest possible normal
365+
// number(including infinity) after rounding in FP8E4M3
366+
if constexpr (std::is_same_v<SrcFPType, Float32Type>)
367+
dstMaxOfSrcType |= 0x7ffff;
368+
else
369+
dstMaxOfSrcType |= 0x7f;
370+
Value isOverflowOrInf =
371+
b.icmp_ugt(intVal, srcFpInfo.toLLVMIntValue(dstMaxOfSrcType));
372+
vFp8 =
373+
b.select(isOverflowOrInf, dstFpInfo.toLLVMIntValue(dstMaxPositive), vFp8);
255374

256375
// Round subnormals to nearest even. Ref:
257376
// https://github.com/openxla/xla/blob/f20c6fe2/xla/service/elemental_ir_emitter.cc#L272
258377
constexpr size_t lutSize = 8;
259-
constexpr float halfwayPointsLUT[lutSize] = {0x1400, 0x1A00, 0x1D00, 0x1F00,
260-
0x2080, 0x2180, 0x2280, 0x2380};
378+
auto dstTyID = TypeID::get<Float8E4M3FNType>();
379+
SmallVector<float> halfwayPointsLUT =
380+
srcFpInfo.getHalfwayPointsForDstType(dstTyID);
261381

262382
for (int i = lutSize - 1; i >= 0; i--) {
263383
Value cmp;
264384
if (i % 2 == 0) {
265-
cmp = b.icmp_ule(vi16, b.i16_val(halfwayPointsLUT[i]));
385+
cmp = b.icmp_ule(intVal, srcFpInfo.toLLVMIntValue(halfwayPointsLUT[i]));
266386
} else {
267-
cmp = b.icmp_ult(vi16, b.i16_val(halfwayPointsLUT[i]));
387+
cmp = b.icmp_ult(intVal, srcFpInfo.toLLVMIntValue(halfwayPointsLUT[i]));
268388
}
269389

270390
vFp8 = b.select(cmp, b.i8_val(i), vFp8);
271391
}
272392

273393
// NaN remains NaN after conversion
274-
vFp8 = b.select(isNaN, b.i8_val(0x7F), vFp8);
394+
int32_t positiveNan = (1 << (dstExponentBits + dstMantissaBits)) - 1;
395+
vFp8 = b.select(isNaN, dstFpInfo.toLLVMIntValue(positiveNan), vFp8);
275396

276397
// Set sign bit
277398
vFp8 = b.or_(vFp8, sign);
278399

279400
return vFp8;
280401
}
281402

403+
// Fp32 -> OCP Fp8 (RTNE)
404+
static SmallVector<Value>
405+
Fp32_to_Fp8E4M3FN_RTNE_SW(Location loc, ConversionPatternRewriter &rewriter,
406+
const SmallVector<Value> &v) {
407+
SmallVector<Value> result(2);
408+
result[0] = Fp_to_Fp8E4M3FN_RTNE_oneValue<Float32Type>(loc, rewriter, v[0]);
409+
result[1] = Fp_to_Fp8E4M3FN_RTNE_oneValue<Float32Type>(loc, rewriter, v[1]);
410+
return result;
411+
}
412+
413+
// Fp16 -> OCP Fp8 (RTNE)
282414
static SmallVector<Value>
283415
Fp16_to_Fp8E4M3FN_RTNE_SW(Location loc, ConversionPatternRewriter &rewriter,
284416
const SmallVector<Value> &v) {
285417
SmallVector<Value> result(2);
286-
result[0] = Fp16_to_Fp8E4M3FN_RTNE_oneValue(loc, rewriter, v[0]);
287-
result[1] = Fp16_to_Fp8E4M3FN_RTNE_oneValue(loc, rewriter, v[1]);
418+
result[0] = Fp_to_Fp8E4M3FN_RTNE_oneValue<Float16Type>(loc, rewriter, v[0]);
419+
result[1] = Fp_to_Fp8E4M3FN_RTNE_oneValue<Float16Type>(loc, rewriter, v[1]);
288420
return result;
289421
}
290422

@@ -377,14 +509,21 @@ static SmallVector<Value> Fp8E5M2_to_Fp32(Location loc,
377509
}
378510

379511
// Convert Fp32 to OCP Fp8 on CDNA4
380-
static SmallVector<Value> Fp32_to_Fp8E4M3FN(Location loc,
381-
ConversionPatternRewriter &rewriter,
382-
const SmallVector<Value> &v) {
512+
513+
static SmallVector<Value>
514+
Fp32_to_Fp8E4M3FN_RTNE_HW(Location loc, ConversionPatternRewriter &rewriter,
515+
const SmallVector<Value> &v) {
383516
assert(v.size() == 2);
384517
return cvtScalePkDowncastToFp8<ROCDL::CvtScaleF32PkFp8F32Op>(loc, rewriter,
385518
v[0], v[1]);
386519
}
387520

521+
// Fp32 -> OCP Fp8 (RTNE)
522+
ConverterT Fp32_to_Fp8E4M3FN_RTNE(AMD::ISAFamily isaFamily) {
523+
return isaFamily == AMD::ISAFamily::CDNA4 ? Fp32_to_Fp8E4M3FN_RTNE_HW
524+
: Fp32_to_Fp8E4M3FN_RTNE_SW;
525+
}
526+
388527
// Fp32 -> OCP Bf8 (RTNE)
389528

390529
static SmallVector<Value>
@@ -1343,7 +1482,8 @@ struct FpToFpOpConversion
13431482
Fp32_to_Fp8E4M3FNUZ},
13441483
{{F32TyID, F8E5M2FNUZTyID, RoundingMode::RTNE},
13451484
Fp32_to_Fp8E5M2FNUZ},
1346-
{{F32TyID, F8E4M3FNTyID, RoundingMode::RTNE}, Fp32_to_Fp8E4M3FN},
1485+
{{F32TyID, F8E4M3FNTyID, RoundingMode::RTNE},
1486+
Fp32_to_Fp8E4M3FN_RTNE(isaFamily)},
13471487
{{F32TyID, F8E5M2TyID, RoundingMode::RTNE},
13481488
Fp32_to_Fp8E5M2_RTNE(isaFamily)},
13491489
{{F32TyID, F8E5M2TyID, RoundingMode::RTZ}, Fp32_to_Fp8E5M2_RTZ},
@@ -1406,8 +1546,8 @@ struct FpToFpOpConversion
14061546
// - fp16 -> fp8 with rtne
14071547
// with the following exceptions:
14081548
// 1. fp32 -> ocp fp8/bf8 on CDNA4: has hardware support
1409-
// 2. fp32 -> nanoo fp8/bf8 on non-CDNA4: has hardware support
1410-
// 3. fp32 -> ocp bf8 on non-CDNA4: has software support
1549+
// 2. fp32 -> nanoo fp8/bf8 on CDNA3: has hardware support
1550+
// 3. fp32 -> ocp fp8/bf8 on non-CDNA4: has software support
14111551
bool useFP16IntermediateSrc =
14121552
srcElementType.isF32() && !dstElementType.isF16() &&
14131553
roundingMode == RoundingMode::RTNE &&
@@ -1417,7 +1557,7 @@ struct FpToFpOpConversion
14171557
(llvm::isa<Float8E4M3FNUZType, Float8E5M2FNUZType>(
14181558
dstElementType))) &&
14191559
!(isaFamily != AMD::ISAFamily::CDNA4 &&
1420-
(llvm::isa<Float8E5M2Type>(dstElementType)));
1560+
(llvm::isa<Float8E5M2Type, Float8E4M3FNType>(dstElementType)));
14211561

14221562
// fp8/bf8->f32, if neither nanoo fp8/bf8 on CDNA3 nor ocp fp8/bf8 on CDNA4,
14231563
// is done in two steps: fp8/bf8->fp16 and fp16->fp32

0 commit comments

Comments
 (0)