[AMD] Fix downcast to float8e5 software emulation (#7361)

borontion · web-flow · commit 20a8ac9945c4 · 2025-06-30T16:33:44.000-07:00
Fix downcast to float8e5 software emulation.

This PR re-writes the following software conversions:

- FP16 -&gt; FP8E5 RTNE
- FP32 -&gt; FP8E5 RTNE
- BF16 -&gt; FP8E5 RTNE

It adds additional steps for conversion to handle subnormal
values and saturation mode.
diff --git a/python/test/unit/language/test_conversions.py b/python/test/unit/language/test_conversions.py
@@ -341,9 +341,6 @@ def test_typeconvert_downcast(src_dtype, dst_dtype, rounding, max_repr, device):
             pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on AMDGPU CDNA3")
 
     if is_hip():
-        if dst_dtype == 'float8e5' and rounding == 'rtne' and not is_hip_cdna4():
-            pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on CDNA4")
-
         if dst_dtype == 'float8e4nv':
             if not rounding == 'rtne':
                 pytest.skip("float8e4nv downcast tests only supported with RTNE rounding on AMDGPU")
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp
@@ -108,40 +108,48 @@ cvtScalePkDowncastToFp8(Location loc, ConversionPatternRewriter &rewriter,
 
 // Fp16 -> OCP Bf8 (RTNE)
 
-// FP8E5M2 is the open-compute standard FP8E5M2 format. NVIDIA GPU supports it
-// natively but we don't have hardware native support on CDNA3.
-//
-// The SW based downcast with RTNE is not fully functional for the denorm
-// values. We need rewrite it if we need to emulate this data type on AMDGPU.
 static SmallVector<Value>
 Fp16_to_Fp8E5M2_RTNE_SW(Location loc, ConversionPatternRewriter &rewriter,
                         const SmallVector<Value> &v) {
+  assert(v.size() == 4);
   auto b = TritonLLVMOpBuilder(loc, rewriter);
-  auto fp16x2VecTy = vec_ty(f16_ty, 2);
-  Value fp16x2Vec0 = b.undef(fp16x2VecTy);
-  Value fp16x2Vec1 = b.undef(fp16x2VecTy);
-  fp16x2Vec0 = b.insert_element(fp16x2VecTy, fp16x2Vec0, v[0], b.i32_val(0));
-  fp16x2Vec0 = b.insert_element(fp16x2VecTy, fp16x2Vec0, v[1], b.i32_val(1));
-  fp16x2Vec1 = b.insert_element(fp16x2VecTy, fp16x2Vec1, v[2], b.i32_val(0));
-  fp16x2Vec1 = b.insert_element(fp16x2VecTy, fp16x2Vec1, v[3], b.i32_val(1));
-
-  Value a0 = b.bitcast(fp16x2Vec0, i32_ty);
-  Value a1 = b.bitcast(fp16x2Vec1, i32_ty);
 
-  a0 = b.and_(i32_ty, a0, b.i32_val(0xfffefffe));
-  a1 = b.and_(i32_ty, a1, b.i32_val(0xfffefffe));
-
-  a0 = b.add(i32_ty, a0, b.i32_val(0x00800080));
-  a1 = b.add(i32_ty, a1, b.i32_val(0x00800080));
-
-  auto fp8x4VecTy = vec_ty(i8_ty, 4);
-  a0 = b.bitcast(a0, fp8x4VecTy);
-  a1 = b.bitcast(a1, fp8x4VecTy);
+  SmallVector<Value> result(4);
+  for (size_t i = 0; i < 4; ++i) {
+    Value fp16 = v[i];
+    Value i16 = b.bitcast(fp16, i16_ty);
+
+    Value s = b.and_(i16_ty, i16, b.i16_val(0x8000));
+    Value exp =
+        b.and_(i16_ty, b.lshr(i16_ty, i16, b.i16_val(10)), b.i16_val(0x1F));
+    Value man = b.and_(i16_ty, i16, b.i16_val(0x03FF));
+    Value sig = b.and_(i16_ty, i16, b.i16_val(0x7FFF));
+
+    // Round 10-bit mantissa to 2-bit nearest, ties to even
+    Value bias = b.add(
+        i16_ty,
+        b.lshr(i16_ty, b.and_(i16_ty, sig, b.i16_val(0x0100)), b.i16_val(8)),
+        b.i16_val(0x007F));
+    i16 = b.add(i16_ty, sig, bias);
+
+    // Handle overflow using saturation mode, by setting sig to be the max.
+    // Any number equal or larger than 0x7B80 after rounding (including
+    // infinite 0x7C00) will cause overflow
+    i16 = b.select(b.icmp_uge(sig, b.i16_val(0x7B80)), b.i16_val(0x7B00), i16);
+
+    // Handle NaN value by keeping it Nan
+    i16 = b.select(
+        b.and_(b.icmp_eq(exp, b.i16_val(0x1F)), b.icmp_ne(man, b.i16_val(0x0))),
+        b.i16_val(0x7E00), i16);
+
+    // Add sign bit
+    i16 = b.or_(i16_ty, s, i16);
+
+    // Truncate to 8-bit
+    result[i] = b.trunc(i8_ty, b.lshr(i16_ty, i16, b.i16_val(8)));
+  }
 
-  return {b.extract_element(i8_ty, a0, b.i32_val(1)),
-          b.extract_element(i8_ty, a0, b.i32_val(3)),
-          b.extract_element(i8_ty, a1, b.i32_val(1)),
-          b.extract_element(i8_ty, a1, b.i32_val(3))};
+  return result;
 }
 
 static SmallVector<Value>
@@ -377,15 +385,98 @@ static SmallVector<Value> Fp32_to_Fp8E4M3FN(Location loc,
                                                                v[0], v[1]);
 }
 
-// Convert Fp32 to OCP Bf8 on CDNA4
+// Fp32 -> OCP Bf8 (RTNE)
+
 static SmallVector<Value>
-Fp32_to_Fp8E5M2_RTNE(Location loc, ConversionPatternRewriter &rewriter,
-                     const SmallVector<Value> &v) {
+Fp32_to_Fp8E5M2_RTNE_SW(Location loc, ConversionPatternRewriter &rewriter,
+                        const SmallVector<Value> &v) {
+  assert(v.size() == 4);
+  auto b = TritonLLVMOpBuilder(loc, rewriter);
+
+  SmallVector<Value> result(4);
+  for (size_t i = 0; i < 4; ++i) {
+    Value fp32 = v[i];
+    Value i32 = b.bitcast(fp32, i32_ty);
+
+    Value s = b.and_(i32_ty, i32, b.i32_val(0x80000000));
+    Value exp =
+        b.and_(i32_ty, b.lshr(i32_ty, i32, b.i32_val(23)), b.i32_val(0xFF));
+    Value man = b.and_(i32_ty, i32, b.i32_val(0x007FFFFF));
+
+    // Convert 8-bit exponent to 5-bit
+    Value exp5 = b.select(b.icmp_ult(exp, b.i32_val(0x71)), b.i32_val(0),
+                          b.sub(i32_ty, exp, b.i32_val(0x70)));
+
+    // Handle subnormal values (exp5 = 0)
+    // - exp <  0x6e: mantissa = 0x00000000 (0)
+    // - exp == 0x6e: mantissa = 0x00000000 (0),
+    //                           0x00200000 (1/4)
+    // - exp == 0x6f: mantissa = 0x00200000 (1/4),
+    //                           0x00400000 (1/2)
+    // - exp == 0x70: mantissa = 0x00400000 (1/2),
+    //                           0x00600000 (3/4),
+    //                           0x00800000 (1)
+    man = b.select(b.icmp_ult(exp, b.i32_val(0x6e)), b.i32_val(0), man);
+    man = b.select(b.icmp_eq(exp, b.i32_val(0x6e)),
+                   b.select(b.icmp_ne(man, b.i32_val(0)), b.i32_val(0x00200000),
+                            b.i32_val(0)),
+                   man);
+    man = b.select(b.icmp_eq(exp, b.i32_val(0x6f)),
+                   b.select(b.icmp_uge(man, b.i32_val(0x00400000)),
+                            b.i32_val(0x00400000), b.i32_val(0x00200000)),
+                   man);
+    man = b.select(
+        b.icmp_eq(exp, b.i32_val(0x70)),
+        b.select(b.icmp_ugt(man, b.i32_val(0x00200000)),
+                 b.select(b.icmp_uge(man, b.i32_val(0x00600000)),
+                          b.i32_val(0x00800000), b.i32_val(0x00600000)),
+                 b.i32_val(0x00400000)),
+        man);
+
+    // Round 23-bit mantissa to 2-bit nearest, ties to even
+    Value sig = b.or_(i32_ty, b.shl(i32_ty, exp5, b.i32_val(23)), man);
+    Value bias =
+        b.add(i32_ty,
+              b.lshr(i32_ty, b.and_(i32_ty, sig, b.i32_val(0x00200000)),
+                     b.i32_val(21)),
+              b.i32_val(0x000FFFFF));
+    i32 = b.add(i32_ty, sig, bias);
+
+    // Handle overflow using saturation mode, by setting sig to be the max.
+    // Overflow will happe for the following cases:
+    // - Any number equal or larger than 0x0F700000 after rounding
+    // - Exponent larged than 0x8E (including infinite 0xFF)
+    i32 = b.select(b.or_(b.icmp_ugt(exp, b.i32_val(0x8E)),
+                         b.icmp_uge(sig, b.i32_val(0x0F700000))),
+                   b.i32_val(0x0F7FFFFF), i32);
+
+    // Handle NaN value by keeping it Nan
+    i32 = b.select(
+        b.and_(b.icmp_eq(exp, b.i32_val(0xFF)), b.icmp_ne(man, b.i32_val(0x0))),
+        b.i32_val(0x0FC00000), i32);
+
+    // Add sign bit
+    i32 = b.or_(i32_ty, b.lshr(i32_ty, s, b.i32_val(3)), i32);
+
+    // Truncate to 8-bit
+    result[i] = b.trunc(i8_ty, b.lshr(i32_ty, i32, b.i32_val(21)));
+  }
+  return result;
+}
+
+static SmallVector<Value>
+Fp32_to_Fp8E5M2_RTNE_HW(Location loc, ConversionPatternRewriter &rewriter,
+                        const SmallVector<Value> &v) {
   assert(v.size() == 2);
   return cvtScalePkDowncastToFp8<ROCDL::CvtScaleF32PkBf8F32Op>(loc, rewriter,
                                                                v[0], v[1]);
 }
 
+ConverterT Fp32_to_Fp8E5M2_RTNE(AMD::ISAFamily isaFamily) {
+  return isaFamily == AMD::ISAFamily::CDNA4 ? Fp32_to_Fp8E5M2_RTNE_HW
+                                            : Fp32_to_Fp8E5M2_RTNE_SW;
+}
+
 // Fp32 -> Nanoo Bf8 on CDNA3
 static SmallVector<Value>
 Fp32_to_Fp8E5M2FNUZ(Location loc, ConversionPatternRewriter &rewriter,
@@ -853,86 +944,77 @@ ConverterT Fp8E5M2_to_Bf16(AMD::ISAFamily isaFamily) {
 static SmallVector<Value>
 Bf16_to_Fp8E5M2_SW(Location loc, ConversionPatternRewriter &rewriter,
                    const SmallVector<Value> &v) {
+  assert(v.size() == 4);
   auto b = TritonLLVMOpBuilder(loc, rewriter);
-  auto bf16x2VecTy = vec_ty(bf16_ty, 2);
-  Value bf16x2Vec0 = b.undef(bf16x2VecTy);
-  Value bf16x2Vec1 = b.undef(bf16x2VecTy);
-  bf16x2Vec0 = b.insert_element(bf16x2VecTy, bf16x2Vec0, v[0], b.i32_val(0));
-  bf16x2Vec0 = b.insert_element(bf16x2VecTy, bf16x2Vec0, v[1], b.i32_val(1));
-  bf16x2Vec1 = b.insert_element(bf16x2VecTy, bf16x2Vec1, v[2], b.i32_val(0));
-  bf16x2Vec1 = b.insert_element(bf16x2VecTy, bf16x2Vec1, v[3], b.i32_val(1));
-  bf16x2Vec0 = b.bitcast(bf16x2Vec0, i32_ty);
-  bf16x2Vec1 = b.bitcast(bf16x2Vec1, i32_ty);
-
-  Value sign0 = b.and_(i32_ty, bf16x2Vec0, b.i32_val(0x80008000));
-  Value sign1 = b.and_(i32_ty, bf16x2Vec1, b.i32_val(0x80008000));
-  auto fp8x4VecTy = vec_ty(i8_ty, 4);
-  Value sign = b.undef(fp8x4VecTy);
-  sign0 = b.bitcast(sign0, fp8x4VecTy);
-  sign1 = b.bitcast(sign1, fp8x4VecTy);
-  sign = b.insert_element(fp8x4VecTy, sign,
-                          b.extract_element(i8_ty, sign0, b.i32_val(1)),
-                          b.i32_val(0));
-  sign = b.insert_element(fp8x4VecTy, sign,
-                          b.extract_element(i8_ty, sign0, b.i32_val(3)),
-                          b.i32_val(1));
-  sign = b.insert_element(fp8x4VecTy, sign,
-                          b.extract_element(i8_ty, sign1, b.i32_val(1)),
-                          b.i32_val(2));
-  sign = b.insert_element(fp8x4VecTy, sign,
-                          b.extract_element(i8_ty, sign1, b.i32_val(3)),
-                          b.i32_val(3));
-  sign = b.bitcast(sign, i32_ty);
-
-  Value nosign0 = b.and_(i32_ty, bf16x2Vec0, b.i32_val(0x7fff7fff));
-  Value nosign1 = b.and_(i32_ty, bf16x2Vec1, b.i32_val(0x7fff7fff));
-
-  Value nosign_0_0 = b.and_(i32_ty, nosign0, b.i32_val(0xffff0000));
-  nosign_0_0 = b.umax(i32_ty, nosign_0_0, b.i32_val(0x38000000));
-  nosign_0_0 = b.umin(i32_ty, nosign_0_0, b.i32_val(0x57e00000));
-  Value nosign_0_1 = b.and_(i32_ty, nosign0, b.i32_val(0x0000ffff));
-  nosign_0_1 = b.umax(i32_ty, nosign_0_1, b.i32_val(0x3800));
-  nosign_0_1 = b.umin(i32_ty, nosign_0_1, b.i32_val(0x57e0));
-  nosign0 = b.or_(i32_ty, nosign_0_0, nosign_0_1);
-
-  Value nosign_1_0 = b.and_(i32_ty, nosign1, b.i32_val(0xffff0000));
-  nosign_1_0 = b.umax(i32_ty, nosign_1_0, b.i32_val(0x38000000));
-  nosign_1_0 = b.umin(i32_ty, nosign_1_0, b.i32_val(0x57e00000));
-  Value nosign_1_1 = b.and_(i32_ty, nosign1, b.i32_val(0x0000ffff));
-  nosign_1_1 = b.umax(i32_ty, nosign_1_1, b.i32_val(0x3800));
-  nosign_1_1 = b.umin(i32_ty, nosign_1_1, b.i32_val(0x57e0));
-  nosign1 = b.or_(i32_ty, nosign_1_0, nosign_1_1);
-
-  nosign0 = b.add(i32_ty, nosign0, b.i32_val(0x00100010));
-  nosign1 = b.add(i32_ty, nosign1, b.i32_val(0x00100010));
-  nosign0 = b.sub(i32_ty, nosign0, b.i32_val(0x38003800));
-  nosign1 = b.sub(i32_ty, nosign1, b.i32_val(0x38003800));
-  nosign0 = b.shl(i32_ty, nosign0, b.i32_val(3));
-  nosign1 = b.shl(i32_ty, nosign1, b.i32_val(3));
-
-  nosign0 = b.bitcast(nosign0, fp8x4VecTy);
-  nosign1 = b.bitcast(nosign1, fp8x4VecTy);
-  Value nosign = b.undef(fp8x4VecTy);
-  nosign = b.insert_element(fp8x4VecTy, nosign,
-                            b.extract_element(i8_ty, nosign0, b.i32_val(1)),
-                            b.i32_val(0));
-  nosign = b.insert_element(fp8x4VecTy, nosign,
-                            b.extract_element(i8_ty, nosign0, b.i32_val(3)),
-                            b.i32_val(1));
-  nosign = b.insert_element(fp8x4VecTy, nosign,
-                            b.extract_element(i8_ty, nosign1, b.i32_val(1)),
-                            b.i32_val(2));
-  nosign = b.insert_element(fp8x4VecTy, nosign,
-                            b.extract_element(i8_ty, nosign1, b.i32_val(3)),
-                            b.i32_val(3));
-  nosign = b.bitcast(nosign, i32_ty);
-
-  Value fp8x4Vec = b.or_(i32_ty, nosign, sign);
-  fp8x4Vec = b.bitcast(fp8x4Vec, fp8x4VecTy);
-  return {b.extract_element(i8_ty, fp8x4Vec, b.i32_val(0)),
-          b.extract_element(i8_ty, fp8x4Vec, b.i32_val(1)),
-          b.extract_element(i8_ty, fp8x4Vec, b.i32_val(2)),
-          b.extract_element(i8_ty, fp8x4Vec, b.i32_val(3))};
+
+  SmallVector<Value> result(4);
+  for (size_t i = 0; i < 4; ++i) {
+    Value fp16 = v[i];
+    Value i16 = b.bitcast(fp16, i16_ty);
+
+    Value s = b.and_(i16_ty, i16, b.i16_val(0x8000));
+    Value exp =
+        b.and_(i16_ty, b.lshr(i16_ty, i16, b.i16_val(7)), b.i16_val(0xFF));
+    Value man = b.and_(i16_ty, i16, b.i16_val(0x7F));
+
+    // Convert 8-bit exponent to 5-bit exponent
+    Value exp5 = b.select(b.icmp_ult(exp, b.i16_val(0x71)), b.i16_val(0),
+                          b.sub(i16_ty, exp, b.i16_val(0x70)));
+
+    // Handle subnormal values (exp5 = 0)
+    // - exp <  0x6e: mantissa = 0x0000 (0)
+    // - exp == 0x6e: mantissa = 0x0000 (0),
+    //                           0x0020 (1/4)
+    // - exp == 0x6f: mantissa = 0x0020 (1/4),
+    //                           0x0040 (1/2)
+    // - exp == 0x70: mantissa = 0x0040 (1/2),
+    //                           0x0060 (3/4),
+    //                           0x0080 (1)
+    man = b.select(b.icmp_ult(exp, b.i16_val(0x6e)), b.i16_val(0), man);
+    man = b.select(
+        b.icmp_eq(exp, b.i16_val(0x6e)),
+        b.select(b.icmp_ne(man, b.i16_val(0)), b.i16_val(0x0020), b.i16_val(0)),
+        man);
+    man = b.select(b.icmp_eq(exp, b.i16_val(0x6f)),
+                   b.select(b.icmp_uge(man, b.i16_val(0x0040)),
+                            b.i16_val(0x0040), b.i16_val(0x0020)),
+                   man);
+    man = b.select(b.icmp_eq(exp, b.i16_val(0x70)),
+                   b.select(b.icmp_ugt(man, b.i16_val(0x0020)),
+                            b.select(b.icmp_uge(man, b.i16_val(0x0060)),
+                                     b.i16_val(0x0080), b.i16_val(0x0060)),
+                            b.i16_val(0x0040)),
+                   man);
+
+    // Round 7-bit mantissa to 2-bit
+    Value sig = b.or_(i16_ty, b.shl(i16_ty, exp5, b.i16_val(7)), man);
+    Value bias = b.add(
+        i16_ty,
+        b.lshr(i16_ty, b.and_(i16_ty, sig, b.i16_val(0x0020)), b.i16_val(5)),
+        b.i16_val(0x000F));
+    i16 = b.add(i16_ty, sig, bias);
+
+    // Handle overflow using saturation mode, by setting sig to be the max.
+    // Overflow will happe for the following cases:
+    // - Any number equal or larger than 0x0F70 after rounding
+    // - Exponent larged than 0x8E (including infinite 0xFF)
+    i16 = b.select(b.or_(b.icmp_ugt(exp, b.i16_val(0x8E)),
+                         b.icmp_uge(sig, b.i16_val(0x0F70))),
+                   b.i16_val(0x0F7F), i16);
+
+    // Handle NaN value by keeping it Nan
+    i16 = b.select(
+        b.and_(b.icmp_eq(exp, b.i16_val(0xFF)), b.icmp_ne(man, b.i16_val(0x0))),
+        b.i16_val(0x0FC0), i16);
+
+    // Add sign bit
+    i16 = b.or_(i16_ty, b.lshr(i16_ty, s, b.i16_val(3)), i16);
+
+    // Truncate to 8-bit
+    result[i] = b.trunc(i8_ty, b.lshr(i16_ty, i16, b.i16_val(5)));
+  }
+
+  return result;
 }
 
 static SmallVector<Value>
@@ -1262,7 +1344,8 @@ struct FpToFpOpConversion
             {{F32TyID, F8E5M2FNUZTyID, RoundingMode::RTNE},
              Fp32_to_Fp8E5M2FNUZ},
             {{F32TyID, F8E4M3FNTyID, RoundingMode::RTNE}, Fp32_to_Fp8E4M3FN},
-            {{F32TyID, F8E5M2TyID, RoundingMode::RTNE}, Fp32_to_Fp8E5M2_RTNE},
+            {{F32TyID, F8E5M2TyID, RoundingMode::RTNE},
+             Fp32_to_Fp8E5M2_RTNE(isaFamily)},
             {{F32TyID, F8E5M2TyID, RoundingMode::RTZ}, Fp32_to_Fp8E5M2_RTZ},
             {{F8E4M3FNUZTyID, F32TyID, undefRounding}, Fp8E4M3FNUZ_to_Fp32},
             {{F8E5M2FNUZTyID, F32TyID, undefRounding}, Fp8E5M2FNUZ_to_Fp32},
@@ -1318,16 +1401,23 @@ struct FpToFpOpConversion
       numElements = 4;
     }
 
-    // f32->fp8/bf8 with rtne, if neither nanoo fp8/bf8 on CDNA3 nor ocp fp8/bf8
-    // on CDNA4, is done in two steps: f32->fp16 with rtne and fp16->fp8/bf8
-    // with rtne
+    // fp32 -> fp8 with rtne can be done in two steps:
+    // - fp32 -> fp16 with rtne and
+    // - fp16 -> fp8 with rtne
+    // with the following exceptions:
+    // 1. fp32 -> ocp fp8/bf8 on CDNA4: has hardware support
+    // 2. fp32 -> nanoo fp8/bf8 on non-CDNA4: has hardware support
+    // 3. fp32 -> ocp bf8 on non-CDNA4: has software support
     bool useFP16IntermediateSrc =
         srcElementType.isF32() && !dstElementType.isF16() &&
         roundingMode == RoundingMode::RTNE &&
         !(isaFamily == AMD::ISAFamily::CDNA4 &&
           (llvm::isa<Float8E4M3FNType, Float8E5M2Type>(dstElementType))) &&
         !(isaFamily == AMD::ISAFamily::CDNA3 &&
-          (llvm::isa<Float8E4M3FNUZType, Float8E5M2FNUZType>(dstElementType)));
+          (llvm::isa<Float8E4M3FNUZType, Float8E5M2FNUZType>(
+              dstElementType))) &&
+        !(isaFamily != AMD::ISAFamily::CDNA4 &&
+          (llvm::isa<Float8E5M2Type>(dstElementType)));
 
     // fp8/bf8->f32, if neither nanoo fp8/bf8 on CDNA3 nor ocp fp8/bf8 on CDNA4,
     // is done in two steps: fp8/bf8->fp16 and fp16->fp32