Skip to content

Commit ddacd46

Browse files
knwng and ravil-mobile authored
[AMD] Clamp Results in Downcasting to FP8E4M3 and FP8E5M2 (#7337)
There are several conversion ops on the NV side using `satfinite` mode, but on the AMD side, some of those are in non-saturation mode. We need to align AMD ops with NV. For example, fp32 to OCP fp8 on mi350 is lowered to `ROCDL::CvtScaleF32PkFp8F32Op`, and is eventually lowered to `v_cvt_scalef32_pk_fp8_f32`, which, according to ISA, is in non-saturation mode. But on the NV side, it's lowered to `cvt.rn.satfinite.e4m3x2.f32`, which is in saturation mode. Other examples include: | Conversion | ROCDL dialect | Instruction | | ----------------- | ----------------------------- | -------------------------- | | fp32 to fp8e4m3fn | ROCDL::CvtScaleF32PkFp8F32Op | v_cvt_scalef32_pk_fp8_f32 | | fp32 to fp8e5m2 | ROCDL::CvtScaleF32PkBf8F32Op | v_cvt_scalef32_pk_bf8_f32 | | fp16 to fp8e4m3fn | ROCDL::CvtScaleF32PkFp8F16Op | v_cvt_scalef32_pk_fp8_f16 | | fp16 to fp8e5m2 | ROCDL::CvtScaleF32PkBf8F16Op | v_cvt_scalef32_pk_bf8_f16 | | bf16 to fp8e4m3fn | ROCDL::CvtScaleF32PkFp8Bf16Op | v_cvt_scalef32_pk_fp8_bf16 | | bf16 to fp8e5m2 | ROCDL::CvtScaleF32PkBf8Bf16Op | v_cvt_scalef32_pk_bf8_bf16 | This PR fixes this issue by enabling the `FP16_OVFL` flag in the Mode register before these conversion instructions. --------- Co-authored-by: ravil-mobile <[email protected]>
1 parent 677a30c commit ddacd46

File tree

3 files changed

+86
-3
lines changed

3 files changed

+86
-3
lines changed

python/test/unit/language/test_conversions.py

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import triton
88
import triton.language as tl
99

10-
from triton._internal_testing import is_cuda, is_hip, is_hip_cdna3, is_hip_cdna4
10+
from triton._internal_testing import is_cuda, is_hip, is_hip_cdna2, is_hip_cdna3, is_hip_cdna4
1111

1212

1313
def matching_int(dtype):
@@ -366,3 +366,74 @@ def test_typeconvert_downcast(src_dtype, dst_dtype, rounding, max_repr, device):
366366

367367
for i in range(256):
368368
downcast_test(getattr(tl, src_dtype), getattr(tl, dst_dtype), rounding, *stuff, max_repr, i, device=device)
369+
370+
@pytest.mark.parametrize("mode", [
371+
'max', 'min', 'inf', '-inf', 'nan',
372+
])
373+
@pytest.mark.parametrize("dst_dtype", ["float8e4nv", "float8e5"])
374+
@pytest.mark.parametrize("src_dtype", ["float32", "float16", "bfloat16"])
375+
def test_typeconvert_downcast_clamping(src_dtype, dst_dtype, mode, rounding="rtne", device="cuda"):
376+
if is_cuda():
377+
if src_dtype != 'float32' and torch.cuda.get_device_capability(0) < (9, 0):
378+
pytest.skip("non-float32 downcast tests only supported on NVGPU with compute capability 9.0+")
379+
380+
if dst_dtype in ('float8e5', 'float8e4nv') and rounding == 'rtne' and torch.cuda.get_device_capability(0) < (9, 0):
381+
pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on NVGPU with compute capability 9.0+")
382+
elif is_hip():
383+
if is_hip_cdna2():
384+
pytest.skip(f"{dst_dtype} downcast to {dst_dtype} with clamping is not fully tested on AMDGPU CDNA2")
385+
386+
if is_hip_cdna3():
387+
if src_dtype == 'bfloat16' and dst_dtype == 'float8e4nv':
388+
pytest.skip(f"{src_dtype} downcast to {dst_dtype} with clamping is not fully tested on AMDGPU CDNA3")
389+
if dst_dtype == 'float8e5' and mode in ('inf', '-inf'):
390+
pytest.skip(f"Downcast to {dst_dtype} with clamping for `inf` or `-inf` "
391+
"is not fully tested on AMDGPU CDNA3")
392+
393+
converter = {
394+
tl.float8e4nv: torch.float8_e4m3fn,
395+
tl.float8e5: torch.float8_e5m2,
396+
tl.float16: torch.float16,
397+
tl.bfloat16: torch.bfloat16,
398+
tl.float32: torch.float32
399+
}
400+
401+
tl_src_dtype = getattr(tl, src_dtype)
402+
tl_dst_dtype = getattr(tl, dst_dtype)
403+
404+
torch_src_dtype = converter[tl_src_dtype]
405+
torch_dst_dtype = converter[tl_dst_dtype]
406+
407+
if mode in ('max', 'min'):
408+
# Added to input to exceed the representation range to produce NaN
409+
exceed_value = 100.0
410+
test_value = torch.finfo(torch_dst_dtype).max + exceed_value
411+
expected_result = torch.finfo(torch_dst_dtype).max
412+
elif mode in ('inf', '-inf'):
413+
test_value = torch.inf
414+
expected_result = torch.finfo(torch_dst_dtype).max
415+
else:
416+
assert mode == 'nan'
417+
test_value = torch.nan
418+
expected_result = torch.nan
419+
420+
if mode in ('min', '-inf'):
421+
test_value *= -1.0
422+
expected_result *= -1.0
423+
424+
BLOCK_SIZE = 1024
425+
shape = (BLOCK_SIZE * 2,)
426+
src = torch.full(shape, test_value, dtype=torch_src_dtype, device=device)
427+
dst = torch.empty(shape, dtype=torch_dst_dtype, device=device)
428+
429+
type_convert_triton[(src.shape[0] // BLOCK_SIZE,)](
430+
triton.reinterpret(src, torch_src_dtype),
431+
triton.reinterpret(dst, torch_dst_dtype),
432+
rounding,
433+
BLOCK_SIZE
434+
)
435+
436+
if mode == 'nan':
437+
assert(torch.all(torch.isnan(dst)))
438+
else:
439+
torch.testing.assert_close(dst, torch.full_like(dst, expected_result))

python/triton_kernels/tests/test_mxfp.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -146,8 +146,8 @@ def test_mxfp_casting(
146146
if is_hip():
147147
if swizzle_value is not None or swizzle_scale is not None:
148148
pytest.skip("Other swizzling patterns are not supported by AMD GPU")
149-
if quant_dtype == 'float8_e4m3fn':
150-
pytest.skip("float8_e4m3fn cast hasn't been fully tested on AMD GPU")
149+
if quant_dtype == 'float8_e4m3fn' and is_hip_cdna3():
150+
pytest.skip("float8_e4m3fn cast hasn't been fully tested on AMD CDNA3")
151151
if quant_dtype == 'float8_e5m2' and is_hip_cdna3():
152152
pytest.skip("float8_e5m2 cast hasn't been fully tested on AMD CDNA3")
153153

third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,17 @@ static SmallVector<Value>
6565
cvtScalePkDowncastToFp8(Location loc, ConversionPatternRewriter &rewriter,
6666
Value v0, Value v1) {
6767
auto b = TritonLLVMOpBuilder(loc, rewriter);
68+
69+
// This is the location of the fp16_ovfl flag in the Mode register. It's
70+
// calculated following this formula:
71+
// (mode register ID = 1) | (Offset << 6) | ((Width - 1) << 11)
72+
// In this case, Offset = 23 and Width = 1.
73+
// When the bit is 0/1, the conversion from fp32/fp16/bf16 to fp8/bf8 is in
74+
// non-saturation/saturation mode.
75+
Value fp16OVFLModeRegLoc = b.i32_val(1473);
76+
LLVM::createLLVMIntrinsicCallOp(rewriter, loc, "llvm.amdgcn.s.setreg", {},
77+
{fp16OVFLModeRegLoc, b.i32_val(1)});
78+
6879
Type v2I16Ty = vec_ty(i16_ty, 2);
6980
Value v2I16Vec = b.undef(v2I16Ty);
7081
Value scale = b.f32_val(1);
@@ -84,6 +95,7 @@ cvtScalePkDowncastToFp8(Location loc, ConversionPatternRewriter &rewriter,
8495
result = rewriter.create<ConvertOp>(loc, v2I16Ty, v2I16Vec, srcVec, scale,
8596
/*dstLoHiSel=*/false);
8697
}
98+
8799
auto fp8x4VecTy = vec_ty(i8_ty, 4);
88100
auto fp8x4Vec = b.bitcast(result, fp8x4VecTy);
89101
SmallVector<Value> ret(2);

0 commit comments

Comments
 (0)