Skip to content

Commit 1053fca

Browse files
PMylon and antiagainst
authored
[AMD] Emulate fp8 *UZ variants on non-gfx942 architectures (#7401)
This commit emulates fp8 *UZ variants for non-gfx942 architectures. This makes it easier to support workloads targeting gfx942 on other generations. --------- Co-authored-by: Lei Zhang <[email protected]>
1 parent 77ba5d7 commit 1053fca

File tree

6 files changed

+426
-193
lines changed

6 files changed

+426
-193
lines changed

python/test/unit/language/test_compile_errors.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import triton.language as tl
88
from triton.compiler.errors import CompilationError, CompileTimeAssertionFailure
99
import traceback
10-
from triton._internal_testing import is_cuda, is_hip, is_hip_cdna3
10+
from triton._internal_testing import is_cuda, is_hip, is_hip_cdna4
1111

1212

1313
def format_exception(type, value, tb):
@@ -364,17 +364,21 @@ def test_fp8_support(fresh_triton_cache, dtype):
364364
if cc >= (8, 9):
365365
supported_dtypes.append(tl.float8e4nv)
366366
elif is_hip():
367-
supported_dtypes.append(tl.float8e4nv)
368-
if is_hip_cdna3():
369-
supported_dtypes += [tl.float8e4b8, tl.float8e5b16]
367+
supported_dtypes += [tl.float8e4nv, tl.float8e4b8, tl.float8e5b16]
368+
if is_hip_cdna4():
369+
warning_dtypes += [tl.float8e4b8, tl.float8e5b16]
370370

371371
@triton.jit
372372
def dtype_kernel(dtype: tl.constexpr):
373373
a = tl.full((64, 64), 0.0, dtype)
374374
tl.dot(a, a)
375375

376376
if dtype in warning_dtypes:
377-
ctx = pytest.warns(UserWarning, match=r"the use of fp8e4b15 is deprecated on Hopper and later architectures")
377+
if is_cuda():
378+
ctx = pytest.warns(UserWarning,
379+
match=r"the use of fp8e4b15 is deprecated on Hopper and later architectures")
380+
elif is_hip_cdna4():
381+
ctx = pytest.warns(UserWarning, match=r"AMD gfx942 specific and not supported on gfx950")
378382
elif dtype in supported_dtypes:
379383
ctx = contextlib.nullcontext()
380384
else:

python/test/unit/language/test_conversions.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import triton
88
import triton.language as tl
99

10-
from triton._internal_testing import is_cuda, is_hip, is_hip_cdna3, is_hip_cdna4
10+
from triton._internal_testing import is_cuda, is_hip, is_hip_cdna2, is_hip_cdna3, is_hip_cdna4
1111

1212

1313
def matching_int(dtype):
@@ -265,6 +265,7 @@ def upcast_test(src_dtype, dst_dtype, exponent_bits, mantissa_bits, exponent_bia
265265
('float8e4nv', 'float32'),
266266
267267
('float8e4b8', 'float32'),
268+
('float8e4b8', 'bfloat16'),
268269
('float8e4b8', 'float16'),
269270
270271
('float8e5b16', 'float32'),
@@ -284,12 +285,13 @@ def test_typeconvert_upcast(src_dtype, dst_dtype, device):
284285
elif is_hip():
285286
if (src_dtype == 'float8e4nv' and not (is_hip_cdna3() or is_hip_cdna4())):
286287
pytest.skip(f"upcasting {src_dtype} to {dst_dtype} not supported in this architecture")
287-
if (src_dtype in ('float8e4b15') or
288-
(src_dtype in ('float8e4b8', 'float8e5b16') and not is_hip_cdna3())):
288+
if src_dtype == 'float8e4b15':
289289
# If the dtype should error out in the given device, we assert that and return
290290
with pytest.raises(triton.CompilationError, match="not supported in this architecture"):
291291
launch_exhaustive_populate(getattr(tl, src_dtype), 0, 65536, False, 8, 0x7f, device=device)
292292
return
293+
if src_dtype in ('float8e4b8', 'float8e5b16') and is_hip_cdna2():
294+
pytest.skip(f"{src_dtype} is not supported on AMDGPU CDNA2")
293295

294296
# dtype : (exponent_bits, mantissa_bits, exponent_bias, max_repr)
295297
stuff = {
@@ -341,8 +343,8 @@ def test_typeconvert_downcast(src_dtype, dst_dtype, rounding, max_repr, device):
341343
pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on AMDGPU CDNA3")
342344

343345
if is_hip():
344-
if dst_dtype in ('float8e5b16', 'float8e4b8') and rounding == 'rtne' and not is_hip_cdna3():
345-
pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on AMDGPU CDNA3")
346+
if dst_dtype in ('float8e4b8', 'float8e5b16') and is_hip_cdna2():
347+
pytest.skip(f"{dst_dtype} is not supported on AMDGPU CDNA2")
346348

347349
# dtype : (exponent_bits, mantissa_bits, exponent_bias)
348350
stuff = {

python/triton/language/semantic.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1487,6 +1487,18 @@ def dot(self, lhs: TensorTy, rhs: TensorTy, acc: TensorTy, input_precision: Opti
14871487
lhs = self.cast(lhs, tl.float16)
14881488
rhs = self.cast(rhs, tl.float16)
14891489

1490+
uses_fp8e4b8 = lhs.dtype.is_fp8e4b8() or rhs.dtype.is_fp8e4b8()
1491+
uses_fp8e5b16 = lhs.dtype.is_fp8e5b16() or rhs.dtype.is_fp8e5b16()
1492+
if uses_fp8e4b8 or uses_fp8e5b16:
1493+
type_name = "fp8e4b8" if uses_fp8e4b8 else "fp8e5b16"
1494+
if type_name in self.builder.options.deprecated_fp8_dot_operand_dtypes:
1495+
arch = self.builder.options.arch
1496+
warnings.warn(
1497+
f"{type_name} is AMD gfx942 specific and not supported on {arch} so it's upcasted to fp16 and can cause significant slow down. "
1498+
f"Please use OCP fp8 variants on {arch} for performance")
1499+
lhs = self.cast(lhs, tl.float16)
1500+
rhs = self.cast(rhs, tl.float16)
1501+
14901502
if input_precision is None:
14911503
input_precision = self.builder.options.default_dot_input_precision
14921504

test/Conversion/amd/fp_to_fp.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
9090
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
9191
tt.func @downcast_to_bf8(%arg0: tensor<8x8xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>) {
9292
// GFX942-COUNT-4: rocdl.cvt.pk.bf8.f32
93-
// GFX950-COUNT-4: llvm.fptrunc %{{.+}} : vector<2xf32> to vector<2xf16>
93+
// GFX950-COUNT-16: llvm.trunc %{{.+}} : i32 to i8
9494
%6 = tt.fp_to_fp %arg0, rounding = rtne : tensor<8x8xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> -> tensor<8x8xf8E5M2FNUZ, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>
9595
tt.return
9696
}
@@ -103,7 +103,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
103103
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
104104
tt.func @f32_to_f8(%arg0: tensor<8x8xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>) {
105105
// GFX942-COUNT-4: rocdl.cvt.pk.fp8.f32
106-
// GFX950-COUNT-4: llvm.fptrunc %{{.+}} : vector<2xf32> to vector<2xf16>
106+
// GFX950-COUNT-16: llvm.trunc %{{.+}} : i32 to i8
107107
%7 = tt.fp_to_fp %arg0, rounding = rtne : tensor<8x8xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> -> tensor<8x8xf8E4M3FNUZ, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>
108108
tt.return
109109
}

third_party/amd/backend/compiler.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,11 @@ class HIPOptions:
3737
debug: bool = False
3838
sanitize_overflow: bool = True
3939
arch: str = None
40-
# We have native support for OCP fp8 variants since CNDA4/RDNA4. For earlier generations,
40+
# We have native support for OCP fp8 variants since CDNA4/RDNA4. For earlier generations,
4141
# we software emulate the support for them.
42-
supported_fp8_dtypes: Tuple[str] = ("fp8e4nv", "fp8e5")
42+
# UZ fp8 variants (fp8e4b8 and fp8e5b16) are natively supported for CDNA3. For other
43+
# architectures they are software emulated.
44+
supported_fp8_dtypes: Tuple[str] = ("fp8e4nv", "fp8e5", "fp8e5b16", "fp8e4b8")
4345
deprecated_fp8_dot_operand_dtypes: Tuple[str] = ()
4446
default_dot_input_precision: str = "ieee"
4547
allowed_dot_input_precisions: Tuple[str] = ("ieee", )
@@ -109,11 +111,12 @@ def parse_options(self, opts) -> Any:
109111
args["allowed_dot_input_precisions"] = tuple(sorted(allowed_dot_input_precisions))
110112

111113
if "supported_fp8_dtypes" not in opts:
112-
supported_fp8_dtypes = set(HIPOptions.supported_fp8_dtypes)
113-
if self.target.arch == 'gfx942':
114-
# CDNA3/gfx942 has native support for AMD specific FP8 types.
115-
supported_fp8_dtypes.update({'fp8e4b8', 'fp8e5b16'})
116-
args["supported_fp8_dtypes"] = tuple(sorted(supported_fp8_dtypes))
114+
args["supported_fp8_dtypes"] = tuple(sorted(HIPOptions.supported_fp8_dtypes))
115+
116+
if self.target.arch == 'gfx950':
117+
deprecated_fp8_dot_operand_dtypes = set(HIPOptions.deprecated_fp8_dot_operand_dtypes)
118+
deprecated_fp8_dot_operand_dtypes.update({"fp8e5b16", "fp8e4b8"})
119+
args["deprecated_fp8_dot_operand_dtypes"] = tuple(sorted(deprecated_fp8_dot_operand_dtypes))
117120

118121
if "enable_fp_fusion" not in opts:
119122
args["enable_fp_fusion"] = knobs.language.default_fp_fusion

0 commit comments

Comments
 (0)