[Bench][AMD] Fix matmul tests for gfx950 (#6965)

knwng · web-flow · commit 855fe3a9d91a · 2025-06-09T19:15:27.000-07:00
This PR is part of the effort to fix the benchmark tests on AMD gfx950
hardware.
With this change, `pytest -s test_matmul.py` runs with no failures (some
tests are skipped for now).
diff --git a/python/triton_kernels/bench/bench_mlp.py b/python/triton_kernels/bench/bench_mlp.py
@@ -33,8 +33,8 @@ def quantize(w, dtype, dev, **opt):
                    MicroscalingCtx()
     else:
         assert dtype == "mx4", f"{dtype=}"
-        swizzle_mx_scale = opt["swizzle_mx_scale"]
-        swizzle_mx_value = opt["swizzle_mx_value"]
+        swizzle_mx_scale = opt.get("swizzle_mx_scale", None)
+        swizzle_mx_value = opt.get("swizzle_mx_value", None)
         swizzle_axis = 2 if swizzle_mx_scale else None
         w = w.to(torch.bfloat16)
         w, mx_scales, weight_scale_shape = downcast_to_mxfp(w, torch.uint8, axis=1, swizzle_axis=swizzle_axis,
diff --git a/python/triton_kernels/tests/test_matmul.py b/python/triton_kernels/tests/test_matmul.py
@@ -17,7 +17,7 @@
 # testing utilities
 from triton_kernels.testing import assert_close, compute_actual_scale
 # target-specific utilities
-from triton_kernels.target_info import is_hip
+from triton_kernels.target_info import is_hip, is_hip_cdna3, is_cuda, is_hip_cdna4
 
 # ---------------
 # initialize data
@@ -75,18 +75,19 @@ def init_compute_data(m, n, k, gindx, sindx, n_expts_tot, n_expts_act, n_expt_sh
 # ---------------
 
 
-def init_precision(out_dtype, act_use_flexpoint, weight_use_flexpoint, n_expts_tot=1, mx_ctx=MicroscalingCtx(),
-                   device="cuda"):
+def init_precision(out_dtype, weight_dtype, is_mixed_input, n_expts_tot=1, mx_ctx=MicroscalingCtx(), device="cuda"):
+    act_use_flexpoint = out_dtype.itemsize == 1
+    weight_use_flexpoint = weight_dtype.itemsize == 1 and not is_mixed_input
     # flexpoint
     make_tensor = lambda val0, val1: torch.tensor([val0, val1] * (n_expts_tot // 2) +
                                                   ([val0]
                                                    if n_expts_tot % 2 else []), dtype=torch.float32, device=device)
     make_scalar = lambda val: torch.tensor([val], dtype=torch.float32, device=device)
-    in_flex_data = lambda scale, use_flex: InFlexData(dtype=torch.float8_e5m2, scale=make_scalar(scale)
+    in_flex_data = lambda scale, use_flex: InFlexData(dtype=out_dtype, scale=make_scalar(scale)
                                                       ) if use_flex else InFlexData()
-    in_flex_edata = lambda scale0, scale1, use_flex: InFlexData(dtype=torch.float8_e5m2, scale=make_tensor(
-        scale0, scale1)) if use_flex else InFlexData()
-    out_flex_data = lambda scale, use_flex: OutFlexData(dtype=torch.float8_e5m2, expected_scale=make_scalar(
+    in_flex_edata = lambda scale0, scale1, use_flex: InFlexData(dtype=weight_dtype, scale=make_tensor(scale0, scale1)
+                                                                ) if use_flex else InFlexData()
+    out_flex_data = lambda scale, use_flex: OutFlexData(dtype=out_dtype, expected_scale=make_scalar(
         scale), actual_scale=make_scalar(0), checksum_scale=make_scalar(0)) if use_flex else OutFlexData()
     flex_ctx = FlexCtx(
         lhs_data=in_flex_data(1.25, act_use_flexpoint),
@@ -211,8 +212,11 @@ class Case:
             Case(1000, 400, 400, "ragged", "float8_e4m3fnuz", "float8_e4m3fnuz", 3, 1),
             Case(600, 400, 400, "ragged", "float8_e4m3fnuz", "float8_e4m3fnuz", 4, 2),
             Case(600, 400, 400, "ragged", "float8_e4m3fnuz", "float8_e4m3fnuz", 4, 2, n_expt_shards=2),
-            Case(600, 400, 400, "ragged", "float8_e4m3fnuz", "float8_e4m3fnuz", 4, 2),
             Case(600, 400, 400, "ragged", "float8_e4m3fnuz", "float8_e4m3fnuz", 4, 2, split_k=2),
+            Case(300, 400, 400, "ragged", "float8_e4m3fn", "float8_e4m3fn"),
+            Case(1000, 400, 400, "ragged", "float8_e4m3fn", "float8_e4m3fn", 3, 1),
+            Case(600, 400, 400, "ragged", "float8_e4m3fn", "float8_e4m3fn", 4, 2),
+            Case(600, 400, 400, "ragged", "float8_e4m3fn", "float8_e4m3fn", 4, 2, n_expt_shards=2),
         ]
     ],
 )
@@ -230,16 +234,26 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas
             n_expts_act, n_expt_shards, mode, act_dtype_str, weight_dtype_str, block_m, hbm_swizzling, epilogue_subtile,
             device, opt_flags_scope):
     # TODO: remove when Triton FP8 supports proper RTNE
-    if "float8" in weight_dtype_str and torch.cuda.get_device_capability()[0] < 9:
-        pytest.skip("Float8 not tested on A100")
-    if "float8_e4m3fnuz" in weight_dtype_str and not is_hip():
-        pytest.skip("float8_e4m3fnuz only tested on HIP platforms")
-    if "mx" in weight_dtype_str and is_hip():
-        pytest.skip("mxfloat* only tested on CUDA platforms")
-    if "float16" in act_dtype_str and "mx" in weight_dtype_str and torch.cuda.get_device_capability()[0] >= 10:
-        pytest.skip("float16 x mx not supported with cuda capability >= 10")
-    if "float8" in act_dtype_str and "mx" in weight_dtype_str and torch.cuda.get_device_capability()[0] < 10:
-        pytest.skip("float8 x mx not supported with cuda capability < 10")
+    if is_cuda():
+        if "float8" in weight_dtype_str and torch.cuda.get_device_capability()[0] < 9:
+            pytest.skip("Float8 not tested on A100")
+        if "float16" in act_dtype_str and "mx" in weight_dtype_str and torch.cuda.get_device_capability()[0] >= 10:
+            pytest.skip("float16 x mx not supported with cuda capability >= 10")
+        if "float8" in act_dtype_str and "mx" in weight_dtype_str and torch.cuda.get_device_capability()[0] < 10:
+            pytest.skip("float8 x mx not supported with cuda capability < 10")
+    elif is_hip():
+        if "float8" in act_dtype_str and "mx" in weight_dtype_str and not is_hip_cdna4():
+            pytest.skip("float8 x mx only supported on CDNA4")
+        if "float8" in act_dtype_str and "mxfloat8" in weight_dtype_str:
+            pytest.skip("NYI: float8 x mxfloat8 not tested on AMD GPU")
+        if is_persistent:
+            pytest.skip("NYI: Persistent kernel not supported on AMD GPU")
+        if split_k > 1:
+            pytest.skip("splitK hasn't been fully tested on AMD GPU.")
+
+    if "float8_e4m3fnuz" in (weight_dtype_str, act_dtype_str) and not is_hip_cdna3():
+        pytest.skip("float8_e4m3fnuz only tested on AMD CDNA3 Platform")
+
     if fused_scatter and split_k > 1:
         pytest.skip("fused scatter scratchpad not supported with split_k")
     if hbm_swizzling:
@@ -284,9 +298,7 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas
     weight_dtype = dtype_str_to_torch(weight_dtype_str)
     act_dtype = dtype_str_to_torch(act_dtype_str)
     act_is_float8 = act_dtype.itemsize == 1
-    weight_is_float8 = weight_dtype.itemsize == 1
-    precision_opt = init_precision(act_dtype, act_is_float8, weight_is_float8 and not is_mixed_input,
-                                   n_expts_tot // n_expt_shards, device=device)
+    precision_opt = init_precision(act_dtype, weight_dtype, is_mixed_input, n_expts_tot // n_expt_shards, device=device)
     # precision_opt.x_pad_trans_requires_flexpoint = False
     if mode == "ragged":
         m, rdata, gindx, sindx = init_routing_data(m, n_expts_tot, n_expts_act, n_expt_shards, do_gather, do_scatter,
@@ -456,7 +468,7 @@ def test_fused_act(m, n, k, mode, split_k, do_gather, do_scatter, fused_scatter,
     else:
         rdata = gindx = sindx = None
 
-    precision_opt = init_precision(act_dtype, False, False, n_expts_tot // n_expt_shards, device=device)
+    precision_opt = init_precision(act_dtype, weight_dtype, False, n_expts_tot // n_expt_shards, device=device)
     x, w, bias, _, _ = init_compute_data(m, n, k, gindx, sindx, n_expts_tot, n_expts_act, n_expt_shards, mode,
                                          act_dtype, weight_dtype, False, requires_grad=False, device=device)
 
diff --git a/python/triton_kernels/triton_kernels/matmul_ogs.py b/python/triton_kernels/triton_kernels/matmul_ogs.py
@@ -313,7 +313,7 @@ def apply_preprocessing_features(x, w, gather_indx, scatter_indx, routing_data,
     has_fused_scatter_scratchpad = opt_flags.fused_scatter and routing_data.n_expts_act > 1
     if has_fused_scatter_scratchpad:
         M = scatter_indx.src_indx.shape[0]
-        writeback_idxs = torch.empty((M,), dtype=torch.int32, device=x.device)
+        writeback_idxs = torch.zeros((M,), dtype=torch.int32, device=x.device)
         writeback_size = writeback_idxs.shape[0]
         finalize_scatter_idxs = torch.zeros((M // routing_data.n_expts_act + M + 1,), dtype=torch.int32, device=x.device)
         BLOCK_M=256
@@ -494,12 +494,12 @@ def init_allocation(x, w, precision_config, fused_activation, routing_data, gath
 def apply_allocation(allocation: MatmulAllocation, output):
     ret = dict()
     if output is None:
-        output = torch.empty(allocation.output[0], device=allocation.device, dtype=allocation.output[1])
+        output = torch.zeros(allocation.output[0], device=allocation.device, dtype=allocation.output[1])
     else:
         assert output.shape == allocation.output[0]
     ret["output"] = output[None, :, :]
     ret["scratchpad"] = {
-        k: torch.empty(v[0], device=allocation.device, dtype=v[1])
+        k: torch.zeros(v[0], device=allocation.device, dtype=v[1])
             for k, v in allocation.scratchpads.items()
     }
     return ret
diff --git a/python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py b/python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py
@@ -81,10 +81,7 @@ def make_default_opt_flags_amd(
     # TODO: Does opt_flags_amd.compute_block_nk need to be refactored?
     if constraints.get("block_k", None) is not None:
         block_k = constraints["block_k"]
-    if constraints.get("is_persistent", None) is not None:
-        is_persistent = constraints["is_persistent"]
-    else:
-        is_persistent = False
+    is_persistent = constraints.get("is_persistent", False)
     # split_k:
     if constraints.get("split_k", None) is not None:
         split_k = constraints["split_k"]
@@ -99,14 +96,6 @@ def make_default_opt_flags_amd(
     # num_warps, num_stages
     num_warps = 2 if (m is not None and m <= 16) else 8
     num_stages = 2
-    if constraints.get("fused_scatter", None) is not None:
-        fused_scatter = constraints["fused_scatter"]
-    else:
-        fused_scatter = False
-    if constraints.get("epilogue_subtile", None) is not None:
-        epilogue_subtile = constraints["epilogue_subtile"]
-    else:
-        epilogue_subtile = None
     # AMD-specific
     target_kernel_kwargs = {"waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1}
     ret = OptFlags(
@@ -119,9 +108,9 @@ def make_default_opt_flags_amd(
         xcd_swizzle=xcd_swizzle,
         w_cache_modifier=w_cache_modifier,
         split_k=split_k,
-        fused_scatter=fused_scatter,
+        fused_scatter=constraints.get('fused_scatter', False),
         is_persistent=is_persistent,
-        epilogue_subtile=epilogue_subtile,
+        epilogue_subtile=constraints.get('epilogue_subtile', None),
         arch=None,
         target_kernel_kwargs=target_kernel_kwargs,
     )
diff --git a/python/triton_kernels/triton_kernels/target_info.py b/python/triton_kernels/triton_kernels/target_info.py
@@ -4,12 +4,35 @@
 cached_capabilities = {}
 
 
+def is_cuda():
+    if "is_cuda" not in cached_capabilities:
+        target = triton.runtime.driver.active.get_current_target()
+        cached_capabilities["is_cuda"] = False if target is None else target.backend == "cuda"
+    return cached_capabilities["is_cuda"]
+
+
 def is_hip():
     if "is_hip" not in cached_capabilities:
         cached_capabilities["is_hip"] = torch.cuda.is_available() and bool(torch.version.hip)
     return cached_capabilities["is_hip"]
 
 
+def is_hip_cdna3():
+    if "is_hip_cdna3" not in cached_capabilities:
+        target = triton.runtime.driver.active.get_current_target()
+        cached_capabilities["is_hip_cdna3"] = (target is not None and target.backend == 'hip'
+                                               and target.arch == 'gfx942')
+    return cached_capabilities["is_hip_cdna3"]
+
+
+def is_hip_cdna4():
+    if "is_hip_cdna4" not in cached_capabilities:
+        target = triton.runtime.driver.active.get_current_target()
+        cached_capabilities["is_hip_cdna4"] = (target is not None and target.backend == 'hip'
+                                               and target.arch == 'gfx950')
+    return cached_capabilities["is_hip_cdna4"]
+
+
 def cuda_capability_geq(major, minor=0):
     """
     Determines whether we have compute capability >= (major, minor) and