 from triton_kernels.matmul_ogs import matmul_ogs, PrecisionConfig, FlexCtx, FnSpecs, FusedActivation
 from triton_kernels.numerics import InFlexData
 from triton_kernels.routing import routing
-from triton_kernels.target_info import is_hip, get_cdna_version
+from triton_kernels.target_info import is_hip, get_cdna_version, is_cuda
 from triton_kernels.tensor import convert_layout
 from triton_kernels.tensor_details.layout import StridedLayout, BlackwellMXScaleLayout, HopperMXScaleLayout, HopperMXValueLayout
 from triton_kernels.tensor import wrap_torch_tensor, FP4
@@ -101,14 +101,15 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype, TP,
     optg = dict()
     opt1 = dict()
     opt2 = dict()
-    if w_dtype == "mx4" and not is_hip():
+    if w_dtype == "mx4":
         value_layout = StridedLayout
         scale_layout = StridedLayout
-        if torch.cuda.get_device_capability()[0] == 9:
-            value_layout = HopperMXValueLayout
-            scale_layout = HopperMXScaleLayout
-        if torch.cuda.get_device_capability()[0] == 10:
-            scale_layout = BlackwellMXScaleLayout
+        if is_cuda():
+            if torch.cuda.get_device_capability()[0] == 9:
+                value_layout = HopperMXValueLayout
+                scale_layout = HopperMXScaleLayout
+            if torch.cuda.get_device_capability()[0] == 10:
+                scale_layout = BlackwellMXScaleLayout
         opt1 = {"value_layout": value_layout, "scale_layout": scale_layout}
         opt2 = deepcopy(opt1)
     wg, wg_flex, wg_scale = quantize(wg, "bf16", dev, **optg)
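For reference, the layout selection this patch arrives at can be read as the standalone sketch below. It only restates the logic visible in the hunk above: mx4 quantization now takes the layout path on both CUDA and HIP, defaults to strided layouts, and picks the Hopper/Blackwell MX layouts only when running on a CUDA device of the matching compute capability. The helper name pick_mx4_layouts is hypothetical; is_cuda and the layout classes are the ones imported in the diff.

import torch

from triton_kernels.target_info import is_cuda
from triton_kernels.tensor_details.layout import (
    StridedLayout,
    BlackwellMXScaleLayout,
    HopperMXScaleLayout,
    HopperMXValueLayout,
)


def pick_mx4_layouts():
    # Hypothetical helper mirroring the patched bench_mlp logic.
    # Default: plain strided layouts (used on HIP and on CUDA archs
    # without a specialized MX layout).
    value_layout = StridedLayout
    scale_layout = StridedLayout
    if is_cuda():
        major = torch.cuda.get_device_capability()[0]
        if major == 9:  # Hopper: specialized value and scale layouts
            value_layout = HopperMXValueLayout
            scale_layout = HopperMXScaleLayout
        if major == 10:  # Blackwell: specialized scale layout only
            scale_layout = BlackwellMXScaleLayout
    return {"value_layout": value_layout, "scale_layout": scale_layout}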