Skip to content

Commit ff57a4d

Browse files
authored
[Bench][AMD] Tune MoE compilation config for GFX950 (#7127)
1 parent 6dd7d6a commit ff57a4d

File tree

4 files changed: +12 additions, −5 deletions

python/triton_kernels/bench/bench_mlp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def roofline_mlp(batch_ranges, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_
198198
x_bw = [x_bw[0], x_comp[0]]
199199
y_bw = [opints[0] * max_tbps, max_tflops]
200200
y_comp = [max_tflops] * len(x_comp)
201-
ax.plot(x_bw, y_bw, "--", label=f"BW-bound ({max_tbps:.0f} TB/s)")
201+
ax.plot(x_bw, y_bw, "--", label=f"BW-bound ({max_tbps:.1f} TB/s)")
202202
ax.plot(x_comp, y_comp, "--", label=f"Compute-bound ({max_tflops:.0f} TFLOP/s)")
203203
# plot data
204204
ax.scatter(xs, perf, marker="+")

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from dataclasses import dataclass
22
import triton
33
from triton_kernels.numerics_details.mxfp import SwizzlingType
4+
from triton_kernels.target_info import get_cdna_version
45
import torch
56

67
from . import opt_flags_amd, opt_flags_nvidia
@@ -55,15 +56,20 @@ def make_default_opt_flags_amd(
5556
tokens_per_expt = max(1, m // routing_data.n_expts_tot)
5657
else:
5758
tokens_per_expt = routing_data.expected_tokens_per_expt
59+
60+
is_cdna4 = get_cdna_version() == 4
5861
# block_m
5962
if constraints.get("block_m", None):
6063
block_m = constraints["block_m"]
6164
elif enforce_bitwise_invariance:
62-
block_m = 128
65+
block_m = 256 if is_cdna4 else 128
6366
elif tokens_per_expt >= 512 and n >= 2048:
67+
block_m = 256 if is_cdna4 else 128
68+
elif is_cdna4 and m >= 512:
6469
block_m = 128
6570
else:
6671
block_m = max(32, min(triton.next_power_of_2(tokens_per_expt), 64))
72+
6773
if routing_data is not None:
6874
grid_m = routing_data.n_blocks(m, block_m)
6975
else:

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags_amd.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
def compute_block_nk(n, block_m, grid_m, num_xcds, lhs_dtype, rhs_dtype, microscaling_ctx):
77
lhs_width = lhs_dtype.itemsize
8-
rhs_width = rhs_dtype.itemsize if microscaling_ctx.weight_scale is None else 0.5
8+
rhs_width = rhs_dtype.itemsize if rhs_dtype != torch.uint8 else 0.5
99

1010
# block_n:
1111
n_cu = torch.cuda.get_device_properties(0).multi_processor_count
@@ -27,6 +27,6 @@ def compute_block_nk(n, block_m, grid_m, num_xcds, lhs_dtype, rhs_dtype, microsc
2727

2828
# TODO: block_k = 128 seems to work better for now.
2929
# perhaps due to increased number of k loops to pipeline
30-
if microscaling_ctx.weight_scale is not None:
30+
if microscaling_ctx.weight_scale is not None and get_cdna_version() != 4:
3131
block_k = 128
3232
return block_n, block_k

python/triton_kernels/triton_kernels/routing.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from .routing_details._routing_compute import _routing_clear_bitmatrix
88
from .routing_details._expt_data import _expt_data_memset
99
from .routing_details._expt_data import _expt_data_compute
10+
from .target_info import is_hip
1011

1112

1213
@dataclass
@@ -202,7 +203,7 @@ def compute_expt_data(expt_hist, n_expts_tot, n_gates):
202203
cdiv = triton.cdiv
203204
# block_ms are all powers-of-two between 16 and 128 (inclusive)
# NOTE(review): with block_m_log2_end raised to 9 on HIP below, the upper
# bound becomes 256 on HIP — this comment is stale after this change.
204205
block_m_log2_start = 4
205-
block_m_log2_end = 8
206+
block_m_log2_end = 9 if is_hip() else 8
206207
block_m_num = block_m_log2_end - block_m_log2_start
207208
if n_gates <= n_expts_tot:
208209
max_n_tiles = n_gates

0 commit comments

Comments (0)