[AMD] Fixing test_aot.py::test_gluon_kernel for gfx1250 (#8958)

guacamoleo · web-flow · commit 8b3fb1e3c5c8 · 2025-12-10T15:39:02.000-08:00
Test `python/test/unit/tools/test_aot.py::test_gluon_kernel` fails on
gfx1250 because the kernel hard-codes a gfx942 property (64 threads/wave)
while the compilation target is still pulled from the driver (even though
this is AOT compilation).

Because Gluon kernels are somewhat architecture-specific, this PR changes
the AOT test to specify the architecture that Gluon is targeting. This fixes
the above-mentioned test failure on gfx1250 (32 threads/wave).
diff --git a/python/test/unit/tools/test_aot.py b/python/test/unit/tools/test_aot.py
@@ -63,7 +63,9 @@ def kernel(C, A, B, M, N, K,
   tl.store(c_ptrs, c)
 """
 
-gluon_kernel_src = """
+
+def get_gluon_kernel_src(threads_per_warp):
+    return f"""
 from triton.experimental import gluon
 from triton.experimental.gluon import language as gl
 
@@ -77,12 +79,13 @@ def kernel(
     BLOCK_N: gl.constexpr,
     BLOCK_K: gl.constexpr
 ):
-    layout: gl.constexpr = gl.BlockedLayout(size_per_thread=[1], threads_per_warp=[64], warps_per_cta=[1], order=[0])
+    layout: gl.constexpr = gl.BlockedLayout(size_per_thread=[1], threads_per_warp=[{threads_per_warp}], warps_per_cta=[1], order=[0])
     offs = gl.arange(0, 64, layout=layout)
     a = gl.load(A + offs)
     gl.store(B + offs, a)
 """
 
+
 test_utils_src = """
 #include <cuda.h>
 #include <stdio.h>
@@ -215,34 +218,21 @@ def write_triton_kernels(dir, src, util_src):
     return kernel_path
 
 
-def _compile_kernel(dir, signature, kernel_name, out_name, out_path, num_warps, grid, kernel_path):
+def _compile_kernel(dir, signature, kernel_name, out_name, out_path, num_warps, grid, kernel_path, target=None):
     compiler_path = os.path.join(triton.tools.__path__[0], "compile.py")
-
-    subprocess.run(
-        [
-            sys.executable,
-            compiler_path,
-            "-n",
-            kernel_name,
-            "--signature",
-            signature,
-            "--out-name",
-            out_name,
-            "-o",
-            out_path,
-            "-w",
-            str(num_warps),
-            "-g",
-            grid,
-            kernel_path,
-        ],
-        check=True,
-        cwd=dir,
-    )
+    cmd_args = [
+        sys.executable, compiler_path, "-n", kernel_name, "--signature", signature, "--out-name", out_name, "-o",
+        out_path, "-w",
+        str(num_warps), "-g", grid
+    ]
+    if target:
+        cmd_args.extend(["-t", "%s:%s:%i" % (target.backend, target.arch, target.warp_size)])
+    cmd_args.append(kernel_path)
+    subprocess.run(cmd_args, check=True, cwd=dir)
 
 
 # Edge case kernel with no specialization
-def compile_aot_kernel_no_specialization(dir, kernel_path, dtype, BM, BN, BK):
+def compile_aot_kernel_no_specialization(dir, kernel_path, dtype, BM, BN, BK, target=None):
     # compile all desired configs
     sig = f"*fp32, *{dtype}, *{dtype}, i32, i32, i32, i32, i32, i32, i32, i32, i32, {BM}, {BN}, {BK}"
     name = f"matmul_{dtype}"
@@ -256,10 +246,11 @@ def compile_aot_kernel_no_specialization(dir, kernel_path, dtype, BM, BN, BK):
         num_warps=1,
         grid=grid,
         kernel_path=kernel_path,
+        target=target,
     )
 
 
-def compile_aot_kernels(dir, kernel_path, dtype, BM, BN, BK, ha_hb_hints):
+def compile_aot_kernels(dir, kernel_path, dtype, BM, BN, BK, ha_hb_hints, target=None):
     # compile all desired configs
     for ha, hb in ha_hb_hints:
         sig = f"*fp32:16, *{dtype}:16, *{dtype}:16, i32, i32, i32, i32{ha}, i32:1, i32{hb}, i32:1, i32:16, i32:1, {BM}, {BN}, {BK}"
@@ -274,6 +265,7 @@ def compile_aot_kernels(dir, kernel_path, dtype, BM, BN, BK, ha_hb_hints):
             num_warps=1,
             grid=grid,
             kernel_path=kernel_path,
+            target=target,
         )
 
 
@@ -492,13 +484,13 @@ def test_ttgir_to_asm():
             assert '.wavefront_size: 64' in amdgcn
 
 
-def test_gluon_kernel():
-    if not is_hip():
-        pytest.skip("Gluon kernel is only supported on HIP")
+@pytest.mark.parametrize("target", [GPUTarget("hip", "gfx942", 64), GPUTarget("hip", "gfx1250", 32)])
+@pytest.mark.skipif(not is_hip(), reason="Requires HIP")
+def test_gluon_kernel(target):
     with tempfile.TemporaryDirectory() as tmp_dir:
         dtype = "fp16"
         BM, BN, BK = 16, 16, 16
-
+        gluon_kernel_src = get_gluon_kernel_src(target.warp_size)
         kernel_path = write_triton_kernels(tmp_dir, gluon_kernel_src, kernel_utils_src)
-        compile_aot_kernel_no_specialization(tmp_dir, kernel_path, dtype, BM, BN, BK)
+        compile_aot_kernel_no_specialization(tmp_dir, kernel_path, dtype, BM, BN, BK, target=target)
         check_hasco_binary_str(tmp_dir, dtype)