[TESTS] Reduce AOT test time (#7068)

Jokeren · web-flow · commit c6d96249841c · 2025-06-06T09:48:59.000-04:00
Remove duplicate tests and unnecessary synchronization.

On some platforms, this reduces the AOT test time from 50s to 20s.
diff --git a/python/test/unit/tools/test_aot.py b/python/test/unit/tools/test_aot.py
@@ -143,7 +143,6 @@ def gen_test_bin(dir, M, N, K, exe="test", algo_id=0):
   cuMemcpyHtoD(B, hB, K*N*2);
 
   // launch kernel
-  cuStreamSynchronize(stream);
   CUresult ret;
   int algo_id = {algo_id};
   if (algo_id == 0) {{
@@ -154,8 +153,6 @@ def gen_test_bin(dir, M, N, K, exe="test", algo_id=0):
   if (ret != 0) fprintf(stderr, "kernel launch failed\\n");
   assert(ret == 0);
 
-  cuStreamSynchronize(stream);
-
   // read data
   int32_t hC[M*N];
   memset(hC, 0, M*N*4);
@@ -241,21 +238,20 @@ def compile_aot_kernel_no_specialization(dir, kernel_path, dtype, BM, BN, BK):
 
 def compile_aot_kernels(dir, kernel_path, dtype, BM, BN, BK, ha_hb_hints):
     # compile all desired configs
-    for ha in ha_hb_hints:
-        for hb in ha_hb_hints:
-            sig = f"*fp32:16, *{dtype}:16, *{dtype}:16, i32, i32, i32, i32{ha}, i32:1, i32{hb}, i32:1, i32:16, i32:1, {BM}, {BN}, {BK}"
-            name = f"matmul_{dtype}"
-            grid = f"M/{BM}, N/{BN}, 1"
-            _compile_kernel(
-                dir=dir,
-                signature=sig,
-                kernel_name="kernel",
-                out_name=name,
-                out_path=name,
-                num_warps=1,
-                grid=grid,
-                kernel_path=kernel_path,
-            )
+    for ha, hb in ha_hb_hints:
+        sig = f"*fp32:16, *{dtype}:16, *{dtype}:16, i32, i32, i32, i32{ha}, i32:1, i32{hb}, i32:1, i32:16, i32:1, {BM}, {BN}, {BK}"
+        name = f"matmul_{dtype}"
+        grid = f"M/{BM}, N/{BN}, 1"
+        _compile_kernel(
+            dir=dir,
+            signature=sig,
+            kernel_name="kernel",
+            out_name=name,
+            out_path=name,
+            num_warps=1,
+            grid=grid,
+            kernel_path=kernel_path,
+        )
 
 
 def link_aot_kernels(dir):
@@ -317,7 +313,7 @@ def test_compile_link_matmul():
         BM, BN, BK = 16, 16, 16
 
         kernel_path = write_triton_kernels(tmp_dir, kernel_src, kernel_utils_src)
-        compile_aot_kernels(tmp_dir, kernel_path, dtype, BM, BN, BK, ha_hb_hints=["", ":16"])
+        compile_aot_kernels(tmp_dir, kernel_path, dtype, BM, BN, BK, ha_hb_hints=[(":16", ":16")])
         link_aot_kernels(tmp_dir)
 
         # compile test case
@@ -348,7 +344,7 @@ def test_launcher_has_no_available_kernel():
         BM, BN, BK = 16, 16, 16
 
         kernel_path = write_triton_kernels(tmp_dir, kernel_src, kernel_utils_src)
-        compile_aot_kernels(tmp_dir, kernel_path, dtype, BM, BN, BK, ha_hb_hints=[":1"])
+        compile_aot_kernels(tmp_dir, kernel_path, dtype, BM, BN, BK, ha_hb_hints=[(":1", ":1")])
         link_aot_kernels(tmp_dir)
 
         # compile test case
@@ -385,14 +381,13 @@ def test_compile_link_autotune_matmul():
 
         tile_sizes = [
             [16, 16, 16],
-            [32, 32, 16],
-            [32, 32, 32],
             [64, 64, 32],
         ]
 
         for ts in tile_sizes:
             BM, BN, BK = ts[0], ts[1], ts[2]
-            compile_aot_kernels(tmp_dir, kernel_path, dtype, BM, BN, BK, ha_hb_hints=["", ":16"])
+            compile_aot_kernels(tmp_dir, kernel_path, dtype, BM, BN, BK, ha_hb_hints=[(":16", ":16"), (":16", ""),
+                                                                                      ("", ":16")])
 
         link_aot_kernels(tmp_dir)