Commit 7ad67a0
[GEMM] Add autotune configs of num_warps=64 (#3297)
The CI run at https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/13004109008 shows that the added configs are selected as the best autotune configs for some input shapes. The geomean performance gain is under 2%; the biggest per-shape improvement is 12%. Breakdown of the improvement per input shape:

B | M | N | K | ratio
-- | -- | -- | -- | --
1 | 1 | 13824 | 5120 | 1.07916
1 | 4 | 12288 | 4096 | 1.124677
1 | 512 | 8192 | 8192 | 1.081075
1 | 512 | 8192 | 32768 | 1.02523
1 | 8192 | 1024 | 16384 | 1.028267
1 | 16384 | 1024 | 8192 | 1.094302
1 | 16384 | 8192 | 1024 | 1.051715
4 | 32768 | 128 | 4096 | 1.019839
4 | 32768 | 4096 | 128 | 1.023059
32 | 4096 | 128 | 4096 | 1.019211

Signed-off-by: Whitney Tsang <[email protected]>
1 parent 393f700 commit 7ad67a0
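For illustration, the sketch below (not part of the commit) expands the first rewritten comprehension from the diff that follows. On the default path, i.e. when TRITON_INTEL_ADVANCED_PATH is unset or '0', each num_stages value now yields two configs: the pre-existing ('large', num_warps=32) variant and the new ('small', num_warps=64) one; the advanced path keeps only the former. The sketch assumes triton is importable.

    import os

    import triton

    # (grf_mode, num_warps) variants, mirroring the conditional in the diff:
    # the new ('small', 64) variant is only generated on the default path.
    variants = ([('large', 32), ('small', 64)]
                if os.getenv('TRITON_INTEL_ADVANCED_PATH', '0') == '0'
                else [('large', 32)])

    configs = [
        triton.Config(
            {'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 4, 'grf_mode': m},
            num_stages=s, num_warps=w)
        for s in [2, 3, 4] for (m, w) in variants
    ]

    # Default path: 3 stage counts x 2 variants = 6 configs; advanced path: 3.
    print(len(configs))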

File tree: 1 file changed, +9 −9 lines

benchmarks/triton_kernels_benchmark/gemm_benchmark.py

Lines changed: 9 additions & 9 deletions
@@ -26,17 +26,17 @@
         {'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'grf_mode': 'large'},
         num_stages=s, num_warps=32) for s in [1, 2, 3]
     ] + [
-        triton.Config(
-            {'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'grf_mode': 'large'},
-            num_stages=s, num_warps=32) for s in [2, 3, 4]
+        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'grf_mode': m},
+                      num_stages=s, num_warps=w) for s in [2, 3, 4] for (m, w) in
+        ([('large', 32), ('small', 64)] if os.getenv('TRITON_INTEL_ADVANCED_PATH', '0') == '0' else [('large', 32)])
     ] + [
         triton.Config(
             {'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'grf_mode': 'large'},
             num_stages=s, num_warps=32) for s in [2]
     ] + [
-        triton.Config(
-            {'BLOCK_SIZE_M': 8, 'BLOCK_SIZE_N': 512, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 1, 'grf_mode': 'large'},
-            num_stages=s, num_warps=32) for s in [2, 3]
+        triton.Config({'BLOCK_SIZE_M': 8, 'BLOCK_SIZE_N': 512, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 1, 'grf_mode': m},
+                      num_stages=s, num_warps=w) for s in [2, 3] for (m, w) in
+        ([('large', 32), ('small', 64)] if os.getenv('TRITON_INTEL_ADVANCED_PATH', '0') == '0' else [('large', 32)])
     ],
     key=['M', 'N', 'K'],
 )
@@ -91,9 +91,9 @@ def matmul_kernel_with_block_pointers(
         {'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'grf_mode': 'large'},
         num_stages=s, num_warps=32) for s in [2, 3]
     ] + [
-        triton.Config(
-            {'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'grf_mode': 'large'},
-            num_stages=s, num_warps=32) for s in [2]
+        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'grf_mode': m},
+                      num_stages=s, num_warps=w) for s in [2] for (m, w) in
+        ([('large', 32), ('small', 64)] if os.getenv('TRITON_INTEL_ADVANCED_PATH', '0') == '0' else [('large', 32)])
     ] + [
         triton.Config(
             {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 1024, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_M': 4, 'grf_mode': 'large'},
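Pairing the new num_warps=64 configs with grf_mode 'small' rather than 'large' is consistent with how Intel GPUs allocate the register file: large GRF mode halves the number of hardware threads available per Xe core, so the higher warp count is presumably only schedulable in small GRF mode. On the advanced path (TRITON_INTEL_ADVANCED_PATH set to a non-'0' value) the config space is unchanged.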
