
Commit 6f89dbe

[FA] Specify large GRF in autotune (#2410)

By specifying the GRF mode explicitly, the number of autotuning runs can be reduced.

CI: https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/11148146767
Signed-off-by: Whitney Tsang <[email protected]>
Parent: 53b1198

1 file changed (+1, -1):

benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py

@@ -153,7 +153,7 @@ def _attn_fwd(Q, K, V, sm_scale, M, Out, #
 configs = [
-    triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN}, num_stages=s, num_warps=w) \
+    triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN, 'grf_mode': 'large'}, num_stages=s, num_warps=w) \
     for BM in [256] \
     for BN in [32, 64] \
     for s in [3] \
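For illustration, below is a minimal, hypothetical sketch of how a config list like the one in this diff plugs into a Triton autotuned kernel. Only the 'grf_mode': 'large' entry mirrors the change above; the kernel body, its signature, the autotune key 'N_CTX', and the warp counts are placeholders that are not taken from this commit.

import triton
import triton.language as tl

# Sketch only: the config list mirrors the shape of the one in the diff above.
# Per the commit message, fixing 'grf_mode' to 'large' up front reduces the
# number of autotuning runs on the Intel XPU backend.
configs = [
    triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN, 'grf_mode': 'large'},
                  num_stages=s, num_warps=w)
    for BM in [256]
    for BN in [32, 64]
    for s in [3]
    for w in [8, 16]  # placeholder warp counts; the real values are not shown in the diff
]

@triton.autotune(configs=configs, key=['N_CTX'])  # 'N_CTX' is a placeholder autotune key
@triton.jit
def _attn_fwd_sketch(Q, Out, N_CTX,  # simplified signature, not the real _attn_fwd
                     BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    # Placeholder body: copies BLOCK_M elements of Q into Out; the real
    # flash-attention forward pass is omitted, and BLOCK_N is unused here.
    pid = tl.program_id(0)
    offs = pid * BLOCK_M + tl.arange(0, BLOCK_M)
    mask = offs < N_CTX
    tl.store(Out + offs, tl.load(Q + offs, mask=mask), mask=mask)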
