
Commit 8bb917f

[FA] Update autotune configs on default path (#2475)
This PR allows autotune configurations on the default path that are equivalent to the one used on the advanced path, where `BLOCK_M` is 128 and `num_warps` can be `8` or `16`. It yields a 4% performance improvement on the geomean of FA out of the box. Signed-off-by: Whitney Tsang <[email protected]>
1 parent b102bf3


benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py

Lines changed: 3 additions & 3 deletions
@@ -154,10 +154,10 @@ def _attn_fwd(Q, K, V, sm_scale, M, Out, #
 
 configs = [
     triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN, 'grf_mode': 'large'}, num_stages=s, num_warps=w) \
-    for BM in [256] \
+    for BM in [128, 256] \
     for BN in [32, 64] \
-    for s in [3] \
-    for w in [32] \
+    for s in [3, 4] \
+    for w in [8, 16, 32] \
 ]
 
 tuner = triton.autotune(configs, key=['N_CTX', 'BLOCK_DMODEL'])
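
For context, here is a minimal runnable sketch of the expanded search space this diff produces (assuming Triton is installed; `grf_mode` is an Intel-specific config key taken verbatim from the diff):

import triton

# Autotune search space after this change; the comprehension mirrors
# the updated diff above.
configs = [
    triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN, 'grf_mode': 'large'},
                  num_stages=s, num_warps=w)
    for BM in [128, 256]
    for BN in [32, 64]
    for s in [3, 4]
    for w in [8, 16, 32]
]

# 2 (BM) * 2 (BN) * 2 (s) * 3 (w) = 24 candidate configs, up from the
# 1 * 2 * 1 * 1 = 2 configs before this change, now including the
# advanced-path shape: BLOCK_M = 128 with num_warps of 8 or 16.
print(len(configs))  # 24

Because the tuner is keyed on `['N_CTX', 'BLOCK_DMODEL']`, the best of these candidates is cached per (sequence length, head dimension) pair and re-benchmarked only when those values change.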
