[FA Performance] Add configurations to FA auto-tuner (#3725)

Maxime France-Pillois · web-flow · commit 902fd39dc7fa · 2025-03-21T11:23:02.000-04:00
Enhance the Flash Attention (FA) auto-tuner to evaluate more configurations
(including CUTLASS configurations) by adding `num_stages=2` to the search space.
diff --git a/benchmarks/triton_kernels_benchmark/flash_attention_benchmark.py b/benchmarks/triton_kernels_benchmark/flash_attention_benchmark.py
@@ -157,7 +157,7 @@ def _attn_fwd(Q, K, V, sm_scale, M, Out,  #
     triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN, 'grf_mode': 'large', 'one_matrix_per_load_for_bt': True}, num_stages=s, num_warps=w) \
     for BM in [128, 256] \
     for BN in [32, 64] \
-    for s in [3, 4] \
+    for s in [2, 3, 4] \
     for w in [8, 16, 32] \
     ]
 

Original file line number	Diff line number	Diff line change
`@@ -157,7 +157,7 @@ def _attn_fwd(Q, K, V, sm_scale, M, Out, #`
`157`	`157`	`triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN, 'grf_mode': 'large', 'one_matrix_per_load_for_bt': True}, num_stages=s, num_warps=w) \`
`158`	`158`	`for BM in [128, 256] \`
`159`	`159`	`for BN in [32, 64] \`
`160`		`- for s in [3, 4] \`
	`160`	`+ for s in [2, 3, 4] \`
`161`	`161`	`for w in [8, 16, 32] \`
`162`	`162`	`]`
`163`	`163`