
Commit f737aee

yudongsi, chengjunlu, and whitneywhtsang authored
Use well tuned kernel options for flex attention (#4484)
Geomean speedup is 1.45x on PVC max1100.

![image](https://github.com/user-attachments/assets/69085d53-ba75-43ad-b3e8-dfb87516b47c)

Signed-off-by: Lu,Chengjun <[email protected]>
Co-authored-by: Lu,Chengjun <[email protected]>
Co-authored-by: Whitney Tsang <[email protected]>
1 parent 38a1984 commit f737aee

File tree

3 files changed (+51, -1 lines)


benchmarks/triton_kernels_benchmark/flex_attention_benchmark_causal_mask.py

Lines changed: 27 additions & 1 deletion
@@ -9,9 +9,35 @@
 
 import torch
 import torch.nn.functional as F
+import torch._inductor
+import torch._inductor.lowering
+import torch._inductor.kernel
+import torch._inductor.kernel.flex_attention as flex_attn
+import torch._inductor.virtualized
 
 import triton_kernels_benchmark as benchmark_suit
 
+# Use TORCHINDUCTOR_MAX_AUTOTUNE_GEMM=1 or uncomment the following line to print the auto-tune results.
+# torch._inductor.config.max_autotune_gemm = True
+
+
+def get_xpu_config(*args, **kwargs):  # pylint: disable=unused-argument
+    # BLOCK_M, BLOCK_N, num_warps, num_stages
+    configs = [
+        (32, 16, 4, 2),
+        (128, 64, 16, 2),
+        (128, 64, 8, 2),
+        (128, 32, 16, 2),
+        (128, 32, 8, 2),
+    ]
+    return configs
+
+
+# There is an auto-tuning requirement to get the best configuration for flex attention.
+# PyTorch flex attention doesn't support user-driven auto-tuning by default.
+# Override the _get_xpu_config method to provide custom configurations for auto-tuning on XPU.
+flex_attn._get_xpu_config = get_xpu_config  # pylint: disable=protected-access
+
 torch._dynamo.config.recompile_limit = 100  # pylint: disable=protected-access
 
 # Compile the flex_attention function
@@ -112,7 +138,7 @@ def benchmark(Z, H_q, H_kv, N_CTX_q, N_CTX_kv, D_HEAD_qk, D_HEAD_v, MODE, provid
         _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(torch_fn, n_warmup=10, n_repeat=10, quantiles=quantiles)
 
     elif provider == 'triton':
-        kernel_options = {'num_stages': 2, 'num_warps': 16 if D_HEAD_qk == 128 else 8, 'BLOCKS_ARE_CONTIGUOUS': True}
+        kernel_options = {'BLOCKS_ARE_CONTIGUOUS': True}
         triton_fn = lambda: compiled_flex_attention(q, k, v, block_mask=block_mask, scale=sm_scale, enable_gqa=(
             not H_q == H_kv), kernel_options=kernel_options)
         if MODE == 'bwd':
scripts/patch-pytorch.sh

Lines changed: 1 addition & 0 deletions
@@ -37,3 +37,4 @@ echo "Applying PyTorch patches in $REPO_ROOT"
 # put your patch applies here
 apply_patch https://github.com/pytorch/pytorch/pull/143553.diff
 apply_patch pytorch_fp64.patch
+apply_patch ./patch/Patch_torch_flex_attention_for_autotune_in_benchmark.patch
scripts/patch/Patch_torch_flex_attention_for_autotune_in_benchmark.patch

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+Subject: [PATCH] Patch torch flex attention for autotune in benchmark
+---
+Index: torch/_inductor/kernel/flex_attention.py
+IDEA additional info:
+Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
+<+>UTF-8
+===================================================================
+diff --git a/torch/_inductor/kernel/flex_attention.py b/torch/_inductor/kernel/flex_attention.py
+--- a/torch/_inductor/kernel/flex_attention.py	(revision 71e4cab58c04534b7608b4b01685180797271407)
++++ b/torch/_inductor/kernel/flex_attention.py	(date 1749737580817)
+@@ -1643,7 +1643,11 @@
+
+     choices: list[Any] = []
+     configs: list[tuple[int, int, int, int]] = []
+-    configs.append(_get_default_config_fwd(query))
++    default_configs = _get_default_config_fwd(query)
++    if isinstance(default_configs, tuple):
++        configs.append(default_configs)
++    else:
++        configs.extend(default_configs)
+     if config.max_autotune:
+         configs += [
+             (128, 64, 4, 3),