Commit c74b88e

[FA] Add option to specify tuning parameters (#3293)
Add an option to specify tuning parameters. Users can override the default parameters by passing a list of values to autotune from; for example, passing `-BLOCK-M 64 32 128` means those values of `BLOCK_M` are used for autotuning. Also split the options into two argument groups so the help string looks something like:

```
usage: flash-attention [-h] -Z Z -H H -N-CTX N_CTX -D-HEAD D_HEAD [-causal] [-backward]
                       [-BLOCK-M BLOCK_M [BLOCK_M ...]] [-BLOCK-N BLOCK_N [BLOCK_N ...]]
                       [-stages STAGES [STAGES ...]] [-warps WARPS [WARPS ...]]

Run Intel XPU Flash-Attention implementation

options:
  -h, --help            show this help message and exit

Model description:
  Options setting different model metaparameters

  -Z Z                  Batch size
  -H H                  Head count
  -N-CTX N_CTX          Sequence length
  -D-HEAD D_HEAD        Embedding dimension
  -causal               Run causal attention
  -backward             Run backward attention

Tuning configuration:
  Options setting different tuning parameters

  -BLOCK-M BLOCK_M [BLOCK_M ...]
                        Sizes of M
  -BLOCK-N BLOCK_N [BLOCK_N ...]
                        Sizes of N
  -stages STAGES [STAGES ...]
                        Numbers of stages
  -warps WARPS [WARPS ...]
                        Numbers of warps
```

---------

Signed-off-by: victor-eds <[email protected]>
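A quick illustration (not part of the commit) of how argparse handles these flags: the option string `-BLOCK-M` becomes the attribute `BLOCK_M` (the leading dash is stripped and internal dashes become underscores), and `action='extend'` with `nargs='+'` collects every supplied value into a single list.

```python
import argparse

# Standalone sketch of the flag handling added in scripts/flash_attention.py:
# argparse derives the dest 'BLOCK_M' from the option string '-BLOCK-M', and
# action='extend' with nargs='+' gathers all supplied values into one list.
parser = argparse.ArgumentParser(prog='flash-attention')
parser.add_argument('-BLOCK-M', action='extend', nargs='+', type=int, help='Sizes of M')

options = parser.parse_args(['-BLOCK-M', '64', '32', '128'])
print(options.BLOCK_M)  # [64, 32, 128]
```

An invocation of the script itself would then look something like `python scripts/flash_attention.py -Z 1 -H 16 -N-CTX 1024 -D-HEAD 64 -BLOCK-M 64 32 128` (the model values here are illustrative, not taken from the commit).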
1 parent 0234248 commit c74b88e

File tree

1 file changed: scripts/flash_attention.py (37 additions, 7 deletions)
```diff
@@ -3,22 +3,49 @@
 import argparse
 
 import torch
+import triton
 
-from triton_kernels_benchmark.flash_attention_benchmark import _attention
+from triton_kernels_benchmark.flash_attention_benchmark import _attention, tune_attn_fwd
 
 
 def get_options():
     """Gather CL options."""
     parser = argparse.ArgumentParser(prog='flash-attention', description='Run Intel XPU Flash-Attention implementation')
-    parser.add_argument('-Z', type=int, required=True, help='Batch size')
-    parser.add_argument('-H', type=int, required=True, help='Head count')
-    parser.add_argument('-N-CTX', type=int, required=True, help='Sequence length')
-    parser.add_argument('-D-HEAD', type=int, required=True, help='Embedding dimension')
-    parser.add_argument('-causal', action='store_true', help='Run causal attention')
-    parser.add_argument('-backward', action='store_true', help='Run backward attention')
+
+    model = parser.add_argument_group(title='Model description',
+                                      description='Options setting different model metaparameters')
+    model.add_argument('-Z', type=int, required=True, help='Batch size')
+    model.add_argument('-H', type=int, required=True, help='Head count')
+    model.add_argument('-N-CTX', type=int, required=True, help='Sequence length')
+    model.add_argument('-D-HEAD', type=int, required=True, help='Embedding dimension')
+    model.add_argument('-causal', action='store_true', help='Run causal attention')
+    model.add_argument('-backward', action='store_true', help='Run backward attention')
+
+    config = parser.add_argument_group(title='Tuning configuration',
+                                       description='Options setting different tuning parameters')
+    config.add_argument('-BLOCK-M', action='extend', nargs='+', type=int, help='Sizes of M')
+    config.add_argument('-BLOCK-N', action='extend', nargs='+', type=int, help='Sizes of N')
+    config.add_argument('-stages', action='extend', nargs='+', type=int, help='Numbers of stages')
+    config.add_argument('-warps', action='extend', nargs='+', type=int, help='Numbers of warps')
     return parser.parse_args()
 
 
+def get_configs(options):
+    """Get autotuning configurations."""
+    bm_values = options.BLOCK_M if options.BLOCK_M else [128, 256]
+    bn_values = options.BLOCK_N if options.BLOCK_N else [32, 64]
+    stages_values = options.stages if options.stages else [3, 4]
+    warps_values = options.warps if options.warps else [8, 16, 32]
+    return [
+        triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN, 'grf_mode': 'large', 'one_matrix_per_load_for_bt': True},
+                      num_stages=s, num_warps=w)
+        for BM in bm_values
+        for BN in bn_values
+        for s in stages_values
+        for w in warps_values
+    ]
+
+
 def run(options):
     """Run the XPU backend FlashAttention benchmark implementation."""
     dtype = torch.float16
@@ -27,6 +54,9 @@ def run(options):
     k = torch.randn_like(q, device='xpu', dtype=dtype, requires_grad=True)
     v = torch.randn_like(q, device='xpu', dtype=dtype, requires_grad=True)
     sm_scale = 0.125
+
+    tune_attn_fwd.configs = get_configs(options)
+
     attention = _attention.apply
     triton_o = attention(q, k, v, options.causal, sm_scale)
     if options.backward:
```
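For context on how the new `get_configs` output is consumed: the assignment `tune_attn_fwd.configs = get_configs(options)` suggests `tune_attn_fwd` is a kernel wrapped by `triton.autotune`, whose Autotuner object keeps its search space in a `configs` attribute. The sketch below uses a hypothetical toy kernel (not code from this commit) to illustrate the same override pattern, assuming a Triton version where the Autotuner exposes `configs` that way.

```python
import triton
import triton.language as tl

# Toy stand-in for tune_attn_fwd (hypothetical kernel, for illustration only).
@triton.autotune(
    configs=[triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64}, num_stages=3, num_warps=8)],
    key=['N_CTX'],
)
@triton.jit
def toy_kernel(x_ptr, N_CTX, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    # Body omitted; only the autotuning plumbing matters here.
    pass

# @triton.autotune wraps the kernel in an Autotuner object; replacing its
# configs list before the first launch swaps out the autotuning search space,
# which is the pattern run() relies on with tune_attn_fwd.configs = get_configs(options).
toy_kernel.configs = [
    triton.Config({'BLOCK_M': bm, 'BLOCK_N': bn}, num_stages=s, num_warps=w)
    for bm in (64, 32, 128)
    for bn in (32, 64)
    for s in (3, 4)
    for w in (8, 16)
]
```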
