# This benchmark requires a PyTorch version with FlexAttention support for XPU available.
import os
from functools import lru_cache

import torch
import torch.nn.functional as F
from torch.nn.attention.flex_attention import (
    create_block_mask,
    flex_attention,
)

import triton_kernels_benchmark as benchmark_suit
from triton_kernels_benchmark import xetla_kernel

# Compile flex_attention: the eager entry point relies on torch.compile to generate the
# fused attention kernels; dynamic=False specializes the compiled graph to static shapes.
flex_attention = torch.compile(flex_attention, dynamic=False)
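# Optional sanity check (not part of the original benchmark): on recent PyTorch builds,
# torch.xpu.is_available() can be queried to confirm an XPU device is visible before running
# this script.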


# lru_cache memoizes the BlockMask for a given (mask_mod, B, H, M, N, device) combination,
# so the mask is built only once per unique shape instead of on every benchmark call.
@lru_cache
def create_block_mask_cached(mask_mod, B, H, M, N, device='xpu'):
    return create_block_mask(mask_mod, B, H, M, N, device=device)


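# Mask modifier used by create_block_mask/flex_attention: it is evaluated per
# (batch, head, q_idx, kv_idx) position, and a True return value keeps that position.
# q_idx >= kv_idx keeps the lower triangle, i.e. standard causal attention. For example,
# with a sequence length of 4 the kept positions are:
#
#     kv_idx:   0      1      2      3
#   q_idx 0 [ True, False, False, False]
#   q_idx 1 [ True,  True, False, False]
#   q_idx 2 [ True,  True,  True, False]
#   q_idx 3 [ True,  True,  True,  True]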
def causal_mask(_, __, q_idx, kv_idx):
    return q_idx >= kv_idx


# Kernel profiling for backward mode does not work as expected.
# For details see: https://github.com/pytorch/pytorch/issues/144778
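# The sweep below covers three shape families, each run in the mode selected by the
# FA_KERNEL_MODE environment variable ('fwd' by default, 'bwd' for backward):
#   * batch sizes 1-32 at a fixed token budget (Z * N_CTX = 16384) with (H, D_HEAD) in
#     {(16, 128), (32, 64)},
#   * a single (Z=4, H=48, N_CTX=1024, D_HEAD=64) configuration,
#   * batch sizes 1-64 at N_CTX = 1024 with (H, D_HEAD) in {(8, 128), (32, 96), (4, 128)}.
# All configurations use a causal mask.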
@benchmark_suit.perf_report(
    benchmark_suit.Benchmark(
        x_names=['Z', 'H', 'N_CTX', 'D_HEAD', 'CAUSAL', 'MODE'],
        x_vals=[[z, h, 16384 // z, dhead, causal, mode]
                for z in [1, 2, 4, 8, 16, 32]
                for (h, dhead) in [(16, 128), (32, 64)]
                for causal in [True]
                for mode in [os.getenv('FA_KERNEL_MODE', 'fwd')]]  #
        + [[4, 48, 1024, 64, True, mode] for mode in [os.getenv('FA_KERNEL_MODE', 'fwd')]]  #
        + [[z, h, 1024, dhead, True, mode]
           for z in [1, 2, 4, 8, 16, 32, 64]
           for (h, dhead) in [(8, 128), (32, 96), (4, 128)]
           for mode in [os.getenv('FA_KERNEL_MODE', 'fwd')]],
        line_arg='provider',
        line_vals=['triton', 'xetla'],
        line_names=['Triton', 'XeTLA'],
        styles=[('green', '-'), ('green', '--'), ('blue', '-'), ('blue', '--')],
        ylabel=['GB/s', 'TFlops'],
        plot_name='flexAttnCausal-performance',
        args={},
    ))
def benchmark(Z, H, N_CTX, D_HEAD, CAUSAL, MODE, provider):
    assert MODE in ['fwd', 'bwd']
    assert CAUSAL
    dtype = torch.float16
    q = torch.randn((Z, H, N_CTX, D_HEAD), device='xpu', dtype=dtype, requires_grad=True)
    k = torch.randn((Z, H, N_CTX, D_HEAD), device='xpu', dtype=dtype, requires_grad=True)
    v = torch.randn((Z, H, N_CTX, D_HEAD), device='xpu', dtype=dtype, requires_grad=True)
    sm_scale = 0.125
    if MODE == 'bwd':
        sm_scale = 1.3

    # Quantiles reported by do_bench: median (0.5), minimum (0.0) and maximum (1.0).
    quantiles = [0.5, 0.0, 1.0]
    if provider == 'triton':
        block_mask = create_block_mask_cached(causal_mask, 1, 1, N_CTX, N_CTX, device=q.device)
        triton_fn = lambda: flex_attention(q, k, v, block_mask=block_mask, scale=sm_scale)
        if MODE == 'bwd':
            triton_o = triton_fn()
            triton_do = torch.randn_like(triton_o)
            triton_fn = lambda: triton_o.backward(triton_do, retain_graph=True)
        # Eager scaled_dot_product_attention on the CPU serves as the accuracy reference.
        torch_fn = lambda: F.scaled_dot_product_attention(q.cpu(), k.cpu(), v.cpu(), is_causal=True, scale=sm_scale).to(
            torch.float32)
        if MODE == 'bwd':
            torch_o = torch_fn()
            torch_do = torch.randn_like(torch_o)
            torch_fn = lambda: torch_o.backward(torch_do, retain_graph=True)
        if MODE == 'fwd':
            atol = 1e-1 if N_CTX == 16384 else 1e-2
            benchmark_suit.assert_close(triton_fn, torch_fn, atol=atol, rtol=1e-3, err_msg='triton to torch')
        else:
            benchmark_suit.assert_close(lambda: triton_o, lambda: torch_o, atol=1e-2, rtol=0, err_msg='triton to torch')
        _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10, quantiles=quantiles)

    elif provider == 'xetla':
        xetla_fn = None
        if MODE == 'fwd':
            module_name = 'flash_attn_causal_True'.lower()
            func = getattr(xetla_kernel, module_name)
            out = torch.empty_like(q, device='xpu', dtype=dtype)
            size_score = Z * H * N_CTX * N_CTX
            size_attn_mask = Z * N_CTX * N_CTX
            dropout_mask = torch.empty((size_score, ), device='xpu', dtype=torch.uint8)
            bias = torch.empty((size_attn_mask, ), device='xpu', dtype=dtype)
            size_ml = Z * H * N_CTX
            m = torch.empty((size_ml, ), device='xpu', dtype=torch.float)
            l = torch.empty((size_ml, ), device='xpu', dtype=torch.float)
            xetla_fn = lambda: func(q, k, v, out, dropout_mask, bias, m, l, Z, H, D_HEAD, N_CTX, N_CTX, sm_scale)
        if MODE == 'bwd':
            module_name = 'flash_attn_bwd_causal_True'.lower()
            func = getattr(xetla_kernel, module_name)
            grad_out = torch.empty_like(q, device='xpu', dtype=dtype, requires_grad=True)
            bias = torch.empty_like(q, device='xpu', dtype=dtype, requires_grad=True)
            dropout = torch.empty_like(q, device='xpu', dtype=torch.uint8)
            out = torch.empty_like(q, device='xpu', dtype=dtype, requires_grad=True)
            log_sumexp = torch.zeros(q.size(), device='xpu', dtype=dtype, requires_grad=True)
            workspace = torch.zeros(q.size(), device='xpu', dtype=dtype, requires_grad=True)
            grad_q_tmp = torch.zeros(q.size(), device='xpu', dtype=dtype, requires_grad=True)
            alpha = sm_scale
            dropout_prob = 0
            grad_query = torch.empty_like(q, device='xpu', dtype=dtype, requires_grad=True)
            grad_key = torch.empty_like(k, device='xpu', dtype=dtype, requires_grad=True)
            grad_value = torch.empty_like(v, device='xpu', dtype=dtype, requires_grad=True)
            grad_bias = torch.empty_like(bias, device='xpu', dtype=dtype, requires_grad=True)
            bias_strideB = -1
            bias_strideN = -1
            bias_strideF = -1
            attn_mask_padding = 0

            xetla_fn = lambda: func(grad_out, q, k, v, bias, dropout, out, log_sumexp, workspace, grad_q_tmp, alpha,
                                    dropout_prob, grad_query, grad_key, grad_value, grad_bias, Z, H, D_HEAD, N_CTX,
                                    N_CTX, bias_strideB, bias_strideN, bias_strideF, attn_mask_padding)
        _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(xetla_fn, n_warmup=10, n_repeat=10, quantiles=quantiles)

    else:
        raise NotImplementedError(f'Unsupported provider {provider}')

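    # Reported throughput is derived from the mean/min/max kernel time:
    #   * TFLOPS: attention performs two N_CTX x N_CTX x D_HEAD matmuls (Q @ K^T and P @ V)
    #     per batch and head, at 2 FLOPs per multiply-accumulate, i.e.
    #     2 * 2 * Z * H * N_CTX * N_CTX * D_HEAD operations (the causal structure is not
    #     discounted).
    #   * GB/s: reading/writing the four N_CTX x D_HEAD tensors (Q, K, V and the output) per
    #     batch and head in float16, i.e. 2 bytes per element.
    # The backward pass is credited with 2.5x the forward work, a common flash-attention
    # accounting convention.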
    tflops = lambda mean: 2 * 2 * Z * H * N_CTX * N_CTX * D_HEAD * (1e-12) / (mean * 1e-3)
    gbps = lambda mean: Z * H * (N_CTX * D_HEAD + N_CTX * D_HEAD) * 2 * 2 * (1e-9) / (mean * 1e-3)

    if MODE == 'bwd':
        tflops = lambda mean: 2.5 * 2 * 2 * Z * H * N_CTX * N_CTX * D_HEAD * (1e-12) / (mean * 1e-3)
        gbps = lambda mean: 2.5 * Z * H * (N_CTX * D_HEAD + N_CTX * D_HEAD) * 2 * 2 * (1e-9) / (mean * 1e-3)

    return (gbps(mean), gbps(max_ms), gbps(min_ms)), (tflops(mean), tflops(max_ms), tflops(min_ms)), cv


if __name__ == '__main__':
    benchmark.run(show_plots=False, print_data=True)
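# Example invocations (the file name below is illustrative; adjust it to wherever this
# script lives in the benchmark suite):
#   python flex_attention_benchmark_causal_mask.py                     # forward pass (default)
#   FA_KERNEL_MODE=bwd python flex_attention_benchmark_causal_mask.py  # backward pass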