Commit b0acd14

[CI] Run Flex Attention with batch size 4 (#4913)
Flex Attention with batch size 16 (Torch implementation) fails on BMG: https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/17038905956/job/48297736887. This PR skips running batch size 16 on BMG. To keep tracking the performance of Flex Attention with a batch size greater than 1, this PR adds a run with batch size 4, which can be removed once batch size 16 is fixed.

Signed-off-by: Whitney Tsang <[email protected]>
1 parent b58720a commit b0acd14

2 files changed: +12 −1 lines changed


.github/workflows/triton-benchmarks-bmg.yml

Lines changed: 1 addition & 1 deletion
@@ -15,4 +15,4 @@ jobs:
     uses: ./.github/workflows/triton-benchmarks.yml
     with:
       runner_label: b580
-      skip_benchmarks: "[]"
+      skip_benchmarks: "['flex_attention_benchmark_batch16-causal_mask.py']"
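The skip_benchmarks string set here is parsed with fromJson inside the reusable workflow's per-step if: conditions (see the next file), so listing a script name disables that step on the b580 (BMG) runner. For context, below is a minimal sketch of how the receiving side is assumed to declare these inputs in triton-benchmarks.yml; the input names and the empty-list fallback match the expressions in the diff, but the exact declaration is an assumption, not the actual file contents:

# Sketch only: assumed workflow_call input declarations for triton-benchmarks.yml.
on:
  workflow_call:
    inputs:
      runner_label:
        type: string
      benchmarks:
        # Empty string means "run all benchmarks"; otherwise only the listed
        # scripts run (checked via contains(fromJson(inputs.benchmarks || '[]'), ...)).
        type: string
        default: ""
      skip_benchmarks:
        # Stringified list of benchmark scripts to skip, e.g.
        # "['flex_attention_benchmark_batch16-causal_mask.py']" as set above for BMG.
        type: string
        default: "[]"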

.github/workflows/triton-benchmarks.yml

Lines changed: 11 additions & 0 deletions
@@ -312,6 +312,17 @@ jobs:
           python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-triton-report.csv --benchmark flex-attn-causal --compiler triton --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
           python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-torch-report.csv --benchmark flex-attn-causal --compiler torch --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Torch-TFlops --hbm_col "Torch-GB/s" --tag $TAG
 
+      - name: Run Triton FlexAttention (batch_size=4) Causal Mask fwd kernel benchmark
+        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_batch4-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_batch4-causal_mask.py') }}
+        run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
+          cd benchmarks/triton_kernels_benchmark
+          BATCH_SIZE=4 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS
+
+          source ../../scripts/capture-hw-details.sh
+          python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-batch4-triton-report.csv --benchmark flex-attn-causal-batch4 --compiler triton --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
+          python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-batch4-torch-report.csv --benchmark flex-attn-causal-batch4 --compiler torch --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Torch-TFlops --hbm_col "Torch-GB/s" --tag $TAG
+
       - name: Run Triton FlexAttention (batch_size=16) Causal Mask fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_batch16-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_batch16-causal_mask.py') }}
         run: |
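The new batch-4 step mirrors the batch-16 step that follows it in the file: only the BATCH_SIZE environment variable, the benchmark name (flex-attn-causal-batch4) and the report file names differ. As a rough sketch, the same command could be run outside CI as shown below; REPORTS, N_RUNS and PTI_LIBS_DIR are normally provided by the workflow environment, so the values here are illustrative placeholders and assume an already-built Triton XPU environment:

# Illustrative local reproduction of the new step (values are placeholders).
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH   # PTI_LIBS_DIR comes from the CI install step
cd benchmarks/triton_kernels_benchmark
REPORTS=/tmp/flexattn-reports    # placeholder; CI passes its own reports directory
N_RUNS=1                         # placeholder; CI sets the number of runs
mkdir -p "$REPORTS"
BATCH_SIZE=4 python flex_attention_benchmark_causal_mask.py --reports "$REPORTS" --n_runs "$N_RUNS"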
