@@ -250,6 +250,19 @@ jobs:
250250 if : ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py') }}
251251 run : |
252252 cd benchmarks/triton_kernels_benchmark
253+ TRITON_INTEL_DISABLE_LARGE_BLOCK_SIZE_IO_FOR_TRANS_DOT_B=1 \
254+ python flash_attention_fwd_benchmark.py --reports $REPORTS
255+
256+ source ../../scripts/capture-hw-details.sh
257+ python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
258+ python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
259+
260+ - name : Run Triton FA kernel benchmark (+ reduction)
261+ if : ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py') }}
262+ run : |
263+ cd benchmarks/triton_kernels_benchmark
264+ TRITON_INTEL_DISABLE_LARGE_BLOCK_SIZE_IO_FOR_TRANS_DOT_B=1 \
265+ TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY=1 \
253266 python flash_attention_fwd_benchmark.py --reports $REPORTS
254267
255268 source ../../scripts/capture-hw-details.sh
@@ -261,6 +274,7 @@ jobs:
261274 run : |
262275 cd benchmarks/triton_kernels_benchmark
263276 TRITON_INTEL_ADVANCED_PATH=0 \
277+ TRITON_INTEL_DISABLE_LARGE_BLOCK_SIZE_IO_FOR_TRANS_DOT_B=1 \
264278 TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
265279 IGC_VISAOptions=" -enableBCR" \
266280 python flash_attention_fwd_benchmark.py --reports $REPORTS
0 commit comments