@@ -250,17 +250,32 @@ jobs:
250250 if : ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py') }}
251251 run : |
252252 cd benchmarks/triton_kernels_benchmark
253+ TRITON_INTEL_DISABLE_LARGE_BLOCK_SIZE_IO_FOR_TRANS_DOT_B=1 \
253254 python flash_attention_fwd_benchmark.py --reports $REPORTS
254255
255256 source ../../scripts/capture-hw-details.sh
256257 python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
257258 python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
258259
# Second run of the flash-attention forward benchmark, this time with
# TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY=1 set in addition to
# TRITON_INTEL_DISABLE_LARGE_BLOCK_SIZE_IO_FOR_TRANS_DOT_B=1.
# It shares the 'flash_attention_fwd_benchmark.py' entry in inputs.skip_benchmarks
# with the plain flash-attention step, so skipping one skips both.
- name: Run Triton FA kernel benchmark (+ reduction)
  if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py') }}
  run: |
    cd benchmarks/triton_kernels_benchmark
    TRITON_INTEL_DISABLE_LARGE_BLOCK_SIZE_IO_FOR_TRANS_DOT_B=1 \
    TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY=1 \
    python flash_attention_fwd_benchmark.py --reports "$REPORTS"

    # Tag the rows so this configuration can be told apart from the plain run.
    TAG="${TAG}-reduction"
    source ../../scripts/capture-hw-details.sh

    # Write to dedicated "-reduction-" report files: the plain flash-attention
    # step already produced attn-triton-report.csv / attn-xetla-report.csv in
    # the same $REPORTS directory, and reusing those names would overwrite them.
    python ../../scripts/build_report.py \
      "$REPORTS/attn-performance.csv" "$REPORTS/attn-reduction-triton-report.csv" \
      --benchmark attn --compiler triton \
      --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" \
      --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"
    python ../../scripts/build_report.py \
      "$REPORTS/attn-performance.csv" "$REPORTS/attn-reduction-xetla-report.csv" \
      --benchmark attn --compiler xetla \
      --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" \
      --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag "$TAG"

272+
259273 - name : Run Triton FA kernel benchmark - default path
260274 if : ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmark || '[]'), 'flash_attention_fwd_benchmark.py_default') }}
261275 run : |
262276 cd benchmarks/triton_kernels_benchmark
263277 TRITON_INTEL_ADVANCED_PATH=0 \
278+ TRITON_INTEL_DISABLE_LARGE_BLOCK_SIZE_IO_FOR_TRANS_DOT_B=1 \
264279 TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
265280 IGC_VISAOptions=" -enableBCR" \
266281 python flash_attention_fwd_benchmark.py --reports $REPORTS
0 commit comments