Triton benchmarks #3267

Workflow file for this run

.github/workflows/triton-benchmarks.yml at 91cac59

	# Workflow title; the title of an individual run is taken from the `run_name` input.
	name: Triton benchmarks
	run-name: ${{ inputs.run_name }}

	# Triggers: manual dispatch (full set of inputs) and reusable-workflow calls
	# (callers can only pass `runner_label` and `skip_benchmarks`).
	on:
	workflow_dispatch:
	inputs:
	# Second `runs-on` label; the job falls back to 'max1550' when this is empty.
	runner_label:
	description: Runner label, keep empty for default
	type: string
	default: ""
	# Passed to build_report.py as --tag (via the TAG environment variable).
	tag:
	description: Tag for benchmark results
	type: string
	default: "test"
	# Exported to every step as BENCHMARKING_METHOD.
	benchmarking_method:
	description: The method used to obtain performance numbers
	type: choice
	options:
	- ELAPSED_TIME
	- UPSTREAM_PYTORCH_PROFILER
	- PROTON_PROFILER
	default: UPSTREAM_PYTORCH_PROFILER
	# Exported as VERIFY ('1' or '0').
	verify:
	description: Verify the benchmark results
	type: boolean
	default: true
	run_name:
	description: Run name
	type: string
	default: "Triton benchmarks"
	# Exported as N_RUNS and passed to the benchmark scripts as --n_runs.
	n_runs:
	description: Number of runs for each benchmark
	type: number
	default: 1
	# Both lists are parsed with fromJson() in the step conditions and matched
	# against per-step keys such as 'fused_softmax.py'.
	benchmarks:
	description: JSON list of benchmarks to run. Leave empty to run all benchmarks.
	type: string
	default: ""
	skip_benchmarks:
	description: JSON list of benchmarks to skip
	type: string
	default: "[]"

	# This workflow is also called from workflows triton-benchmarks-*.yml.
	workflow_call:
	inputs:
	runner_label:
	description: Runner label
	type: string
	skip_benchmarks:
	description: JSON list of benchmarks to skip
	type: string
	default: "[]"

	# Cancels in-progress PR runs when the PR is updated. Manual runs are never cancelled.
	# Group key, in order of precedence:
	#   - workflow_dispatch: the run id, so each manual run has its own group;
	#   - pull_request: the run id when the PR carries the 'keep-going' label,
	#     otherwise the PR number;
	#   - anything else: the git ref.
	# NOTE(review): no pull_request trigger is declared in this file; presumably the
	# PR branch applies when a calling workflow is triggered by a PR - confirm.
	concurrency:
	group: ${{ github.workflow }}-${{ github.event_name == 'workflow_dispatch' && github.run_id \|\| github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'keep-going') && github.run_id \|\| github.event.pull_request.number) \|\| github.ref }}
	cancel-in-progress: true

	# The workflow token gets read access only, for every permission scope.
	permissions: read-all

	# Workflow-wide environment, visible to every step.
	env:
	# Quoted so YAML keeps the string "3.10" instead of reading the float 3.1.
	PYTHON_VERSION: "3.10"
	# Falls back to UPSTREAM_PYTORCH_PROFILER when the input is not provided.
	BENCHMARKING_METHOD: ${{ inputs.benchmarking_method \|\| 'UPSTREAM_PYTORCH_PROFILER' }}
	# '1' for pull_request and schedule events or when `verify` is true, otherwise '0'.
	VERIFY: ${{ (github.event_name == 'pull_request' \|\| github.event_name == 'schedule' \|\| inputs.verify) && '1' \|\| '0' }}
	# The `tag` input if set; otherwise 'pr-<number>' for pull requests, 'ci' for
	# scheduled runs, and 'test' for everything else.
	TAG: ${{ inputs.tag \|\| (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) \|\| (github.event_name == 'schedule' && 'ci') \|\| 'test' }}
	# Falls back to '1' when the input is not provided.
	N_RUNS: ${{ inputs.n_runs \|\| '1' }}

	# Single job: prepares the toolchain once, then runs every selected benchmark
	# as its own step on the same runner.
	jobs:
	build:
	name: Triton benchmarks
	# Runner must carry the `linux` label plus the requested label (default 'max1550').
	runs-on:
	- linux
	- ${{ inputs.runner_label \|\| 'max1550' }}
	timeout-minutes: 720
	defaults:
	run:
	# Every `run` script is sourced by bash (-e, -o pipefail) after sourcing the
	# oneAPI setvars.sh; {0} is the placeholder for the generated script file.
	shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}"
	steps:
	steps:
	# Echo the resolved workflow inputs as JSON into the job log.
	# The heredoc delimiter is quoted so bash prints the JSON verbatim: with an
	# unquoted delimiter, `$`, backticks and `$(...)` inside input values would be
	# expanded (and command substitutions executed) before being printed.
	- name: Print inputs
	run: \|
	cat <<'EOF'
	${{ toJSON(inputs) }}
	EOF

	# Check out the repository; later steps use its local actions (./.github/actions/*),
	# ./scripts and ./benchmarks.
	- name: Checkout repository
	uses: actions/checkout@v5

	# Install the Python interpreter pinned by env.PYTHON_VERSION through the
	# repository-local pyenv action. The step title shows the same version; it reads
	# env.PYTHON_VERSION because neither trigger declares a `python_version` input,
	# so `inputs.python_version` would always render as an empty string.
	- name: Install Python (from pyenv) ${{ env.PYTHON_VERSION }}
	uses: ./.github/actions/setup-pyenv-python
	with:
	python-version: ${{ env.PYTHON_VERSION }}

	# Ask the `python` on PATH for its major.minor version and write it to
	# $GITHUB_ENV as PYTHON_VERSION for the following steps (tee also prints it).
	- name: Identify Python version
	run: \|
	PYTHON_VERSION="$(python -c 'import sys; print(f"{sys.version_info[0]}.{ sys.version_info[1]}")')"
	echo "PYTHON_VERSION=$PYTHON_VERSION" \| tee -a $GITHUB_ENV

	# Install cmake from PyPI before the PyTorch/Triton setup actions run.
	- name: Install Python build dependencies
	run: \|
	pip install cmake

	# Delegate PyTorch setup to the repository-local composite action (no inputs passed).
	- name: Setup PyTorch
	uses: ./.github/actions/setup-pytorch

	# Delegate Triton setup to the repository-local composite action (no inputs passed).
	- name: Setup Triton
	uses: ./.github/actions/setup-triton

	# Create ./reports in the workspace and export its absolute path as REPORTS;
	# the benchmark steps write there and the last step uploads the directory.
	- name: Create reports dir
	run: \|
	mkdir reports
	echo "REPORTS=$PWD/reports" >> $GITHUB_ENV

	# pip-install the package in ./benchmarks. The `install` id matters: every
	# benchmark step and the upload step check `steps.install.outcome == 'success'`.
	- name: Install benchmarks
	id: install
	run: \|
	cd benchmarks
	pip install .

	# Run install-pti.sh with --build-level-zero, take the library directory printed
	# by pti_lib.py, list it in the log, and export it as PTI_LIBS_DIR. Each benchmark
	# step prepends that directory to LD_LIBRARY_PATH.
	- name: Build PTI
	run: \|
	./scripts/install-pti.sh --build-level-zero
	PTI_LIBS_DIR=$(python ./scripts/pti_lib.py)
	ls $PTI_LIBS_DIR
	echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV

	# Benchmark key 'fused_softmax.py'. Runs when the install step succeeded, the run
	# was not cancelled, the key is selected (`benchmarks` empty or containing it)
	# and it is not listed in `skip_benchmarks`.
	# Turns softmax-performance.csv into Triton, XeTLA and oneDNN reports.
	- name: Run Triton Softmax kernel benchmark
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'fused_softmax.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS
	source ../../scripts/capture-hw-details.sh
	python build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-triton-report.csv --benchmark softmax --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	python build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
	python build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-onednn-report.csv --benchmark softmax --compiler onednn --param_cols "N" --tflops_col oneDNN-TFlops --hbm_col "oneDNN-GB/s" --tag $TAG

	# Same selection key and condition as the regular softmax step ('fused_softmax.py').
	# Runs the script once with BENCHMARKING_METHOD overridden to PROTON_PROFILER for
	# that command only; no --reports/--n_runs are passed and no report is built.
	- name: Run Triton Softmax kernel benchmark with Proton
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'fused_softmax.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	BENCHMARKING_METHOD=PROTON_PROFILER python fused_softmax.py
	source ../../scripts/capture-hw-details.sh

	# Benchmark key 'gemm_benchmark.py' (same selection/skip condition shape as the
	# other benchmark steps). The raw matmul-performance.csv is renamed to
	# matmul-performance-base.csv before reporting; the transposed GEMM steps run the
	# same script and rename that same output file to their own names.
	# Builds Triton and oneDNN reports; the CUTLASS report is skipped on 'lnl' runners.
	- name: Run Triton GEMM kernel benchmark
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'gemm_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'gemm_benchmark.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
	mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv
	source ../../scripts/capture-hw-details.sh
	python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-onednn-report.csv --benchmark gemm --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
	if [[ "${{ inputs.runner_label \|\| 'max1550' }}" != "lnl" ]]; then
	python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-cutlass-report.csv --benchmark gemm --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
	fi

	# Benchmark key 'gemm_tensor_of_ptr_benchmark.py'. Builds Triton and oneDNN
	# reports from matmul-tensor-of-ptr-performance.csv; the CUTLASS report is
	# skipped on 'lnl' runners.
	- name: Run Triton GEMM kernel benchmark - with tensor of pointer
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'gemm_tensor_of_ptr_benchmark.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS
	source ../../scripts/capture-hw-details.sh
	python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-triton-report.csv --benchmark gemm-tensor-of-ptr --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-onednn-report.csv --benchmark gemm-tensor-of-ptr --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
	if [[ "${{ inputs.runner_label \|\| 'max1550' }}" != "lnl" ]]; then
	python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-cutlass-report.csv --benchmark gemm-tensor-of-ptr --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
	fi

	# Benchmark key 'gemm_tensor_desc_benchmark.py'. Builds Triton and oneDNN
	# reports from matmul-tensor-desc-performance.csv; the CUTLASS report is
	# skipped on 'lnl' runners.
	- name: Run Triton GEMM kernel benchmark - with tensor descriptor
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'gemm_tensor_desc_benchmark.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
	source ../../scripts/capture-hw-details.sh
	python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-triton-report.csv --benchmark gemm-tensor-desc --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-onednn-report.csv --benchmark gemm-tensor-desc --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
	if [[ "${{ inputs.runner_label \|\| 'max1550' }}" != "lnl" ]]; then
	python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-cutlass-report.csv --benchmark gemm-tensor-desc --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
	fi

	# Benchmark key 'gemm_benchmark.py_abt'. Runs gemm_benchmark.py with TRANSPOSE_B=1
	# and renames its matmul-performance.csv to matmul-performance-bt.csv before
	# building the Triton and oneDNN 'gemm-bt' reports.
	- name: Run Triton GEMM (A@B^t) kernel benchmark
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'gemm_benchmark.py_abt') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
	mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv
	source ../../scripts/capture-hw-details.sh

	python build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-triton-report.csv --benchmark gemm-bt --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	python build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-onednn-report.csv --benchmark gemm-bt --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG

	# Benchmark key 'gemm_benchmark.py_atb'. Runs gemm_benchmark.py with TRANSPOSE_A=1
	# and renames its matmul-performance.csv to matmul-performance-at.csv before
	# building the Triton and oneDNN 'gemm-at' reports.
	- name: Run Triton GEMM (A^t@B) kernel benchmark
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'gemm_benchmark.py_atb') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
	mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv
	source ../../scripts/capture-hw-details.sh

	python build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-triton-report.csv --benchmark gemm-at --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	python build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-onednn-report.csv --benchmark gemm-at --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG

	# Benchmark key 'gemm_streamk_benchmark.py'. Builds Triton and XeTLA reports from
	# matmul-streamk-performance.csv; parameter columns are M,K,N (no batch column).
	- name: Run Triton GEMM (stream-k) kernel benchmark
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'gemm_streamk_benchmark.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
	source ../../scripts/capture-hw-details.sh
	python build_report.py $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-triton-report.csv --benchmark gemm-streamk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	python build_report.py $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-xetla-report.csv --benchmark gemm-streamk --compiler xetla --param_cols "M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG

	# Benchmark key 'gemm_splitk_benchmark.py'. Builds Triton and XeTLA reports from
	# matmul-splitk-performance.csv; parameter columns are M,K,N (no batch column).
	- name: Run Triton GEMM (split-k) kernel benchmark
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'gemm_splitk_benchmark.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
	source ../../scripts/capture-hw-details.sh
	python build_report.py $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-triton-report.csv --benchmark gemm-splitk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	python build_report.py $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-xetla-report.csv --benchmark gemm-splitk --compiler xetla --param_cols "M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG

	# Benchmark key 'gemm_preop_exp_benchmark.py'. Only a Triton report is built,
	# from matmul-performance-preop-exp.csv.
	- name: Run Triton GEMM + PreOp (exp) kernel benchmark
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'gemm_preop_exp_benchmark.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS
	source ../../scripts/capture-hw-details.sh
	python build_report.py $REPORTS/matmul-performance-preop-exp.csv $REPORTS/gemm-preop-exp-triton-report.csv --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

	# Benchmark key 'gemm_postop_gelu_benchmark.py'. Only a Triton report is built,
	# from matmul-performance-postop-gelu.csv.
	- name: Run Triton GEMM + PostOp (Gelu) kernel benchmark
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'gemm_postop_gelu_benchmark.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS
	source ../../scripts/capture-hw-details.sh
	python build_report.py $REPORTS/matmul-performance-postop-gelu.csv $REPORTS/gemm-postop-gelu-triton-report.csv --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

	# Benchmark key 'gemm_postop_addmatrix_benchmark_bfloat16.py'; the script that
	# actually runs is gemm_postop_addmatrix_benchmark.py (without INT8_ONLY).
	# Reports are published under the benchmark name 'gemm-postop-addmatrix'
	# (no '-bfloat16' suffix, unlike the int8 variant's '-int8').
	- name: Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
	source ../../scripts/capture-hw-details.sh
	python build_report.py $REPORTS/matmul-performance-postop-addmatrix-bfloat16.csv $REPORTS/gemm-postop-addmatrix-bfloat16-triton-report.csv --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	python build_report.py $REPORTS/matmul-performance-postop-addmatrix-bfloat16.csv $REPORTS/gemm-postop-addmatrix-bfloat16-onednn-report.csv --benchmark gemm-postop-addmatrix --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG

	# Benchmark key 'gemm_postop_addmatrix_benchmark_int8.py'; runs
	# gemm_postop_addmatrix_benchmark.py with INT8_ONLY=1 and builds Triton and
	# oneDNN reports under 'gemm-postop-addmatrix-int8'.
	- name: Run Triton GEMM + PostOp (add matrix) kernel benchmark int8
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
	source ../../scripts/capture-hw-details.sh
	python build_report.py $REPORTS/matmul-performance-postop-addmatrix-int8.csv $REPORTS/gemm-postop-addmatrix-int8-triton-report.csv --benchmark gemm-postop-addmatrix-int8 --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	python build_report.py $REPORTS/matmul-performance-postop-addmatrix-int8.csv $REPORTS/gemm-postop-addmatrix-int8-onednn-report.csv --benchmark gemm-postop-addmatrix-int8 --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG

	# Benchmark key 'flash_attention_benchmark.py'. Builds Triton and CUTLASS
	# 'flash-attn' reports from attn-performance.csv (the file is not renamed here).
	- name: Run Triton FA fwd kernel benchmark
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'flash_attention_benchmark.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS

	source ../../scripts/capture-hw-details.sh
	python build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark flash-attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	python build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-cutlass-report.csv --benchmark flash-attn --compiler cutlass --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG

	# Benchmark key 'flash_attention_bwd_benchmark.py'; runs flash_attention_benchmark.py
	# with FA_KERNEL_MODE="bwd" and renames attn-performance.csv to
	# attn-bwd-performance.csv before building the Triton and XeTLA
	# 'flash-attn-bwd' reports.
	- name: Run Triton FA bwd kernel benchmark
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'flash_attention_bwd_benchmark.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	FA_KERNEL_MODE="bwd" \
	python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
	mv $REPORTS/attn-performance.csv $REPORTS/attn-bwd-performance.csv

	source ../../scripts/capture-hw-details.sh
	python build_report.py $REPORTS/attn-bwd-performance.csv $REPORTS/attn-bwd-triton-report.csv --benchmark flash-attn-bwd --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	python build_report.py $REPORTS/attn-bwd-performance.csv $REPORTS/attn-bwd-xetla-report.csv --benchmark flash-attn-bwd --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG

	# Benchmark key 'flash_attention_tensor_desc_benchmark.py'. Its attn-performance.csv
	# is renamed to attn-tensor-desc-performance.csv before building the Triton and
	# CUTLASS 'flash-attn-tensor-desc' reports.
	- name: Run Triton FA fwd kernel benchmark - with tensor descriptors
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'flash_attention_tensor_desc_benchmark.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
	mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv

	source ../../scripts/capture-hw-details.sh
	python build_report.py $REPORTS/attn-tensor-desc-performance.csv $REPORTS/attn-tensor-desc-triton-report.csv --benchmark flash-attn-tensor-desc --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	python build_report.py $REPORTS/attn-tensor-desc-performance.csv $REPORTS/attn-tensor-desc-cutlass-report.csv --benchmark flash-attn-tensor-desc --compiler cutlass --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG

	# Benchmark key 'flex_attention_benchmark_causal_mask.py'. Builds Triton and Torch
	# 'flex-attn-causal' reports from flexAttnCausal-performance.csv.
	- name: Run Triton FlexAttention Causal Mask fwd kernel benchmark
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'flex_attention_benchmark_causal_mask.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS

	source ../../scripts/capture-hw-details.sh
	python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-triton-report.csv --benchmark flex-attn-causal --compiler triton --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-torch-report.csv --benchmark flex-attn-causal --compiler torch --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Torch-TFlops --hbm_col "Torch-GB/s" --tag $TAG

	# Benchmark key 'flex_attention_benchmark_batch4-causal_mask.py'; runs
	# flex_attention_benchmark_causal_mask.py with BATCH_SIZE=4. The reports read the
	# same flexAttnCausal-performance.csv file name as the default causal-mask step
	# (no rename) and are published as 'flex-attn-causal-batch4'.
	- name: Run Triton FlexAttention (batch_size=4) Causal Mask fwd kernel benchmark
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'flex_attention_benchmark_batch4-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'flex_attention_benchmark_batch4-causal_mask.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	BATCH_SIZE=4 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS

	source ../../scripts/capture-hw-details.sh
	python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-batch4-triton-report.csv --benchmark flex-attn-causal-batch4 --compiler triton --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-batch4-torch-report.csv --benchmark flex-attn-causal-batch4 --compiler torch --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Torch-TFlops --hbm_col "Torch-GB/s" --tag $TAG

	# Benchmark key 'flex_attention_benchmark_batch16-causal_mask.py'; runs
	# flex_attention_benchmark_causal_mask.py with BATCH_SIZE=16. The reports read the
	# same flexAttnCausal-performance.csv file name as the other causal-mask steps
	# (no rename) and are published as 'flex-attn-causal-batch16'.
	- name: Run Triton FlexAttention (batch_size=16) Causal Mask fwd kernel benchmark
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'flex_attention_benchmark_batch16-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'flex_attention_benchmark_batch16-causal_mask.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	BATCH_SIZE=16 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS

	source ../../scripts/capture-hw-details.sh
	python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-batch16-triton-report.csv --benchmark flex-attn-causal-batch16 --compiler triton --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-batch16-torch-report.csv --benchmark flex-attn-causal-batch16 --compiler torch --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Torch-TFlops --hbm_col "Torch-GB/s" --tag $TAG

	# Benchmark key 'flex_attention_benchmark_custom_masks.py'. build_report.py is
	# called with --mask and a MASK parameter column. The Triton report is always
	# built; the oneDNN report only when the runner label is exactly 'max1550'
	# (see the FIXME inside the script).
	- name: Run Triton FlexAttention Custom Masks fwd kernel benchmark
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'flex_attention_benchmark_custom_masks.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS

	source ../../scripts/capture-hw-details.sh
	python build_report.py $REPORTS/flexAttnMasks-performance.csv $REPORTS/flexAttnMasks-triton-report.csv --benchmark flex-attn-masks --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,MASK" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG --mask
	if [[ "${{ inputs.runner_label \|\| 'max1550' }}" = "max1550" ]]; then
	# FIXME: XPU out of memory
	python build_report.py $REPORTS/flexAttnMasks-performance.csv $REPORTS/flexAttnMasks-onednn-report.csv --benchmark flex-attn-masks --compiler onednn --param_cols "Z,H,N_CTX,D_HEAD,MASK" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG --mask
	fi

	# Benchmark key 'prefix_sums.py'. Only a Triton report is built, from
	# prefix-sums.csv, with parameter columns M,N,AXIS.
	- name: Run Prefix Sums kernel benchmark
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'prefix_sums.py') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/triton_kernels_benchmark
	python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS
	source ../../scripts/capture-hw-details.sh
	python build_report.py $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv --benchmark prefix_sums --compiler triton --param_cols "M,N,AXIS" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

	# Benchmark key 'micro_benchmarks.py'. Runs when the install step succeeded, the
	# run was not cancelled, and the key is selected (`benchmarks` empty or
	# containing it) and not skipped. The skip check accepts the same key used for
	# selection as well as the older spelling 'micro_benchmarks' (no '.py'), so the
	# key works in both lists and existing skip lists keep working.
	# Runs benchmarks/micro_benchmarks/run_benchmarks.py; no build_report.py call.
	- name: Run micro benchmark
	if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' \|\| contains(fromJson(inputs.benchmarks \|\| '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'micro_benchmarks.py') && !contains(fromJson(inputs.skip_benchmarks \|\| '[]'), 'micro_benchmarks') }}
	run: \|
	export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
	cd benchmarks/micro_benchmarks
	python run_benchmarks.py --reports $REPORTS

	# Upload the `reports` directory as the 'benchmark-reports' artifact. The
	# condition only requires a successful install and a non-cancelled run, so the
	# upload is not gated on the individual benchmark steps succeeding.
	- name: Upload benchmark reports
	if: ${{ steps.install.outcome == 'success' && !cancelled() }}
	uses: actions/upload-artifact@v5
	with:
	name: benchmark-reports
	path: reports

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Triton benchmarks #3267

Workflow file

Triton benchmarks #3267

Uh oh!

Jobs

Run details

Workflow file for this run