name: Triton benchmarks
run-name: ${{ inputs.run_name }}
on:
  workflow_dispatch:
    inputs:
      runner_label:
        description: Runner label, keep empty for default
        type: string
        default: ""
      tag:
        description: Tag for benchmark results
        type: string
        default: "test"
      benchmarking_method:
        description: The method used to obtain performance numbers
        type: choice
        options:
          - ELAPSED_TIME
          - UPSTREAM_PYTORCH_PROFILER
          - PROTON_PROFILER
        default: UPSTREAM_PYTORCH_PROFILER
      verify:
        description: Verify the benchmark results
        type: boolean
        default: true
      run_name:
        description: Run name
        type: string
        default: "Triton benchmarks"
      n_runs:
        description: Number of runs for each benchmark
        type: number
        default: 1
      benchmarks:
        description: JSON list of benchmarks to run. Leave empty to run all benchmarks.
        type: string
        default: ""
      skip_benchmarks:
        description: JSON list of benchmarks to skip
        type: string
        default: "[]"
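  # A minimal sketch of a manual trigger via the GitHub CLI; the input values
  # below are illustrative, not defaults enforced here:
  #   gh workflow run "Triton benchmarks" \
  #     -f runner_label=max1550 \
  #     -f n_runs=3 \
  #     -f benchmarks='["gemm_tensor_of_ptr_benchmark.py"]'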
  # This workflow is also called from the triton-benchmarks-*.yml workflows.
  workflow_call:
    inputs:
      runner_label:
        description: Runner label
        type: string
      skip_benchmarks:
        description: JSON list of benchmarks to skip
        type: string
        default: "[]"
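  # A hypothetical caller, assuming the reusable-workflow shape above (the job
  # name and input values are placeholders, not taken from an actual caller):
  #   jobs:
  #     benchmarks:
  #       uses: ./.github/workflows/triton-benchmarks.yml
  #       with:
  #         runner_label: max1550
  #         skip_benchmarks: '["flex_attention_benchmark_custom_masks.py"]'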
# Cancels in-progress PR runs when the PR is updated. Manual runs are never cancelled.
concurrency:
  group: ${{ github.workflow }}-${{ github.event_name == 'workflow_dispatch' && github.run_id || github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'keep-going') && github.run_id || github.event.pull_request.number) || github.ref }}
  cancel-in-progress: true
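# How the group expression resolves, read directly off the ternary chain:
#   workflow_dispatch                    -> github.run_id (unique, so manual runs are never cancelled);
#   pull_request with 'keep-going' label -> github.run_id (unique, so new pushes don't cancel running jobs);
#   pull_request otherwise               -> the PR number (a new push cancels the previous run);
#   anything else (e.g. schedule)        -> github.ref.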
permissions: read-all
env:
  PYTHON_VERSION: "3.10"
  BENCHMARKING_METHOD: ${{ inputs.benchmarking_method || 'UPSTREAM_PYTORCH_PROFILER' }}
  VERIFY: ${{ (github.event_name == 'pull_request' || github.event_name == 'schedule' || inputs.verify) && '1' || '0' }}
  TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }}
  N_RUNS: ${{ inputs.n_runs || '1' }}
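  # TAG, spelled out: an explicit 'tag' input wins; otherwise PRs get
  # 'pr-<number>', scheduled runs get 'ci', and everything else falls back to
  # 'test'. VERIFY is forced on for pull_request and schedule events regardless
  # of the 'verify' input.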
jobs:
  build:
    name: Triton benchmarks
    runs-on:
      - linux
      - ${{ inputs.runner_label || 'max1550' }}
    timeout-minutes: 720
    defaults:
      run:
        shell: bash --noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}"
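    # With a custom shell, GitHub Actions substitutes {0} with the path of the
    # generated step script, so every `run:` block in this job executes with
    # the oneAPI environment from setvars.sh already sourced.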
    steps:
      - name: Print inputs
        run: |
          cat <<EOF
          ${{ toJSON(inputs) }}
          EOF
      - name: Checkout repository
        uses: actions/checkout@v5
      - name: Install Python (from pyenv) ${{ env.PYTHON_VERSION }}
        uses: ./.github/actions/setup-pyenv-python
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Identify Python version
        run: |
          PYTHON_VERSION="$(python -c 'import sys; print(f"{sys.version_info[0]}.{sys.version_info[1]}")')"
          echo "PYTHON_VERSION=$PYTHON_VERSION" | tee -a $GITHUB_ENV
      - name: Install Python build dependencies
        run: |
          pip install cmake
      - name: Setup PyTorch
        uses: ./.github/actions/setup-pytorch
      - name: Setup Triton
        uses: ./.github/actions/setup-triton
      - name: Create reports dir
        run: |
          mkdir reports
          echo "REPORTS=$PWD/reports" >> $GITHUB_ENV
      - name: Install benchmarks
        id: install
        run: |
          cd benchmarks
          pip install .
      - name: Build PTI
        run: |
          ./scripts/install-pti.sh --build-level-zero
          PTI_LIBS_DIR=$(python ./scripts/pti_lib.py)
          ls $PTI_LIBS_DIR
          echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV
      - name: Run Triton GEMM kernel benchmark - with tensor of pointer
        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }}
        run: |
          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
          cd benchmarks/triton_kernels_benchmark
          python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS
          source ../../scripts/capture-hw-details.sh
          python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-triton-report.csv --benchmark gemm-tensor-of-ptr --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
          python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-onednn-report.csv --benchmark gemm-tensor-of-ptr --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
          if [[ "${{ inputs.runner_label || 'max1550' }}" != "lnl" ]]; then
            python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-cutlass-report.csv --benchmark gemm-tensor-of-ptr --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
          fi
      - name: Run Triton GEMM kernel benchmark - with tensor descriptor
        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }}
        run: |
          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
          cd benchmarks/triton_kernels_benchmark
          python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
          source ../../scripts/capture-hw-details.sh
          python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-triton-report.csv --benchmark gemm-tensor-desc --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
          python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-onednn-report.csv --benchmark gemm-tensor-desc --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
          if [[ "${{ inputs.runner_label || 'max1550' }}" != "lnl" ]]; then
            python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-cutlass-report.csv --benchmark gemm-tensor-desc --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
          fi
      - name: Run Triton FlexAttention Causal Mask fwd kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }}
        run: |
          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
          cd benchmarks/triton_kernels_benchmark
          python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS
          source ../../scripts/capture-hw-details.sh
          python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-triton-report.csv --benchmark flex-attn-causal --compiler triton --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
          python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-torch-report.csv --benchmark flex-attn-causal --compiler torch --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Torch-TFlops --hbm_col "Torch-GB/s" --tag $TAG
      - name: Run Triton FlexAttention (batch_size=4) Causal Mask fwd kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_batch4-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_batch4-causal_mask.py') }}
        run: |
          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
          cd benchmarks/triton_kernels_benchmark
          BATCH_SIZE=4 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS
          source ../../scripts/capture-hw-details.sh
          python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-batch4-triton-report.csv --benchmark flex-attn-causal-batch4 --compiler triton --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
          python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-batch4-torch-report.csv --benchmark flex-attn-causal-batch4 --compiler torch --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Torch-TFlops --hbm_col "Torch-GB/s" --tag $TAG
      - name: Run Triton FlexAttention (batch_size=16) Causal Mask fwd kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_batch16-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_batch16-causal_mask.py') }}
        run: |
          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
          cd benchmarks/triton_kernels_benchmark
          BATCH_SIZE=16 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS
          source ../../scripts/capture-hw-details.sh
          python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-batch16-triton-report.csv --benchmark flex-attn-causal-batch16 --compiler triton --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
          python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-batch16-torch-report.csv --benchmark flex-attn-causal-batch16 --compiler torch --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Torch-TFlops --hbm_col "Torch-GB/s" --tag $TAG
      - name: Run Triton FlexAttention Causal Mask bwd kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_bwd_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_bwd_benchmark_causal_mask.py') }}
        run: |
          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
          cd benchmarks/triton_kernels_benchmark
          FA_KERNEL_MODE='bwd' \
          python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS
          source ../../scripts/capture-hw-details.sh
          python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-bwd-triton-report.csv --benchmark flex-attn-causal-bwd --compiler triton --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
          python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-bwd-torch-report.csv --benchmark flex-attn-causal-bwd --compiler torch --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Torch-TFlops --hbm_col "Torch-GB/s" --tag $TAG
      - name: Run Triton FlexAttention Custom Masks fwd kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }}
        run: |
          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
          cd benchmarks/triton_kernels_benchmark
          python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS
          source ../../scripts/capture-hw-details.sh
          python build_report.py $REPORTS/flexAttnMasks-performance.csv $REPORTS/flexAttnMasks-triton-report.csv --benchmark flex-attn-masks --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,MASK" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG --mask
          if [[ "${{ inputs.runner_label || 'max1550' }}" = "max1550" ]]; then
            # FIXME: XPU out of memory
            python build_report.py $REPORTS/flexAttnMasks-performance.csv $REPORTS/flexAttnMasks-onednn-report.csv --benchmark flex-attn-masks --compiler onednn --param_cols "Z,H,N_CTX,D_HEAD,MASK" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG --mask
          fi
      - name: Upload benchmark reports
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        uses: actions/upload-artifact@v5
        with:
          name: benchmark-reports
          path: reports