# Triton benchmarks — from PR #3350.
# (The GitHub diff-viewer boilerplate about hidden/bidirectional Unicode
# characters was removed; it was not part of the workflow.)
name: Triton benchmarks
run-name: ${{ inputs.run_name }}

on:
  workflow_dispatch:
    inputs:
      runner_label:
        description: Runner label, keep empty for default
        type: string
        default: ""
      tag:
        description: Tag for benchmark results
        type: string
        default: "test"
      benchmarking_method:
        description: The method used to obtain performance numbers
        type: choice
        options:
          - ELAPSED_TIME
          - UPSTREAM_PYTORCH_PROFILER
          - PROTON_PROFILER
        default: UPSTREAM_PYTORCH_PROFILER
      verify:
        description: Verify the benchmark results
        type: boolean
        default: true
      run_name:
        description: Run name
        type: string
        default: "Triton benchmarks"
      n_runs:
        description: Number of runs for each benchmark
        type: number
        default: 1
      benchmarks:
        description: JSON list of benchmarks to run. Leave empty to run all benchmarks.
        type: string
        default: ""
      skip_benchmarks:
        description: JSON list of benchmarks to skip
        type: string
        default: "[]"
  # This workflow is also called from workflows triton-benchmarks-*.yml.
  workflow_call:
    inputs:
      runner_label:
        description: Runner label
        type: string
      skip_benchmarks:
        description: JSON list of benchmarks to skip
        type: string
        default: "[]"
      # The inputs below are read by the `env` block and by step `if`
      # conditions, but were previously not declared for workflow_call, so
      # callers could not set them (undeclared `inputs.*` references silently
      # evaluate to empty). They are declared WITHOUT defaults — an unset
      # string stays '' and an unset boolean stays false — so the effective
      # values for existing callers are unchanged.
      tag:
        description: Tag for benchmark results
        type: string
      benchmarking_method:
        # `type: choice` is only valid for workflow_dispatch; callers pass a
        # plain string (validated by the consumer, see env.BENCHMARKING_METHOD).
        description: The method used to obtain performance numbers
        type: string
      verify:
        description: Verify the benchmark results
        type: boolean
      run_name:
        description: Run name
        type: string
      n_runs:
        description: Number of runs for each benchmark
        type: number
      benchmarks:
        description: JSON list of benchmarks to run. Leave empty to run all benchmarks.
        type: string
# Cancels in-progress PR runs when the PR is updated. Manual runs are never
# cancelled. Under workflow_call, github.event_name reflects the CALLER's
# trigger, so the pull_request branch is reachable even though this workflow
# itself only declares workflow_dispatch/workflow_call:
#   - workflow_dispatch  -> group keyed by run_id (unique, never cancelled)
#   - pull_request       -> keyed by PR number, unless labelled 'keep-going'
#   - anything else      -> keyed by git ref
concurrency:
  group: ${{ github.workflow }}-${{ github.event_name == 'workflow_dispatch' && github.run_id || github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'keep-going') && github.run_id || github.event.pull_request.number) || github.ref }}
  cancel-in-progress: true

permissions: read-all
env:
  # Quoted so YAML does not read it as the float 3.1.
  PYTHON_VERSION: "3.10"
  BENCHMARKING_METHOD: ${{ inputs.benchmarking_method || 'UPSTREAM_PYTORCH_PROFILER' }}
  # Verification is forced on for PR and scheduled runs; otherwise it follows
  # the `verify` input.
  VERIFY: ${{ (github.event_name == 'pull_request' || github.event_name == 'schedule' || inputs.verify) && '1' || '0' }}
  # Result tag precedence: explicit input > pr-<number> > 'ci' (schedule) > 'test'.
  TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }}
  N_RUNS: ${{ inputs.n_runs || '1' }}
jobs:
  build:
    name: Triton benchmarks
    runs-on:
      - linux
      - ${{ inputs.runner_label || 'max1550' }}
    timeout-minutes: 720
    defaults:
      run:
        # Every `run` step executes under the oneAPI environment with
        # fail-fast flags. NOTE: bash long options need two dashes — the
        # original `-noprofile` is parsed by bash as `-n -o profile`, which
        # is not a valid invocation.
        shell: bash --noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}"
    steps:
      - name: Print inputs
        run: |
          cat <<EOF
          ${{ toJSON(inputs) }}
          EOF
      - name: Checkout repository
        uses: actions/checkout@v5
      # Use env.PYTHON_VERSION in the step name: the original interpolated the
      # undeclared `inputs.python_version`, which always rendered empty.
      - name: Install Python (from pyenv) ${{ env.PYTHON_VERSION }}
        uses: ./.github/actions/setup-pyenv-python
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      # Re-export the interpreter version actually installed so later steps
      # see a canonical "major.minor" value.
      - name: Identify Python version
        run: |
          PYTHON_VERSION="$(python -c 'import sys; print(f"{sys.version_info[0]}.{sys.version_info[1]}")')"
          echo "PYTHON_VERSION=$PYTHON_VERSION" | tee -a $GITHUB_ENV
      - name: Install Python build dependencies
        run: |
          pip install cmake
      - name: Setup PyTorch
        uses: ./.github/actions/setup-pytorch
      - name: Setup Triton
        uses: ./.github/actions/setup-triton
      # REPORTS is the directory all benchmark steps write their CSVs into.
      - name: Create reports dir
        run: |
          mkdir reports
          echo "REPORTS=$PWD/reports" >> $GITHUB_ENV
      # `id: install` is referenced by every benchmark step's `if` condition.
      - name: Install benchmarks
        id: install
        run: |
          cd benchmarks
          pip install .
      # Builds PTI and exports its library directory; benchmark steps prepend
      # it to LD_LIBRARY_PATH before running.
      - name: Build PTI
        run: |
          ./scripts/install-pti.sh --build-level-zero
          PTI_LIBS_DIR=$(python ./scripts/pti_lib.py)
          ls $PTI_LIBS_DIR
          echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV
| - name: Run Triton GEMM kernel benchmark - with tensor of pointer | |
| if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }} | |
| run: | | |
| export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH | |
| cd benchmarks/triton_kernels_benchmark | |
| python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS | |
| source ../../scripts/capture-hw-details.sh | |
| python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-triton-report.csv --benchmark gemm-tensor-of-ptr --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG | |
| python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-onednn-report.csv --benchmark gemm-tensor-of-ptr --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG | |
| if [[ "${{ inputs.runner_label || 'max1550' }}" != "lnl" ]]; then | |
| python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-cutlass-report.csv --benchmark gemm-tensor-of-ptr --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG | |
| fi | |
| - name: Run Triton GEMM kernel benchmark - with tensor descriptor | |
| if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }} | |
| run: | | |
| export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH | |
| cd benchmarks/triton_kernels_benchmark | |
| python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS | |
| source ../../scripts/capture-hw-details.sh | |
| python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-triton-report.csv --benchmark gemm-tensor-desc --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG | |
| python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-onednn-report.csv --benchmark gemm-tensor-desc --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG | |
| if [[ "${{ inputs.runner_label || 'max1550' }}" != "lnl" ]]; then | |
| python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-cutlass-report.csv --benchmark gemm-tensor-desc --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG | |
| fi | |
| - name: Run Triton FlexAttention Causal Mask fwd kernel benchmark | |
| if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }} | |
| run: | | |
| export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH | |
| cd benchmarks/triton_kernels_benchmark | |
| python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS | |
| source ../../scripts/capture-hw-details.sh | |
| python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-triton-report.csv --benchmark flex-attn-causal --compiler triton --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG | |
| python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-torch-report.csv --benchmark flex-attn-causal --compiler torch --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Torch-TFlops --hbm_col "Torch-GB/s" --tag $TAG | |
| - name: Run Triton FlexAttention (batch_size=4) Causal Mask fwd kernel benchmark | |
| if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_batch4-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_batch4-causal_mask.py') }} | |
| run: | | |
| export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH | |
| cd benchmarks/triton_kernels_benchmark | |
| BATCH_SIZE=4 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS | |
| source ../../scripts/capture-hw-details.sh | |
| python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-batch4-triton-report.csv --benchmark flex-attn-causal-batch4 --compiler triton --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG | |
| python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-batch4-torch-report.csv --benchmark flex-attn-causal-batch4 --compiler torch --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Torch-TFlops --hbm_col "Torch-GB/s" --tag $TAG | |
| - name: Run Triton FlexAttention (batch_size=16) Causal Mask fwd kernel benchmark | |
| if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_batch16-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_batch16-causal_mask.py') }} | |
| run: | | |
| export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH | |
| cd benchmarks/triton_kernels_benchmark | |
| BATCH_SIZE=16 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS | |
| source ../../scripts/capture-hw-details.sh | |
| python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-batch16-triton-report.csv --benchmark flex-attn-causal-batch16 --compiler triton --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG | |
| python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-batch16-torch-report.csv --benchmark flex-attn-causal-batch16 --compiler torch --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Torch-TFlops --hbm_col "Torch-GB/s" --tag $TAG | |
| - name: Run Triton FlexAttention Causal Mask bwd kernel benchmark | |
| if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_bwd_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_bwd_benchmark_causal_mask.py') }} | |
| run: | | |
| export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH | |
| cd benchmarks/triton_kernels_benchmark | |
| FA_KERNEL_MODE='bwd' \ | |
| python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS | |
| source ../../scripts/capture-hw-details.sh | |
| python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-bwd-triton-report.csv --benchmark flex-attn-causal-bwd --compiler triton --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG | |
| python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-bwd-torch-report.csv --benchmark flex-attn-causal-bwd --compiler torch --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Torch-TFlops --hbm_col "Torch-GB/s" --tag $TAG | |
| - name: Run Triton FlexAttention Custom Masks fwd kernel benchmark | |
| if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }} | |
| run: | | |
| export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH | |
| cd benchmarks/triton_kernels_benchmark | |
| python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS | |
| source ../../scripts/capture-hw-details.sh | |
| python build_report.py $REPORTS/flexAttnMasks-performance.csv $REPORTS/flexAttnMasks-triton-report.csv --benchmark flex-attn-masks --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,MASK" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG --mask | |
| if [[ "${{ inputs.runner_label || 'max1550' }}" = "max1550" ]]; then | |
| # FIXME: XPU out of memory | |
| python build_report.py $REPORTS/flexAttnMasks-performance.csv $REPORTS/flexAttnMasks-onednn-report.csv --benchmark flex-attn-masks --compiler onednn --param_cols "Z,H,N_CTX,D_HEAD,MASK" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG --mask | |
| fi | |
| - name: Upload benchmark reports | |
| if: ${{ steps.install.outcome == 'success' && !cancelled() }} | |
| uses: actions/upload-artifact@v5 | |
| with: | |
| name: benchmark-reports | |
| path: reports |