Skip to content

Use the same driver for the upstream profiler as for the legacy profiler with IPEX #739

Use the same driver for the upstream profiler as for the legacy profiler with IPEX

Use the same driver for the upstream profiler as for the legacy profiler with IPEX #739

name: Triton benchmarks
on:
workflow_dispatch:
inputs:
runner_label:
description: Runner label, keep empty for default
type: string
default: ""
tag:
description: Tag for benchmark results
type: string
default: "test"
benchmarking_method:
description: The method used to obtain performance numbers
type: choice
options:
- PYTORCH_LEGACY_PROFILER_USING_IPEX
- ELAPSED_TIME
- UPSTREAM_PYTORCH_PROFILER
default: PYTORCH_LEGACY_PROFILER_USING_IPEX
schedule:
- cron: "5 23 * * *"
pull_request:
branches:
- main
paths:
- .github/workflows/triton-benchmarks.yml
- benchmarks/**
permissions: read-all
env:
PYTHON_VERSION: "3.10"
BENCHMARKING_METHOD: ${{ inputs.benchmarking_method || 'PYTORCH_LEGACY_PROFILER_USING_IPEX' }}
USE_IPEX: ${{ github.event_name != 'workflow_dispatch' && '1' || inputs.benchmarking_method == 'PYTORCH_LEGACY_PROFILER_USING_IPEX' && '1' || '0' }}
TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && 'pr') || (github.event_name == 'schedule' && 'ci') || 'test' }}
jobs:
build:
name: Triton benchmarks
runs-on:
- ${{ inputs.runner_label || 'max1550' }}
timeout-minutes: 720
defaults:
run:
shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}"
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Load pip cache
id: pip-cache
uses: ./.github/actions/load
with:
path: $HOME/.cache/pip
# pip cache per commit id just to minimize network traffic
key: pip-$PYTHON_VERSION-$GITHUB_SHA
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install Python build dependencies
run: |
pip install wheel cmake
- name: Setup PyTorch with IPEX
if: ${{ env.USE_IPEX == '1' }}
uses: ./.github/actions/setup-pytorch
with:
repository: Stonepia/pytorch
- name: Setup PyTorch without IPEX
if: ${{ env.USE_IPEX == '0' }}
uses: ./.github/actions/setup-pytorch
with:
repository: pytorch/pytorch
- name: Setup IPEX
if: ${{ env.USE_IPEX == '1' }}
uses: ./.github/actions/setup-ipex
- name: Build Triton wheels
uses: ./.github/actions/setup-triton
with:
command: DEBUG=1 python setup.py bdist_wheel
- name: Install Triton
run: |
pip install python/dist/*.whl
- name: Install benchmark dependencies
run: |
pip install matplotlib pandas tabulate
- name: Create reports dir
run: |
mkdir reports
echo "REPORTS=$PWD/reports" >> $GITHUB_ENV
- name: Install benchmarks
id: install
run: |
cd benchmarks
python setup.py install
- name: Run Triton Softmax kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
python fused_softmax.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-triton-report.csv --benchmark softmax --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
- name: Run Triton GEMM kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
python gemm_benchmark.py --reports $REPORTS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-xetla-report.csv --benchmark gemm --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
- name: Run Triton GEMM kernel benchmark - default path
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
# Default path:
TRITON_INTEL_ADVANCED_PATH=0 \
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
IGC_VISAOptions=" -enableBCR -nolocalra" \
IGC_DisableLoopUnroll=1 \
python gemm_benchmark.py --reports $REPORTS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-default-path.csv
source ../../scripts/capture-hw-details.sh
TAG="${TAG}-dflt"
python ../../scripts/build_report.py $REPORTS/matmul-performance-default-path.csv $REPORTS/gemm-triton-default-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton GEMM kernel benchmark - advanced path
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
# Advanced path:
TRITON_INTEL_ADVANCED_PATH=1 \
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
IGC_VISAOptions=" -enableBCR -nolocalra" \
IGC_DisableLoopUnroll=1 \
python gemm_benchmark.py --reports $REPORTS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-adv-path.csv
source ../../scripts/capture-hw-details.sh
TAG="${TAG}-adv"
python ../../scripts/build_report.py $REPORTS/matmul-performance-adv-path.csv $REPORTS/gemm-triton-advanced-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton GEMM (A@B^t) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-triton-report.csv --benchmark gemm-bt --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-onednn-report.csv --benchmark gemm-bt --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG
- name: Run Triton GEMM (A^t@B) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-triton-report.csv --benchmark gemm-at --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-onednn-report.csv --benchmark gemm-at --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG
- name: Run Triton GEMM (stream-k) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
python gemm_streamk_benchmark.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-triton-report.csv --benchmark gemm-streamk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton GEMM (split-k) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
python gemm_splitk_benchmark.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-triton-report.csv --benchmark gemm-splitk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton GEMM + PreOp (exp) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
python gemm_preop_exp_benchmark.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/matmul-performance-preop-exp.csv $REPORTS/gemm-preop-exp-triton-report.csv --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton GEMM + PostOp (Gelu) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
python gemm_postop_gelu_benchmark.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-gelu.csv $REPORTS/gemm-postop-gelu-triton-report.csv --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton GEMM + PostOp (add matrix) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
python gemm_postop_addmatrix_benchmark.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-addmatrix.csv $REPORTS/gemm-postop-addmatrix-triton-report.csv --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton FA kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
python flash_attention_fwd_benchmark.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
- name: Run Triton FA kernel benchmark - default path
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
TRITON_INTEL_ADVANCED_PATH=0 \
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
IGC_VISAOptions=" -enableBCR" \
python flash_attention_fwd_benchmark.py --reports $REPORTS
TAG="${TAG}-dflt"
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-default-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton FA kernel benchmark - advanced path
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
TRITON_INTEL_ADVANCED_PATH=1 \
TRITON_INTEL_ENABLE_INSTR_SCHED=1 \
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
IGC_VISAOptions=" -enableBCR" \
python flash_attention_fwd_benchmark.py --reports $REPORTS
TAG="${TAG}-adv"
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Prefix Sums kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
python prefix_sums.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv --benchmark prefix_sums --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run micro benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/micro_benchmarks
python run_benchmarks.py --reports $REPORTS
- name: Save pip cache
if: ${{ steps.pip-cache.outputs.status == 'miss' }}
uses: ./.github/actions/save
with:
path: ${{ steps.pip-cache.outputs.path }}
dest: ${{ steps.pip-cache.outputs.dest }}
- name: Upload benchmark reports
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
uses: actions/upload-artifact@v4
with:
name: benchmark-reports
path: reports