name: Third party benchmarks
on:
  workflow_dispatch:
    inputs:
      runner_label:
        description: Runner label, keep empty for default
        type: string
        default: ""
      tag:
        description: Tag for benchmark results
        type: string
        default: "test"
      benchmarks:
        description: JSON list of benchmarks to run. Leave empty to run all benchmarks.
        type: string
        default: ""
  schedule:
    # About 2 AM PST (UTC-8)
    - cron: "5 10 * * *"
permissions: read-all
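# TAG labels the benchmark result rows: an explicit `tag` input wins, pull request
# events would get 'pr-<number>', scheduled runs get 'ci', and everything else
# falls back to 'test'.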
env:
  PYTHON_VERSION: "3.10"
  TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }}
  HF_TOKEN: ${{ secrets.HF_TOKEN || '' }}
jobs:
  build:
    name: Third party benchmarks
    runs-on:
      - linux
      - ${{ inputs.runner_label || 'max1550' }}
    timeout-minutes: 720
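    # Every `run` step below goes through this shell wrapper, so the oneAPI
    # environment (setvars.sh) is sourced before the step script executes.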
    defaults:
      run:
        shell: bash --noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}"
    steps:
      - name: Print inputs
        run: |
          cat <<EOF
          ${{ toJSON(inputs) }}
          EOF
      - name: Checkout repository
        uses: actions/checkout@v6
      - name: Install Python (from pyenv) ${{ env.PYTHON_VERSION }}
        uses: ./.github/actions/setup-pyenv-python
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Identify Python version
        run: |
          PYTHON_VERSION="$(python -c 'import sys; print(f"{sys.version_info[0]}.{sys.version_info[1]}")')"
          echo "PYTHON_VERSION=$PYTHON_VERSION" | tee -a $GITHUB_ENV
      - name: Install Python build dependencies
        run: |
          pip install cmake
      - name: Setup PyTorch
        uses: ./.github/actions/setup-pytorch
      - name: Setup Triton
        uses: ./.github/actions/setup-triton
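      # All benchmark steps write their CSVs into $REPORTS; the directory is
      # uploaded as a single artifact at the end of the job.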
      - name: Create reports dir
        run: |
          mkdir reports
          echo "REPORTS=$PWD/reports" >> $GITHUB_ENV
      - name: Install benchmark dependencies
        id: install
        run: |
          pip install transformers pandas pytest
      - name: Install benchmarks
        id: install-benchmarks
        run: |
          cd benchmarks
          pip install .
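      # Each benchmark step below is gated the same way: it runs only if its
      # install step succeeded, the job was not cancelled, and the `benchmarks`
      # input is either empty (run everything) or lists the benchmark's key.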
      - name: Run sglang benchmark int8
        if: ${{ steps.install-benchmarks.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'sglang')) }}
        run: |
          source ./scripts/capture-hw-details.sh
          ./scripts/test-triton.sh --install-sglang --skip-pip-install --skip-pytorch-install
          cd benchmarks/third_party/sglang
          python scaled_mm_benchmark.py --reports $REPORTS
          python ../vllm/transform_results.py \
            $REPORTS/scaled_mm_benchmark.csv \
            $REPORTS/scaled-mm-int8-report.csv \
            --tag $TAG \
            --bgroup sglang \
            --benchmark scaled-mm-int8 \
            --param_cols="M,N,K"
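      # Same scaled_mm benchmark as above, switched to fp8 inputs via the FP8
      # environment variable; the transformed report gets its own file name.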
      - name: Run sglang benchmark with fp8
        if: ${{ steps.install-benchmarks.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'sglang')) }}
        run: |
          source ./scripts/capture-hw-details.sh
          cd benchmarks/third_party/sglang
          FP8="1" python scaled_mm_benchmark.py --reports $REPORTS
          python ../vllm/transform_results.py \
            $REPORTS/scaled_mm_benchmark.csv \
            $REPORTS/scaled-mm-fp8-report.csv \
            --tag $TAG \
            --bgroup sglang \
            --benchmark scaled-mm-fp8 \
            --param_cols="M,N,K"
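      # The vllm benchmark steps below are gated on this step's outcome
      # (steps.install-vllm) rather than on the generic install step.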
      - name: Install vllm
        id: install-vllm
        if: ${{ steps.install-benchmarks.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'vllm')) }}
        run: |
          source ./scripts/capture-hw-details.sh
          ./scripts/test-triton.sh --install-vllm --skip-pip-install --skip-pytorch-install
      - name: Run vllm unified attention bf16
        if: ${{ steps.install-vllm.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'vllm')) }}
        run: |
          source ./scripts/capture-hw-details.sh
          cd benchmarks/third_party/vllm
          python unified_attention_benchmark.py --reports $REPORTS
          python transform_results.py \
            $REPORTS/unified-attention-performance.csv \
            $REPORTS/unified-attention-report.csv \
            --tag $TAG \
            --bgroup "vllm" \
            --benchmark "unified-attn-bf16" \
            --param_cols "q_heads,k_heads,head_size,dtype,qdtype,seq_lens,sliding_window,soft_cap,num_blocks,block_size"
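      # vllm/tests is copied next to the benchmark script below, presumably
      # because batched_moe_benchmark.py imports helpers from vLLM's test suite.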
      - name: Run vllm batched moe bf16
        if: ${{ steps.install-vllm.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'vllm')) }}
        run: |
          source ./scripts/capture-hw-details.sh
          cp -r vllm/tests benchmarks/third_party/vllm/tests
          cd benchmarks/third_party/vllm
          python batched_moe_benchmark.py --reports $REPORTS
          python transform_results.py \
            $REPORTS/moe-gemm-performance.csv \
            $REPORTS/moe-gemm-report.csv \
            --tag $TAG \
            --bgroup vllm \
            --benchmark moe-bf16-benchmark \
            --param_cols="num_experts,max_tokens_per_expert,K,N"
      - name: Run vllm batched moe fp8
        if: ${{ steps.install-vllm.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'vllm')) }}
        run: |
          source ./scripts/capture-hw-details.sh
          cd benchmarks/third_party/vllm
          FP8="1" python batched_moe_benchmark.py --reports $REPORTS
          python transform_results.py \
            $REPORTS/moe-gemm-performance.csv \
            $REPORTS/moe-gemm-fp8-report.csv \
            --tag $TAG \
            --bgroup vllm \
            --benchmark moe-fp8-benchmark \
            --param_cols="num_experts,max_tokens_per_expert,K,N"
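      # Liger-Kernel writes all raw results to a single CSV
      # (all_benchmark_data.csv), which is copied and transformed into
      # $REPORTS/liger-report.csv even if some of its benchmarks fail.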
      - name: Run Liger-Kernel benchmarks
        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'liger')) }}
        run: |
          source ./scripts/capture-hw-details.sh
          ./scripts/test-triton.sh --install-liger --skip-pip-install --skip-pytorch-install
          # Remember the return code so results are still copied on failure
          RET_CODE=0
          bash benchmarks/third_party/liger/run_benchmarks.sh || RET_CODE=$?
          cp Liger-Kernel/benchmark/data/all_benchmark_data.csv $REPORTS/liger-raw.csv
          python benchmarks/third_party/liger/transform.py \
            $REPORTS/liger-raw.csv \
            $REPORTS/liger-report.csv \
            --tag $TAG
          # Propagate the captured return code at the end
          exit "$RET_CODE"
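      # End-to-end Llama 3.1 run: transformers is pinned to the commit listed in
      # transformers-commit.txt and patched for timing output, then the model is
      # run in compiled mode with profiling enabled and the log is transformed
      # into a CSV report.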
      - name: Run e2e Llama 3.1 flex attention performance benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'llama3-1')) }}
        run: |
          source ./scripts/capture-hw-details.sh
          git clone https://github.com/huggingface/transformers.git
          cd transformers
          git checkout $(<../benchmarks/third_party/e2e-flex_attention/transformers-commit.txt)
          git apply ../benchmarks/third_party/e2e-flex_attention/transformers-patch-for-timing.diff
          git submodule sync
          git submodule update --init --recursive
          pip install -v -e .
          cd ../benchmarks/third_party/e2e-flex_attention
          MODEL_NAME="meta-llama/Llama-3.1-8B"
          MAX_NEW_TOKENS=128
          INPUT_TOKENS=1024
          BATCH_SIZE=1
          python run_llm_inductor_greedy.py -m $MODEL_NAME --max-new-tokens $MAX_NEW_TOKENS --input-tokens $INPUT_TOKENS --num-warmup 2 --num-iter 7 --compile --profile | tee llm.compile.xpu.profile.log
          echo "LLM profiling log is stored in $PWD/llm.compile.xpu.profile.log"
          cp llm.compile.xpu.profile.log $REPORTS/llm.compile.xpu.profile.log
          python transform_results.py $REPORTS/llm.compile.xpu.profile.log $REPORTS/llm-triton-report.csv \
            --tag $TAG \
            --model "$MODEL_NAME" \
            --max-new-tokens $MAX_NEW_TOKENS \
            --batch-size $BATCH_SIZE
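      # Measures kernel-launch overhead and reuses the vllm transform_results.py
      # helper to produce the report CSV in the common format.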
      - name: Run launch microbenchmark tests
        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'launch_micro_benchmarks')) }}
        run: |
          source scripts/capture-hw-details.sh
          python python/test/microbenchmark/launch_overhead.py --reports $REPORTS
          python benchmarks/third_party/vllm/transform_results.py $REPORTS/launch_overhead_results.csv $REPORTS/launch_overhead-report.csv \
            --tag $TAG \
            --bgroup overhead \
            --benchmark launch-overhead \
            --param_cols="input_type"
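      # Upload whatever landed in $REPORTS even if individual benchmark steps
      # failed; only a cancelled job or a failed dependency install skips this.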
      - name: Upload benchmark reports
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        uses: actions/upload-artifact@v5
        with:
          name: benchmark-reports
          path: reports