name: cuda-perf
on:
schedule:
- cron: "0 8 * * *" # 8am UTC (12am PST / 1am PDT)
pull_request:
paths:
- .github/workflows/cuda-perf.yml
- .ci/scripts/cuda_benchmark.py
- .ci/scripts/export_model_artifact.sh
- .ci/scripts/test_model_e2e.sh
push:
branches:
- main
paths:
- .github/workflows/cuda-perf.yml
- .ci/scripts/cuda_benchmark.py
- .ci/scripts/export_model_artifact.sh
- .ci/scripts/test_model_e2e.sh
workflow_dispatch:
inputs:
models:
description: Models to be benchmarked (comma-separated HuggingFace model IDs)
required: false
type: string
default: openai/whisper-small
quantizations:
description: Quantization types (comma-separated)
required: false
type: string
default: non-quantized
num_runs:
description: Number of benchmark runs per model
required: false
type: string
default: "50"
random_model:
description: Run a random model instead of all models
required: false
type: boolean
default: false
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
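# The group key scopes cancellation by PR number or branch/sha and by triggering event, so
# scheduled and manually dispatched runs do not cancel (or get cancelled by) push and PR runs.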
jobs:
set-parameters:
runs-on: ubuntu-22.04
outputs:
benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }}
steps:
- uses: actions/checkout@v3
with:
submodules: 'false'
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Set parameters
id: set-parameters
shell: bash
env:
# All available models and quantizations
ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it'
ALL_QUANTIZATIONS: 'non-quantized,quantized-int4-tile-packed,quantized-int4-weight-only'
NUM_RUNS: ${{ inputs.num_runs || '50' }}
RANDOM_MODEL: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' && 'true' || inputs.random_model || 'false' }}
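# The `&& ... ||` chain above acts as a ternary: 'true' for pushes to main, otherwise the
# workflow_dispatch input (when set), otherwise 'false'.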
run: |
set -eux
MODELS="${{ inputs.models }}"
QUANTIZATIONS="${{ inputs.quantizations }}"
# For non-schedule events (PR, manual trigger without inputs), randomly select one model and one quantization
if [ -z "$MODELS" ] && [ "${{ github.event_name }}" != "schedule" ]; then
# Split all models into array
IFS=',' read -ra ALL_MODEL_ARRAY <<< "$ALL_MODELS"
# Randomly select one model
RANDOM_MODEL_INDEX=$((RANDOM % ${#ALL_MODEL_ARRAY[@]}))
MODELS="${ALL_MODEL_ARRAY[$RANDOM_MODEL_INDEX]}"
echo "Randomly selected model for PR/push: $MODELS"
elif [ -z "$MODELS" ]; then
# Schedule event: use all models
MODELS="$ALL_MODELS"
fi
if [ -z "$QUANTIZATIONS" ] && [ "${{ github.event_name }}" != "schedule" ]; then
# Split all quantizations into array
IFS=',' read -ra ALL_QUANT_ARRAY <<< "$ALL_QUANTIZATIONS"
# Randomly select one quantization
RANDOM_QUANT_INDEX=$((RANDOM % ${#ALL_QUANT_ARRAY[@]}))
QUANTIZATIONS="${ALL_QUANT_ARRAY[$RANDOM_QUANT_INDEX]}"
echo "Randomly selected quantization for PR/push: $QUANTIZATIONS"
elif [ -z "$QUANTIZATIONS" ]; then
# Schedule event: use all quantizations
QUANTIZATIONS="$ALL_QUANTIZATIONS"
fi
# Split models and quantizations into arrays
IFS=',' read -ra MODEL_ARRAY <<< "$MODELS"
IFS=',' read -ra QUANT_ARRAY <<< "$QUANTIZATIONS"
# If random model is requested (for main branch push), select one random model from the already selected models
if [ "$RANDOM_MODEL" = "true" ] && [ ${#MODEL_ARRAY[@]} -gt 1 ]; then
RANDOM_INDEX=$((RANDOM % ${#MODEL_ARRAY[@]}))
MODELS="${MODEL_ARRAY[$RANDOM_INDEX]}"
MODEL_ARRAY=("$MODELS")
echo "Random model selected for main branch push: $MODELS"
fi
# Generate benchmark configs
CONFIGS='{"include":['
FIRST=true
for MODEL in "${MODEL_ARRAY[@]}"; do
for QUANT in "${QUANT_ARRAY[@]}"; do
if [ "$FIRST" = true ]; then
FIRST=false
else
CONFIGS+=','
fi
# Sanitize model name for use in artifact paths
MODEL_SAFE=$(echo "$MODEL" | sed 's/\//_/g')
CONFIGS+="{\"model\":\"$MODEL\",\"quant\":\"$QUANT\",\"model_safe\":\"$MODEL_SAFE\",\"num_runs\":\"$NUM_RUNS\"}"
done
done
CONFIGS+=']}'
echo "benchmark_configs=$CONFIGS" >> $GITHUB_OUTPUT
echo "Generated benchmark configs:"
echo "$CONFIGS" | python -m json.tool
export-models:
name: export-models
needs: set-parameters
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
secrets: inherit
strategy:
matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
fail-fast: false
with:
timeout: 90
secrets-env: EXECUTORCH_HF_TOKEN
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: "12.6"
use-custom-docker-registry: false
submodules: recursive
upload-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
set -eux
echo "::group::Setup ExecuTorch"
./install_executorch.sh
echo "::endgroup::"
echo "::group::Setup Huggingface"
pip install -U "huggingface_hub[cli]<1.0" accelerate
huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
echo "::endgroup::"
echo "::group::Exporting model ${{ matrix.model }} with quantization ${{ matrix.quant }}"
OUTPUT_DIR="model_artifacts"
mkdir -p "$OUTPUT_DIR"
bash .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" "$OUTPUT_DIR"
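# The export script is expected to leave at least model.pte and aoti_cuda_blob.ptd in
# $OUTPUT_DIR, plus model-specific extras (preprocessor .pte, tokenizer files, sample audio)
# that the benchmark-cuda job below copies out of the downloaded artifact.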
# Move artifacts to RUNNER_ARTIFACT_DIR for upload
mv "$OUTPUT_DIR"/* "${RUNNER_ARTIFACT_DIR}/"
ls -lah "${RUNNER_ARTIFACT_DIR}"
echo "::endgroup::"
benchmark-cuda:
name: benchmark-cuda
needs:
- set-parameters
- export-models
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
strategy:
matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
fail-fast: false
with:
timeout: 90
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: "12.6"
use-custom-docker-registry: false
submodules: recursive
download-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}
upload-artifact: results-${{ matrix.model_safe }}-${{ matrix.quant }}
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
set -eux
echo "::group::Setup environment"
./install_requirements.sh
pip list
echo "::endgroup::"
echo "::group::Prepare model artifacts"
mkdir -p model_artifacts
cp "${RUNNER_ARTIFACT_DIR}/model.pte" model_artifacts/model.pte
cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" model_artifacts/aoti_cuda_blob.ptd
# Copy additional files if they exist
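# (voxtral: voxtral_preprocessor.pte, tekken.json, poem.wav; whisper: whisper_preprocessor.pte,
# output.wav; see the per-model runner commands below)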
if [ -f "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" ]; then
cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" model_artifacts/
fi
if [ -f "${RUNNER_ARTIFACT_DIR}/whisper_preprocessor.pte" ]; then
cp "${RUNNER_ARTIFACT_DIR}/whisper_preprocessor.pte" model_artifacts/
fi
if [ -f "${RUNNER_ARTIFACT_DIR}/tekken.json" ]; then
cp "${RUNNER_ARTIFACT_DIR}/tekken.json" model_artifacts/
fi
if [ -f "${RUNNER_ARTIFACT_DIR}/poem.wav" ]; then
cp "${RUNNER_ARTIFACT_DIR}/poem.wav" model_artifacts/
fi
if [ -f "${RUNNER_ARTIFACT_DIR}/output.wav" ]; then
cp "${RUNNER_ARTIFACT_DIR}/output.wav" model_artifacts/
fi
# Copy tokenizer files
for file in tokenizer.json tokenizer_config.json special_tokens_map.json; do
if [ -f "${RUNNER_ARTIFACT_DIR}/$file" ]; then
cp "${RUNNER_ARTIFACT_DIR}/$file" model_artifacts/
fi
done
ls -lah model_artifacts/
echo "::endgroup::"
echo "::group::Build runner"
bash .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" model_artifacts
echo "::endgroup::"
echo "::group::Running benchmark for ${{ matrix.model }} (${{ matrix.quant }}) with ${{ matrix.num_runs }} runs"
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
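# (presumably so the runner binary resolves shared libraries installed under /opt/conda at runtime)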
# Get GPU name using nvidia-smi
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)
echo "Detected GPU: $GPU_NAME"
# Get CUDA driver version
CUDA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1)
echo "CUDA Driver Version: $CUDA_DRIVER_VERSION"
# Create results directory (separate from model artifacts)
RESULTS_DIR="benchmark_results"
mkdir -p "$RESULTS_DIR"
# Determine model name and runner command based on model
case "${{ matrix.model }}" in
mistralai/Voxtral-Mini-3B-2507)
RUNNER="cmake-out/examples/models/voxtral/voxtral_runner"
PREPROCESSOR="model_artifacts/voxtral_preprocessor.pte"
TOKENIZER="model_artifacts/tekken.json"
AUDIO="model_artifacts/poem.wav"
RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path $TOKENIZER --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0"
MODEL_NAME="voxtral_${{ matrix.quant }}"
;;
openai/whisper-*)
RUNNER="cmake-out/examples/models/whisper/whisper_runner"
PREPROCESSOR="model_artifacts/whisper_preprocessor.pte"
AUDIO="model_artifacts/output.wav"
RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0"
MODEL_NAME=$(echo "${{ matrix.model }}" | sed 's/openai\///')_${{ matrix.quant }}
;;
google/gemma-3-4b-it)
RUNNER="cmake-out/examples/models/gemma3/gemma3_e2e_runner"
IMAGE="docs/source/_static/img/et-logo.png"
RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --image_path $IMAGE --temperature 0"
MODEL_NAME="gemma3_${{ matrix.quant }}"
;;
*)
echo "Error: Unsupported model '${{ matrix.model }}'"
exit 1
;;
esac
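# Example resolved command for openai/whisper-small (non-quantized):
#   cmake-out/examples/models/whisper/whisper_runner \
#     --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd \
#     --tokenizer_path model_artifacts/ --audio_path model_artifacts/output.wav \
#     --processor_path model_artifacts/whisper_preprocessor.pte --temperature 0
#   with MODEL_NAME="whisper-small_non-quantized"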
# Run benchmark using cuda_benchmark.py
python .ci/scripts/cuda_benchmark.py \
--runner_command "$RUNNER_CMD" \
--model_name "$MODEL_NAME" \
--num_runs "${{ matrix.num_runs }}" \
--output_json "$RESULTS_DIR/benchmark_results.json" \
--output_v3 "$RESULTS_DIR/benchmark_results_v3.json" \
--model "${{ matrix.model }}" \
--quantization "${{ matrix.quant }}" \
--git_sha "${{ github.sha }}" \
--workflow_run_id "${{ github.run_id }}" \
--workflow_run_url "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
--gpu_name "$GPU_NAME" \
--cuda_driver_version "$CUDA_DRIVER_VERSION"
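# Per the --output_json/--output_v3 flags, cuda_benchmark.py is expected to write
# benchmark_results.json (the summary printed by the upload job) and
# benchmark_results_v3.json (dashboard schema-v3 payload) into $RESULTS_DIR.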
# Save additional metadata
cat > "$RESULTS_DIR/metadata.json" <<EOF
{
"model": "${{ matrix.model }}",
"quantization": "${{ matrix.quant }}",
"num_runs": ${{ matrix.num_runs }},
"runner": "$RUNNER",
"timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
"git_sha": "${{ github.sha }}",
"workflow_run_id": "${{ github.run_id }}",
"workflow_run_url": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
}
EOF
# Only copy benchmark results to RUNNER_ARTIFACT_DIR for upload (not the entire model)
# First, clean up the downloaded model artifacts from RUNNER_ARTIFACT_DIR
rm -rf "${RUNNER_ARTIFACT_DIR}"/*
# Then copy only the benchmark result JSON files
cp "$RESULTS_DIR"/*.json "${RUNNER_ARTIFACT_DIR}/"
echo "Benchmark results prepared for upload:"
ls -lah "${RUNNER_ARTIFACT_DIR}"
echo "::endgroup::"
upload-benchmark-results:
needs:
- benchmark-cuda
if: always()
runs-on: ubuntu-22.04
environment: upload-benchmark-results
permissions:
id-token: write
contents: read
steps:
- uses: actions/checkout@v3
with:
submodules: false
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Download all benchmark results
uses: actions/download-artifact@v4
with:
pattern: results-*
path: all_results/
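# With download-artifact@v4 and a pattern, each artifact unpacks into its own directory,
# e.g. all_results/results-openai_whisper-small-non-quantized/ containing
# benchmark_results.json, benchmark_results_v3.json, and metadata.json.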
- name: Process and display results
shell: bash
run: |
set -eux
echo "::group::Benchmark Results Summary"
for RESULT_DIR in all_results/results-*/; do
if [ -f "$RESULT_DIR/benchmark_results.json" ]; then
echo ""
echo "================================"
echo "Results from: $(basename "$RESULT_DIR")"
echo "================================"
# Display benchmark results (mean performance)
cat "$RESULT_DIR/benchmark_results.json" | python -m json.tool
# Display metadata
if [ -f "$RESULT_DIR/metadata.json" ]; then
echo ""
echo "--- Metadata ---"
cat "$RESULT_DIR/metadata.json" | python -m json.tool
fi
echo ""
fi
done
echo "::endgroup::"
- name: Authenticate with AWS
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
role-duration-seconds: 18000
aws-region: us-east-1
- name: Upload to S3
shell: bash
env:
S3_BUCKET: gha-artifacts
S3_PREFIX: executorch-cuda-perf/${{ github.run_id }}/${{ github.run_attempt }}
run: |
set -eux
pip install awscli
echo "Uploading benchmark results to S3..."
aws s3 sync all_results/ "s3://${S3_BUCKET}/${S3_PREFIX}/" \
--exclude "*" \
--include "*.json" \
--include "*.log"
echo "Results uploaded to: s3://${S3_BUCKET}/${S3_PREFIX}/"
- name: Prepare v3 results for dashboard upload
shell: bash
run: |
set -eux
echo "::group::Prepare v3 results"
mkdir -p benchmark-results/v3
# Collect all v3 results into a single directory
for RESULT_DIR in all_results/results-*/; do
if [ -f "$RESULT_DIR/benchmark_results_v3.json" ]; then
# Generate unique filename based on directory name
FILENAME=$(basename "$RESULT_DIR")
cp "$RESULT_DIR/benchmark_results_v3.json" "benchmark-results/v3/${FILENAME}.json"
echo "✓ Copied $FILENAME v3 results"
fi
done
echo "V3 results prepared:"
ls -lah benchmark-results/v3/
echo "::endgroup::"
- name: Upload benchmark results to dashboard
uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
with:
benchmark-results-dir: benchmark-results/v3
dry-run: false
schema-version: v3
github-token: ${{ secrets.GITHUB_TOKEN }}