Add CUDA benchmark CI #4
Workflow file for this run

name: cuda-perf

on:
  schedule:
    - cron: '0 8 * * *'  # 8am UTC (12am PST / 1am PDT)
  pull_request:
    paths:
      - .github/workflows/cuda-perf.yml
      - .ci/scripts/cuda_benchmark.py
      - .ci/scripts/export_model_artifact.sh
      - .ci/scripts/test_model_e2e.sh
  push:
    branches:
      - main
    paths:
      - .github/workflows/cuda-perf.yml
      - .ci/scripts/cuda_benchmark.py
      - .ci/scripts/export_model_artifact.sh
      - .ci/scripts/test_model_e2e.sh
  workflow_dispatch:
    inputs:
      models:
        description: Models to be benchmarked (comma-separated HuggingFace model IDs)
        required: false
        type: string
        default: openai/whisper-small
      quantizations:
        description: Quantization types (comma-separated)
        required: false
        type: string
        default: non-quantized
      num_runs:
        description: Number of benchmark runs per model
        required: false
        type: string
        default: "50"
      random_model:
        description: Run a random model instead of all models
        required: false
        type: boolean
        default: false
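
# Group runs by PR number (or branch name), by commit SHA for branch pushes,
# and by event type, so scheduled and manual runs never cancel each other.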
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

jobs:
  set-parameters:
    runs-on: ubuntu-22.04
    outputs:
      benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }}
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: 'false'
      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - name: Set parameters
        id: set-parameters
        shell: bash
        env:
          # Default models for scheduled runs (all models) vs. PR/manual runs
          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it' || 'openai/whisper-small' }}
          CRON_DEFAULT_QUANTIZATIONS: ${{ github.event_name == 'schedule' && 'non-quantized,quantized-int4-tile-packed,quantized-int4-weight-only' || 'non-quantized' }}
          NUM_RUNS: ${{ inputs.num_runs || '50' }}
          RANDOM_MODEL: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' && 'true' || inputs.random_model || 'false' }}
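          # Pushes to main always benchmark one randomly selected model; manual
          # dispatch only does so when the random_model input is set.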
        run: |
          set -eux

          MODELS="${{ inputs.models }}"
          if [ -z "$MODELS" ]; then
            MODELS="$CRON_DEFAULT_MODELS"
          fi

          QUANTIZATIONS="${{ inputs.quantizations }}"
          if [ -z "$QUANTIZATIONS" ]; then
            QUANTIZATIONS="$CRON_DEFAULT_QUANTIZATIONS"
          fi

          # Split models and quantizations into arrays
          IFS=',' read -ra MODEL_ARRAY <<< "$MODELS"
          IFS=',' read -ra QUANT_ARRAY <<< "$QUANTIZATIONS"

          # If a random model is requested (push to main, i.e. a PR merge),
          # select one model at random
          if [ "$RANDOM_MODEL" = "true" ]; then
            RANDOM_INDEX=$((RANDOM % ${#MODEL_ARRAY[@]}))
            MODELS="${MODEL_ARRAY[$RANDOM_INDEX]}"
            MODEL_ARRAY=("$MODELS")
            echo "Random model selected for PR merge: $MODELS"
          fi

          # Generate the benchmark matrix as JSON
          CONFIGS='{"include":['
          FIRST=true
          for MODEL in "${MODEL_ARRAY[@]}"; do
            for QUANT in "${QUANT_ARRAY[@]}"; do
              if [ "$FIRST" = true ]; then
                FIRST=false
              else
                CONFIGS+=','
              fi
              # Sanitize the model name for use in artifact paths
              MODEL_SAFE=$(echo "$MODEL" | sed 's/\//_/g')
              CONFIGS+="{\"model\":\"$MODEL\",\"quant\":\"$QUANT\",\"model_safe\":\"$MODEL_SAFE\",\"num_runs\":\"$NUM_RUNS\"}"
            done
          done
          CONFIGS+=']}'
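
          # Example output for one model and one quantization:
          # {"include":[{"model":"openai/whisper-small","quant":"non-quantized","model_safe":"openai_whisper-small","num_runs":"50"}]}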
| echo "benchmark_configs=$CONFIGS" >> $GITHUB_OUTPUT | |
| echo "Generated benchmark configs:" | |
| echo "$CONFIGS" | python -m json.tool | |

  export-models:
    name: export-models
    needs: set-parameters
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    permissions:
      id-token: write
      contents: read
    secrets: inherit
    strategy:
      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
      fail-fast: false
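    # One export job per (model, quantization) pair from set-parameters.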
    with:
      timeout: 90
      secrets-env: EXECUTORCH_HF_TOKEN
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      use-custom-docker-registry: false
      submodules: recursive
      upload-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        set -eux

        echo "::group::Setup ExecuTorch"
        ./install_executorch.sh
        echo "::endgroup::"

        echo "::group::Setup Hugging Face"
        pip install -U "huggingface_hub[cli]<1.0" accelerate
        huggingface-cli login --token "$SECRET_EXECUTORCH_HF_TOKEN"
        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
        echo "::endgroup::"

        echo "::group::Exporting model ${{ matrix.model }} with quantization ${{ matrix.quant }}"
        OUTPUT_DIR="model_artifacts"
        mkdir -p "$OUTPUT_DIR"
        bash .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" "$OUTPUT_DIR"

        # Move artifacts to RUNNER_ARTIFACT_DIR so linux_job_v2 uploads them
        mv "$OUTPUT_DIR"/* "${RUNNER_ARTIFACT_DIR}/"
        ls -lah "${RUNNER_ARTIFACT_DIR}"
        echo "::endgroup::"

  benchmark-cuda:
    name: benchmark-cuda
    needs:
      - set-parameters
      - export-models
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    permissions:
      id-token: write
      contents: read
    strategy:
      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
      fail-fast: false
    with:
      timeout: 90
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      use-custom-docker-registry: false
      submodules: recursive
      download-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}
      upload-artifact: results-${{ matrix.model_safe }}-${{ matrix.quant }}
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        set -eux

        echo "::group::Setup environment"
        ./install_requirements.sh
        pip list
        echo "::endgroup::"

        echo "::group::Prepare model artifacts"
        mkdir -p model_artifacts
        cp "${RUNNER_ARTIFACT_DIR}/model.pte" model_artifacts/model.pte
        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" model_artifacts/aoti_cuda_blob.ptd

        # Copy model-specific files (preprocessors, tokenizer, sample audio) if they exist
        for file in voxtral_preprocessor.pte whisper_preprocessor.pte tekken.json poem.wav output.wav; do
          if [ -f "${RUNNER_ARTIFACT_DIR}/$file" ]; then
            cp "${RUNNER_ARTIFACT_DIR}/$file" model_artifacts/
          fi
        done

        # Copy tokenizer files
        for file in tokenizer.json tokenizer_config.json special_tokens_map.json; do
          if [ -f "${RUNNER_ARTIFACT_DIR}/$file" ]; then
            cp "${RUNNER_ARTIFACT_DIR}/$file" model_artifacts/
          fi
        done
        ls -lah model_artifacts/
        echo "::endgroup::"

        echo "::group::Build runner"
        bash .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" model_artifacts
        echo "::endgroup::"

        echo "::group::Running benchmark for ${{ matrix.model }} (${{ matrix.quant }}) with ${{ matrix.num_runs }} runs"
        # Guard against LD_LIBRARY_PATH being unset (this script runs with set -u)
        export LD_LIBRARY_PATH=/opt/conda/lib:${LD_LIBRARY_PATH:-}

        # Create results directory
        RESULTS_DIR="${RUNNER_ARTIFACT_DIR}"
        mkdir -p "$RESULTS_DIR"

        # Determine the runner command and benchmark name based on the model
        case "${{ matrix.model }}" in
          mistralai/Voxtral-Mini-3B-2507)
            RUNNER="cmake-out/examples/models/voxtral/voxtral_runner"
            PREPROCESSOR="model_artifacts/voxtral_preprocessor.pte"
            TOKENIZER="model_artifacts/tekken.json"
            AUDIO="model_artifacts/poem.wav"
            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path $TOKENIZER --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0"
            MODEL_NAME="voxtral_${{ matrix.quant }}"
            ;;
          openai/whisper-*)
            RUNNER="cmake-out/examples/models/whisper/whisper_runner"
            PREPROCESSOR="model_artifacts/whisper_preprocessor.pte"
            AUDIO="model_artifacts/output.wav"
            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0 --model_name whisper_large_v3"
            MODEL_NAME=$(echo "${{ matrix.model }}" | sed 's/openai\///')_${{ matrix.quant }}
            ;;
          google/gemma-3-4b-it)
            RUNNER="cmake-out/examples/models/gemma3/gemma3_e2e_runner"
            IMAGE="docs/source/_static/img/et-logo.png"
            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --image_path $IMAGE --temperature 0"
            MODEL_NAME="gemma3_${{ matrix.quant }}"
            ;;
          *)
            echo "Error: unsupported model '${{ matrix.model }}'"
            exit 1
            ;;
        esac
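
        # For openai/whisper-small (non-quantized), RUNNER_CMD expands to roughly:
        #   cmake-out/examples/models/whisper/whisper_runner \
        #     --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd \
        #     --tokenizer_path model_artifacts/ --audio_path model_artifacts/output.wav \
        #     --processor_path model_artifacts/whisper_preprocessor.pte --temperature 0 --model_name whisper_large_v3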

        # Run the benchmark via cuda_benchmark.py
        python .ci/scripts/cuda_benchmark.py \
          --runner_command "$RUNNER_CMD" \
          --model_name "$MODEL_NAME" \
          --num_runs "${{ matrix.num_runs }}" \
          --output_json "$RESULTS_DIR/benchmark_results.json" \
          --fix_gpu_clock

        # Save additional metadata alongside the results
        cat > "$RESULTS_DIR/metadata.json" <<EOF
        {
          "model": "${{ matrix.model }}",
          "quantization": "${{ matrix.quant }}",
          "num_runs": ${{ matrix.num_runs }},
          "runner": "$RUNNER",
          "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
          "git_sha": "${{ github.sha }}",
          "workflow_run_id": "${{ github.run_id }}",
          "workflow_run_url": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        }
        EOF
        echo "::endgroup::"

  upload-benchmark-results:
    needs:
      - benchmark-cuda
    if: always()
    runs-on: ubuntu-22.04
    environment: upload-benchmark-results
    permissions:
      id-token: write
      contents: read
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: false
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - name: Download all benchmark results
        uses: actions/download-artifact@v4
        with:
          pattern: results-*
          path: all_results/
      - name: Process and display results
        shell: bash
        run: |
          set -eux

          echo "::group::Benchmark Results Summary"
          for RESULT_DIR in all_results/results-*/; do
            if [ -f "$RESULT_DIR/benchmark_results.json" ]; then
              echo ""
              echo "================================"
              echo "Results from: $(basename "$RESULT_DIR")"
              echo "================================"

              # Display benchmark results (mean performance)
              python -m json.tool "$RESULT_DIR/benchmark_results.json"

              # Display metadata
              if [ -f "$RESULT_DIR/metadata.json" ]; then
                echo ""
                echo "--- Metadata ---"
                python -m json.tool "$RESULT_DIR/metadata.json"
              fi
              echo ""
            fi
          done
          echo "::endgroup::"
      - name: Authenticate with AWS
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
          role-duration-seconds: 18000
          aws-region: us-east-1
      - name: Upload to S3
        shell: bash
        env:
          S3_BUCKET: gha-artifacts
          S3_PREFIX: executorch-cuda-perf/${{ github.run_id }}/${{ github.run_attempt }}
        run: |
          set -eux
          pip install awscli

          echo "Uploading benchmark results to S3..."
          aws s3 sync all_results/ "s3://${S3_BUCKET}/${S3_PREFIX}/" \
            --exclude "*" \
            --include "*.json" \
            --include "*.log"
          echo "Results uploaded to: s3://${S3_BUCKET}/${S3_PREFIX}/"

          # TODO: Future enhancement - parse results and upload them to the
          # benchmark dashboard, similar to apple-perf.yml's
          # extract_benchmark_results.py approach. This would require:
          #   1. Parsing the benchmark output logs to extract metrics
          #   2. Converting them to the v3 benchmark results format
          #   3. Uploading via pytorch/test-infra/.github/actions/upload-benchmark-results@main