cuda-perf #26
Workflow file for this run
name: cuda-perf

on:
  schedule:
    - cron: '0 8 * * *'  # 12am PST / 1am PDT (8am UTC)
  pull_request:
    paths:
      - .github/workflows/cuda-perf.yml
      - .ci/scripts/cuda_benchmark.py
      - .ci/scripts/export_model_artifact.sh
      - .ci/scripts/test_model_e2e.sh
  push:
    branches:
      - main
    paths:
      - .github/workflows/cuda-perf.yml
      - .ci/scripts/cuda_benchmark.py
      - .ci/scripts/export_model_artifact.sh
      - .ci/scripts/test_model_e2e.sh
  workflow_dispatch:
    inputs:
      models:
        description: Models to be benchmarked (comma-separated HuggingFace model IDs)
        required: false
        type: string
        default: openai/whisper-small
      quantizations:
        description: Quantization types (comma-separated)
        required: false
        type: string
        default: non-quantized
      num_runs:
        description: Number of benchmark runs per model
        required: false
        type: string
        default: "50"
      random_model:
        description: Run a random model instead of all models
        required: false
        type: boolean
        default: false
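
# Cancel in-progress runs that share the same concurrency group; the group key
# combines the workflow name, the PR number (or ref name), the commit SHA for
# branch events, and whether the run was manually dispatched or scheduled.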
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

jobs:
  set-parameters:
    runs-on: ubuntu-22.04
    outputs:
      benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }}
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: 'false'
      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - name: Set parameters
        id: set-parameters
        shell: bash
        env:
          # All available models and quantizations
          ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it'
          ALL_QUANTIZATIONS: 'non-quantized,quantized-int4-tile-packed,quantized-int4-weight-only'
          NUM_RUNS: ${{ inputs.num_runs || '50' }}
          RANDOM_MODEL: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' && 'true' || inputs.random_model || 'false' }}
        run: |
          set -eux

          MODELS="${{ inputs.models }}"
          QUANTIZATIONS="${{ inputs.quantizations }}"

          # For non-schedule events (PR, push, or manual trigger without inputs),
          # randomly select one model and one quantization
          if [ -z "$MODELS" ] && [ "${{ github.event_name }}" != "schedule" ]; then
            # Split all models into an array
            IFS=',' read -ra ALL_MODEL_ARRAY <<< "$ALL_MODELS"
            # Randomly select one model
            RANDOM_MODEL_INDEX=$((RANDOM % ${#ALL_MODEL_ARRAY[@]}))
            MODELS="${ALL_MODEL_ARRAY[$RANDOM_MODEL_INDEX]}"
            echo "Randomly selected model for PR/push: $MODELS"
          elif [ -z "$MODELS" ]; then
            # Schedule event: use all models
            MODELS="$ALL_MODELS"
          fi

          if [ -z "$QUANTIZATIONS" ] && [ "${{ github.event_name }}" != "schedule" ]; then
            # Split all quantizations into an array
            IFS=',' read -ra ALL_QUANT_ARRAY <<< "$ALL_QUANTIZATIONS"
            # Randomly select one quantization
            RANDOM_QUANT_INDEX=$((RANDOM % ${#ALL_QUANT_ARRAY[@]}))
            QUANTIZATIONS="${ALL_QUANT_ARRAY[$RANDOM_QUANT_INDEX]}"
            echo "Randomly selected quantization for PR/push: $QUANTIZATIONS"
          elif [ -z "$QUANTIZATIONS" ]; then
            # Schedule event: use all quantizations
            QUANTIZATIONS="$ALL_QUANTIZATIONS"
          fi

          # Split models and quantizations into arrays
          IFS=',' read -ra MODEL_ARRAY <<< "$MODELS"
          IFS=',' read -ra QUANT_ARRAY <<< "$QUANTIZATIONS"

          # If a random model is requested (main branch push), select one random
          # model from the already selected models
          if [ "$RANDOM_MODEL" = "true" ] && [ ${#MODEL_ARRAY[@]} -gt 1 ]; then
            RANDOM_INDEX=$((RANDOM % ${#MODEL_ARRAY[@]}))
            MODELS="${MODEL_ARRAY[$RANDOM_INDEX]}"
            MODEL_ARRAY=("$MODELS")
            echo "Random model selected for main branch push: $MODELS"
          fi

          # Generate benchmark configs
          CONFIGS='{"include":['
          FIRST=true
          for MODEL in "${MODEL_ARRAY[@]}"; do
            for QUANT in "${QUANT_ARRAY[@]}"; do
              if [ "$FIRST" = true ]; then
                FIRST=false
              else
                CONFIGS+=','
              fi
              # Sanitize the model name for use in artifact paths
              MODEL_SAFE=$(echo "$MODEL" | sed 's/\//_/g')
              CONFIGS+="{\"model\":\"$MODEL\",\"quant\":\"$QUANT\",\"model_safe\":\"$MODEL_SAFE\",\"num_runs\":\"$NUM_RUNS\"}"
            done
          done
          CONFIGS+=']}'

          echo "benchmark_configs=$CONFIGS" >> "$GITHUB_OUTPUT"
          echo "Generated benchmark configs:"
          echo "$CONFIGS" | python -m json.tool
  export-models:
    name: export-models
    needs: set-parameters
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    permissions:
      id-token: write
      contents: read
    secrets: inherit
    strategy:
      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
      fail-fast: false
    with:
      timeout: 90
      secrets-env: EXECUTORCH_HF_TOKEN
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      use-custom-docker-registry: false
      submodules: recursive
      upload-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        set -eux

        echo "::group::Setup ExecuTorch"
        ./install_executorch.sh
        echo "::endgroup::"

        echo "::group::Setup Huggingface"
        pip install -U "huggingface_hub[cli]<1.0" accelerate
        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
        echo "::endgroup::"

        echo "::group::Exporting model ${{ matrix.model }} with quantization ${{ matrix.quant }}"
        OUTPUT_DIR="model_artifacts"
        mkdir -p "$OUTPUT_DIR"
        bash .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" "$OUTPUT_DIR"

        # Move artifacts to RUNNER_ARTIFACT_DIR for upload
        mv "$OUTPUT_DIR"/* "${RUNNER_ARTIFACT_DIR}/"
        ls -lah "${RUNNER_ARTIFACT_DIR}"
        echo "::endgroup::"
  benchmark-cuda:
    name: benchmark-cuda
    needs:
      - set-parameters
      - export-models
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    permissions:
      id-token: write
      contents: read
    strategy:
      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
      fail-fast: false
    with:
      timeout: 90
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      use-custom-docker-registry: false
      submodules: recursive
      download-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}
      upload-artifact: results-${{ matrix.model_safe }}-${{ matrix.quant }}
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        set -eux

        echo "::group::Setup environment"
        ./install_requirements.sh
        pip list
        echo "::endgroup::"

        echo "::group::Prepare model artifacts"
        mkdir -p model_artifacts
        cp "${RUNNER_ARTIFACT_DIR}/model.pte" model_artifacts/model.pte
        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" model_artifacts/aoti_cuda_blob.ptd

        # Copy optional preprocessor, tokenizer, audio, and test input files if they exist
        for file in voxtral_preprocessor.pte whisper_preprocessor.pte tekken.json \
                    poem.wav output.wav \
                    tokenizer.json tokenizer_config.json special_tokens_map.json; do
          if [ -f "${RUNNER_ARTIFACT_DIR}/$file" ]; then
            cp "${RUNNER_ARTIFACT_DIR}/$file" model_artifacts/
          fi
        done
        ls -lah model_artifacts/
        echo "::endgroup::"

        echo "::group::Build runner"
        bash .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" model_artifacts
        echo "::endgroup::"

        echo "::group::Running benchmark for ${{ matrix.model }} (${{ matrix.quant }}) with ${{ matrix.num_runs }} runs"
        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH

        # Get GPU name using nvidia-smi
        GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)
        echo "Detected GPU: $GPU_NAME"
        # Get CUDA driver version
        CUDA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1)
        echo "CUDA Driver Version: $CUDA_DRIVER_VERSION"

        # Create results directory (separate from model artifacts)
        RESULTS_DIR="benchmark_results"
        mkdir -p "$RESULTS_DIR"

        # Determine model name and runner command based on model
        case "${{ matrix.model }}" in
          mistralai/Voxtral-Mini-3B-2507)
            RUNNER="cmake-out/examples/models/voxtral/voxtral_runner"
            PREPROCESSOR="model_artifacts/voxtral_preprocessor.pte"
            TOKENIZER="model_artifacts/tekken.json"
            AUDIO="model_artifacts/poem.wav"
            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path $TOKENIZER --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0"
            MODEL_NAME="voxtral_${{ matrix.quant }}"
            ;;
          openai/whisper-*)
            RUNNER="cmake-out/examples/models/whisper/whisper_runner"
            PREPROCESSOR="model_artifacts/whisper_preprocessor.pte"
            AUDIO="model_artifacts/output.wav"
            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0"
            MODEL_NAME=$(echo "${{ matrix.model }}" | sed 's/openai\///')_${{ matrix.quant }}
            ;;
          google/gemma-3-4b-it)
            RUNNER="cmake-out/examples/models/gemma3/gemma3_e2e_runner"
            IMAGE="docs/source/_static/img/et-logo.png"
            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --image_path $IMAGE --temperature 0"
            MODEL_NAME="gemma3_${{ matrix.quant }}"
            ;;
          *)
            echo "Error: Unsupported model '${{ matrix.model }}'"
            exit 1
            ;;
        esac
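
        # Example expansion (hypothetical whisper-small, non-quantized shard):
        #   RUNNER=cmake-out/examples/models/whisper/whisper_runner
        #   MODEL_NAME=whisper-small_non-quantized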

        # Run benchmark using cuda_benchmark.py
        python .ci/scripts/cuda_benchmark.py \
          --runner_command "$RUNNER_CMD" \
          --model_name "$MODEL_NAME" \
          --num_runs "${{ matrix.num_runs }}" \
          --output_json "$RESULTS_DIR/benchmark_results.json" \
          --output_v3 "$RESULTS_DIR/benchmark_results_v3.json" \
          --model "${{ matrix.model }}" \
          --quantization "${{ matrix.quant }}" \
          --git_sha "${{ github.sha }}" \
          --workflow_run_id "${{ github.run_id }}" \
          --workflow_run_url "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
          --gpu_name "$GPU_NAME" \
          --cuda_driver_version "$CUDA_DRIVER_VERSION"

        # Save additional metadata
        cat > "$RESULTS_DIR/metadata.json" <<EOF
        {
          "model": "${{ matrix.model }}",
          "quantization": "${{ matrix.quant }}",
          "num_runs": ${{ matrix.num_runs }},
          "runner": "$RUNNER",
          "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
          "git_sha": "${{ github.sha }}",
          "workflow_run_id": "${{ github.run_id }}",
          "workflow_run_url": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        }
        EOF

        # Only copy benchmark results to RUNNER_ARTIFACT_DIR for upload (not the entire model).
        # First, clean up the downloaded model artifacts from RUNNER_ARTIFACT_DIR
        rm -rf "${RUNNER_ARTIFACT_DIR}"/*
        # Then copy only the benchmark result JSON files
        cp "$RESULTS_DIR"/*.json "${RUNNER_ARTIFACT_DIR}/"
        echo "Benchmark results prepared for upload:"
        ls -lah "${RUNNER_ARTIFACT_DIR}"
        echo "::endgroup::"
  upload-benchmark-results:
    needs:
      - benchmark-cuda
    if: always()
    runs-on: ubuntu-22.04
    environment: upload-benchmark-results
    permissions:
      id-token: write
      contents: read
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: false
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - name: Download all benchmark results
        uses: actions/download-artifact@v4
        with:
          pattern: results-*
          path: all_results/
      - name: Process and display results
        shell: bash
        run: |
          set -eux

          echo "::group::Benchmark Results Summary"
          for RESULT_DIR in all_results/results-*/; do
            if [ -f "$RESULT_DIR/benchmark_results.json" ]; then
              echo ""
              echo "================================"
              echo "Results from: $(basename "$RESULT_DIR")"
              echo "================================"
              # Display benchmark results (mean performance)
              cat "$RESULT_DIR/benchmark_results.json" | python -m json.tool
              # Display metadata
              if [ -f "$RESULT_DIR/metadata.json" ]; then
                echo ""
                echo "--- Metadata ---"
                cat "$RESULT_DIR/metadata.json" | python -m json.tool
              fi
              echo ""
            fi
          done
          echo "::endgroup::"
      - name: Authenticate with AWS
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
          role-duration-seconds: 18000
          aws-region: us-east-1
      - name: Upload to S3
        shell: bash
        env:
          S3_BUCKET: gha-artifacts
          S3_PREFIX: executorch-cuda-perf/${{ github.run_id }}/${{ github.run_attempt }}
        run: |
          set -eux
          pip install awscli

          echo "Uploading benchmark results to S3..."
          aws s3 sync all_results/ "s3://${S3_BUCKET}/${S3_PREFIX}/" \
            --exclude "*" \
            --include "*.json" \
            --include "*.log"
          echo "Results uploaded to: s3://${S3_BUCKET}/${S3_PREFIX}/"
      - name: Prepare v3 results for dashboard upload
        shell: bash
        run: |
          set -eux
          echo "::group::Prepare v3 results"
          mkdir -p benchmark-results/v3

          # Collect all v3 results into a single directory
          for RESULT_DIR in all_results/results-*/; do
            if [ -f "$RESULT_DIR/benchmark_results_v3.json" ]; then
              # Generate unique filename based on directory name
              FILENAME=$(basename "$RESULT_DIR")
              cp "$RESULT_DIR/benchmark_results_v3.json" "benchmark-results/v3/${FILENAME}.json"
              echo "✓ Copied $FILENAME v3 results"
            fi
          done

          echo "V3 results prepared:"
          ls -lah benchmark-results/v3/
          echo "::endgroup::"
      - name: Upload benchmark results to dashboard
        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
        with:
          benchmark-results-dir: benchmark-results/v3
          dry-run: false
          schema-version: v3
          github-token: ${{ secrets.GITHUB_TOKEN }}