Add CUDA benchmark CI #4
Workflow file for this run

name: cuda-perf

on:
  schedule:
    - cron: '0 8 * * *'  # 8am UTC (12am PST / 1am PDT)
  pull_request:
    paths:
      - .github/workflows/cuda-perf.yml
      - .ci/scripts/cuda_benchmark.py
      - .ci/scripts/export_model_artifact.sh
      - .ci/scripts/test_model_e2e.sh
  push:
    branches:
      - main
    paths:
      - .github/workflows/cuda-perf.yml
      - .ci/scripts/cuda_benchmark.py
      - .ci/scripts/export_model_artifact.sh
      - .ci/scripts/test_model_e2e.sh
  workflow_dispatch:
    inputs:
      models:
        description: Models to be benchmarked (comma-separated HuggingFace model IDs)
        required: false
        type: string
        default: openai/whisper-small
      quantizations:
        description: Quantization types (comma-separated)
        required: false
        type: string
        default: non-quantized
      num_runs:
        description: Number of benchmark runs per model
        required: false
        type: string
        default: "50"
      random_model:
        description: Run a random model instead of all models
        required: false
        type: boolean
        default: false
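
# Group runs by PR number (or branch name), by commit SHA for branch pushes,
# and by event type, so scheduled and manual runs never cancel each other.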
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

jobs:
  set-parameters:
    runs-on: ubuntu-22.04
    outputs:
      benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }}
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: 'false'
      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - name: Set parameters
        id: set-parameters
        shell: bash
        env:
          # Default models for scheduled runs (all models) vs. PR/manual runs
          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it' || 'openai/whisper-small' }}
          CRON_DEFAULT_QUANTIZATIONS: ${{ github.event_name == 'schedule' && 'non-quantized,quantized-int4-tile-packed,quantized-int4-weight-only' || 'non-quantized' }}
          NUM_RUNS: ${{ inputs.num_runs || '50' }}
          RANDOM_MODEL: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' && 'true' || inputs.random_model || 'false' }}
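          # Pushes to main always benchmark one randomly selected model; manual
          # dispatch only does so when the random_model input is set.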
        run: |
          set -eux

          MODELS="${{ inputs.models }}"
          if [ -z "$MODELS" ]; then
            MODELS="$CRON_DEFAULT_MODELS"
          fi

          QUANTIZATIONS="${{ inputs.quantizations }}"
          if [ -z "$QUANTIZATIONS" ]; then
            QUANTIZATIONS="$CRON_DEFAULT_QUANTIZATIONS"
          fi

          # Split models and quantizations into arrays
          IFS=',' read -ra MODEL_ARRAY <<< "$MODELS"
          IFS=',' read -ra QUANT_ARRAY <<< "$QUANTIZATIONS"

          # If a random model is requested (push to main, i.e. a PR merge),
          # select one model at random
          if [ "$RANDOM_MODEL" = "true" ]; then
            RANDOM_INDEX=$((RANDOM % ${#MODEL_ARRAY[@]}))
            MODELS="${MODEL_ARRAY[$RANDOM_INDEX]}"
            MODEL_ARRAY=("$MODELS")
            echo "Random model selected for PR merge: $MODELS"
          fi

          # Generate the benchmark matrix as JSON
          CONFIGS='{"include":['
          FIRST=true
          for MODEL in "${MODEL_ARRAY[@]}"; do
            for QUANT in "${QUANT_ARRAY[@]}"; do
              if [ "$FIRST" = true ]; then
                FIRST=false
              else
                CONFIGS+=','
              fi
              # Sanitize the model name for use in artifact paths
              MODEL_SAFE=$(echo "$MODEL" | sed 's/\//_/g')
              CONFIGS+="{\"model\":\"$MODEL\",\"quant\":\"$QUANT\",\"model_safe\":\"$MODEL_SAFE\",\"num_runs\":\"$NUM_RUNS\"}"
            done
          done
          CONFIGS+=']}'
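
          # Example output for one model and one quantization:
          # {"include":[{"model":"openai/whisper-small","quant":"non-quantized","model_safe":"openai_whisper-small","num_runs":"50"}]}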
| echo "benchmark_configs=$CONFIGS" >> $GITHUB_OUTPUT | |
| echo "Generated benchmark configs:" | |
| echo "$CONFIGS" | python -m json.tool | |

  export-models:
    name: export-models
    needs: set-parameters
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    permissions:
      id-token: write
      contents: read
    secrets: inherit
    strategy:
      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
      fail-fast: false
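    # One export job per (model, quantization) pair from set-parameters.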
    with:
      timeout: 90
      secrets-env: EXECUTORCH_HF_TOKEN
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      use-custom-docker-registry: false
      submodules: recursive
      upload-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        set -eux

        echo "::group::Setup ExecuTorch"
        ./install_executorch.sh
        echo "::endgroup::"

        echo "::group::Setup Hugging Face"
        pip install -U "huggingface_hub[cli]<1.0" accelerate
        huggingface-cli login --token "$SECRET_EXECUTORCH_HF_TOKEN"
        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
        echo "::endgroup::"

        echo "::group::Exporting model ${{ matrix.model }} with quantization ${{ matrix.quant }}"
        OUTPUT_DIR="model_artifacts"
        mkdir -p "$OUTPUT_DIR"
        bash .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" "$OUTPUT_DIR"

        # Move artifacts to RUNNER_ARTIFACT_DIR so linux_job_v2 uploads them
        mv "$OUTPUT_DIR"/* "${RUNNER_ARTIFACT_DIR}/"
        ls -lah "${RUNNER_ARTIFACT_DIR}"
        echo "::endgroup::"

  benchmark-cuda:
    name: benchmark-cuda
    needs:
      - set-parameters
      - export-models
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    permissions:
      id-token: write
      contents: read
    strategy:
      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
      fail-fast: false
    with:
      timeout: 90
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      use-custom-docker-registry: false
      submodules: recursive
      download-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}
      upload-artifact: results-${{ matrix.model_safe }}-${{ matrix.quant }}
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        set -eux

        echo "::group::Setup environment"
        ./install_requirements.sh
        pip list
        echo "::endgroup::"

        echo "::group::Prepare model artifacts"
        mkdir -p model_artifacts
        cp "${RUNNER_ARTIFACT_DIR}/model.pte" model_artifacts/model.pte
        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" model_artifacts/aoti_cuda_blob.ptd

        # Copy model-specific files (preprocessors, tokenizer, sample audio) if they exist
        for file in voxtral_preprocessor.pte whisper_preprocessor.pte tekken.json poem.wav output.wav; do
          if [ -f "${RUNNER_ARTIFACT_DIR}/$file" ]; then
            cp "${RUNNER_ARTIFACT_DIR}/$file" model_artifacts/
          fi
        done

        # Copy tokenizer files
        for file in tokenizer.json tokenizer_config.json special_tokens_map.json; do
          if [ -f "${RUNNER_ARTIFACT_DIR}/$file" ]; then
            cp "${RUNNER_ARTIFACT_DIR}/$file" model_artifacts/
          fi
        done
        ls -lah model_artifacts/
        echo "::endgroup::"

        echo "::group::Build runner"
        bash .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" model_artifacts
        echo "::endgroup::"

        echo "::group::Running benchmark for ${{ matrix.model }} (${{ matrix.quant }}) with ${{ matrix.num_runs }} runs"
        # Guard against LD_LIBRARY_PATH being unset (this script runs with set -u)
        export LD_LIBRARY_PATH=/opt/conda/lib:${LD_LIBRARY_PATH:-}

        # Create results directory
        RESULTS_DIR="${RUNNER_ARTIFACT_DIR}"
        mkdir -p "$RESULTS_DIR"

        # Determine the runner command and benchmark name based on the model
        case "${{ matrix.model }}" in
          mistralai/Voxtral-Mini-3B-2507)
            RUNNER="cmake-out/examples/models/voxtral/voxtral_runner"
            PREPROCESSOR="model_artifacts/voxtral_preprocessor.pte"
            TOKENIZER="model_artifacts/tekken.json"
            AUDIO="model_artifacts/poem.wav"
            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path $TOKENIZER --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0"
            MODEL_NAME="voxtral_${{ matrix.quant }}"
            ;;
          openai/whisper-*)
            RUNNER="cmake-out/examples/models/whisper/whisper_runner"
            PREPROCESSOR="model_artifacts/whisper_preprocessor.pte"
            AUDIO="model_artifacts/output.wav"
            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0 --model_name whisper_large_v3"
            MODEL_NAME=$(echo "${{ matrix.model }}" | sed 's/openai\///')_${{ matrix.quant }}
            ;;
          google/gemma-3-4b-it)
            RUNNER="cmake-out/examples/models/gemma3/gemma3_e2e_runner"
            IMAGE="docs/source/_static/img/et-logo.png"
            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --image_path $IMAGE --temperature 0"
            MODEL_NAME="gemma3_${{ matrix.quant }}"
            ;;
          *)
            echo "Error: unsupported model '${{ matrix.model }}'"
            exit 1
            ;;
        esac
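
        # For openai/whisper-small (non-quantized), RUNNER_CMD expands to roughly:
        #   cmake-out/examples/models/whisper/whisper_runner \
        #     --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd \
        #     --tokenizer_path model_artifacts/ --audio_path model_artifacts/output.wav \
        #     --processor_path model_artifacts/whisper_preprocessor.pte --temperature 0 --model_name whisper_large_v3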

        # Run the benchmark via cuda_benchmark.py
        python .ci/scripts/cuda_benchmark.py \
          --runner_command "$RUNNER_CMD" \
          --model_name "$MODEL_NAME" \
          --num_runs "${{ matrix.num_runs }}" \
          --output_json "$RESULTS_DIR/benchmark_results.json" \
          --fix_gpu_clock

        # Save additional metadata alongside the results
        cat > "$RESULTS_DIR/metadata.json" <<EOF
        {
          "model": "${{ matrix.model }}",
          "quantization": "${{ matrix.quant }}",
          "num_runs": ${{ matrix.num_runs }},
          "runner": "$RUNNER",
          "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
          "git_sha": "${{ github.sha }}",
          "workflow_run_id": "${{ github.run_id }}",
          "workflow_run_url": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        }
        EOF
        echo "::endgroup::"

  upload-benchmark-results:
    needs:
      - benchmark-cuda
    if: always()
    runs-on: ubuntu-22.04
    environment: upload-benchmark-results
    permissions:
      id-token: write
      contents: read
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: false
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - name: Download all benchmark results
        uses: actions/download-artifact@v4
        with:
          pattern: results-*
          path: all_results/
      - name: Process and display results
        shell: bash
        run: |
          set -eux

          echo "::group::Benchmark Results Summary"
          for RESULT_DIR in all_results/results-*/; do
            if [ -f "$RESULT_DIR/benchmark_results.json" ]; then
              echo ""
              echo "================================"
              echo "Results from: $(basename "$RESULT_DIR")"
              echo "================================"

              # Display benchmark results (mean performance)
              python -m json.tool "$RESULT_DIR/benchmark_results.json"

              # Display metadata
              if [ -f "$RESULT_DIR/metadata.json" ]; then
                echo ""
                echo "--- Metadata ---"
                python -m json.tool "$RESULT_DIR/metadata.json"
              fi
              echo ""
            fi
          done
          echo "::endgroup::"
      - name: Authenticate with AWS
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
          role-duration-seconds: 18000
          aws-region: us-east-1
      - name: Upload to S3
        shell: bash
        env:
          S3_BUCKET: gha-artifacts
          S3_PREFIX: executorch-cuda-perf/${{ github.run_id }}/${{ github.run_attempt }}
        run: |
          set -eux
          pip install awscli

          echo "Uploading benchmark results to S3..."
          aws s3 sync all_results/ "s3://${S3_BUCKET}/${S3_PREFIX}/" \
            --exclude "*" \
            --include "*.json" \
            --include "*.log"
          echo "Results uploaded to: s3://${S3_BUCKET}/${S3_PREFIX}/"

          # TODO: Future enhancement - parse results and upload them to the
          # benchmark dashboard, similar to apple-perf.yml's
          # extract_benchmark_results.py approach. This would require:
          #   1. Parsing the benchmark output logs to extract metrics
          #   2. Converting them to the v3 benchmark results format
          #   3. Uploading via pytorch/test-infra/.github/actions/upload-benchmark-results@main