cuda-perf #26
Workflow file for this run
name: cuda-perf

on:
  schedule:
    - cron: '0 8 * * *'  # 12am PST / 1am PDT (8am UTC)
  pull_request:
    paths:
      - .github/workflows/cuda-perf.yml
      - .ci/scripts/cuda_benchmark.py
      - .ci/scripts/export_model_artifact.sh
      - .ci/scripts/test_model_e2e.sh
  push:
    branches:
      - main
    paths:
      - .github/workflows/cuda-perf.yml
      - .ci/scripts/cuda_benchmark.py
      - .ci/scripts/export_model_artifact.sh
      - .ci/scripts/test_model_e2e.sh
  workflow_dispatch:
    inputs:
      models:
        description: Models to be benchmarked (comma-separated HuggingFace model IDs)
        required: false
        type: string
        default: openai/whisper-small
      quantizations:
        description: Quantization types (comma-separated)
        required: false
        type: string
        default: non-quantized
      num_runs:
        description: Number of benchmark runs per model
        required: false
        type: string
        default: "50"
      random_model:
        description: Run a random model instead of all models
        required: false
        type: boolean
        default: false
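
# Cancel in-progress runs that share the same concurrency group; the group key
# combines the workflow name, the PR number (or ref name), the commit SHA for
# branch events, and whether the run was manually dispatched or scheduled.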
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

jobs:
  set-parameters:
    runs-on: ubuntu-22.04
    outputs:
      benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }}
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: 'false'
      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - name: Set parameters
        id: set-parameters
        shell: bash
        env:
          # All available models and quantizations
          ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it'
          ALL_QUANTIZATIONS: 'non-quantized,quantized-int4-tile-packed,quantized-int4-weight-only'
          NUM_RUNS: ${{ inputs.num_runs || '50' }}
          RANDOM_MODEL: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' && 'true' || inputs.random_model || 'false' }}
        run: |
          set -eux

          MODELS="${{ inputs.models }}"
          QUANTIZATIONS="${{ inputs.quantizations }}"

          # For non-schedule events (PR, push, or manual trigger without inputs),
          # randomly select one model and one quantization
          if [ -z "$MODELS" ] && [ "${{ github.event_name }}" != "schedule" ]; then
            # Split all models into an array
            IFS=',' read -ra ALL_MODEL_ARRAY <<< "$ALL_MODELS"
            # Randomly select one model
            RANDOM_MODEL_INDEX=$((RANDOM % ${#ALL_MODEL_ARRAY[@]}))
            MODELS="${ALL_MODEL_ARRAY[$RANDOM_MODEL_INDEX]}"
            echo "Randomly selected model for PR/push: $MODELS"
          elif [ -z "$MODELS" ]; then
            # Schedule event: use all models
            MODELS="$ALL_MODELS"
          fi

          if [ -z "$QUANTIZATIONS" ] && [ "${{ github.event_name }}" != "schedule" ]; then
            # Split all quantizations into an array
            IFS=',' read -ra ALL_QUANT_ARRAY <<< "$ALL_QUANTIZATIONS"
            # Randomly select one quantization
            RANDOM_QUANT_INDEX=$((RANDOM % ${#ALL_QUANT_ARRAY[@]}))
            QUANTIZATIONS="${ALL_QUANT_ARRAY[$RANDOM_QUANT_INDEX]}"
            echo "Randomly selected quantization for PR/push: $QUANTIZATIONS"
          elif [ -z "$QUANTIZATIONS" ]; then
            # Schedule event: use all quantizations
            QUANTIZATIONS="$ALL_QUANTIZATIONS"
          fi

          # Split models and quantizations into arrays
          IFS=',' read -ra MODEL_ARRAY <<< "$MODELS"
          IFS=',' read -ra QUANT_ARRAY <<< "$QUANTIZATIONS"

          # If a random model is requested (main branch push), select one random
          # model from the already selected models
          if [ "$RANDOM_MODEL" = "true" ] && [ ${#MODEL_ARRAY[@]} -gt 1 ]; then
            RANDOM_INDEX=$((RANDOM % ${#MODEL_ARRAY[@]}))
            MODELS="${MODEL_ARRAY[$RANDOM_INDEX]}"
            MODEL_ARRAY=("$MODELS")
            echo "Random model selected for main branch push: $MODELS"
          fi

          # Generate benchmark configs
          CONFIGS='{"include":['
          FIRST=true
          for MODEL in "${MODEL_ARRAY[@]}"; do
            for QUANT in "${QUANT_ARRAY[@]}"; do
              if [ "$FIRST" = true ]; then
                FIRST=false
              else
                CONFIGS+=','
              fi
              # Sanitize the model name for use in artifact paths
              MODEL_SAFE=$(echo "$MODEL" | sed 's/\//_/g')
              CONFIGS+="{\"model\":\"$MODEL\",\"quant\":\"$QUANT\",\"model_safe\":\"$MODEL_SAFE\",\"num_runs\":\"$NUM_RUNS\"}"
            done
          done
          CONFIGS+=']}'

          echo "benchmark_configs=$CONFIGS" >> "$GITHUB_OUTPUT"
          echo "Generated benchmark configs:"
          echo "$CONFIGS" | python -m json.tool
  export-models:
    name: export-models
    needs: set-parameters
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    permissions:
      id-token: write
      contents: read
    secrets: inherit
    strategy:
      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
      fail-fast: false
    with:
      timeout: 90
      secrets-env: EXECUTORCH_HF_TOKEN
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      use-custom-docker-registry: false
      submodules: recursive
      upload-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        set -eux

        echo "::group::Setup ExecuTorch"
        ./install_executorch.sh
        echo "::endgroup::"

        echo "::group::Setup Huggingface"
        pip install -U "huggingface_hub[cli]<1.0" accelerate
        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
        echo "::endgroup::"

        echo "::group::Exporting model ${{ matrix.model }} with quantization ${{ matrix.quant }}"
        OUTPUT_DIR="model_artifacts"
        mkdir -p "$OUTPUT_DIR"
        bash .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" "$OUTPUT_DIR"

        # Move artifacts to RUNNER_ARTIFACT_DIR for upload
        mv "$OUTPUT_DIR"/* "${RUNNER_ARTIFACT_DIR}/"
        ls -lah "${RUNNER_ARTIFACT_DIR}"
        echo "::endgroup::"
  benchmark-cuda:
    name: benchmark-cuda
    needs:
      - set-parameters
      - export-models
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    permissions:
      id-token: write
      contents: read
    strategy:
      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
      fail-fast: false
    with:
      timeout: 90
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      use-custom-docker-registry: false
      submodules: recursive
      download-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}
      upload-artifact: results-${{ matrix.model_safe }}-${{ matrix.quant }}
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        set -eux

        echo "::group::Setup environment"
        ./install_requirements.sh
        pip list
        echo "::endgroup::"

        echo "::group::Prepare model artifacts"
        mkdir -p model_artifacts
        cp "${RUNNER_ARTIFACT_DIR}/model.pte" model_artifacts/model.pte
        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" model_artifacts/aoti_cuda_blob.ptd

        # Copy optional preprocessor, tokenizer, audio, and test input files if they exist
        for file in voxtral_preprocessor.pte whisper_preprocessor.pte tekken.json \
                    poem.wav output.wav \
                    tokenizer.json tokenizer_config.json special_tokens_map.json; do
          if [ -f "${RUNNER_ARTIFACT_DIR}/$file" ]; then
            cp "${RUNNER_ARTIFACT_DIR}/$file" model_artifacts/
          fi
        done
        ls -lah model_artifacts/
        echo "::endgroup::"

        echo "::group::Build runner"
        bash .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" model_artifacts
        echo "::endgroup::"

        echo "::group::Running benchmark for ${{ matrix.model }} (${{ matrix.quant }}) with ${{ matrix.num_runs }} runs"
        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH

        # Get GPU name using nvidia-smi
        GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)
        echo "Detected GPU: $GPU_NAME"
        # Get CUDA driver version
        CUDA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1)
        echo "CUDA Driver Version: $CUDA_DRIVER_VERSION"

        # Create results directory (separate from model artifacts)
        RESULTS_DIR="benchmark_results"
        mkdir -p "$RESULTS_DIR"

        # Determine model name and runner command based on model
        case "${{ matrix.model }}" in
          mistralai/Voxtral-Mini-3B-2507)
            RUNNER="cmake-out/examples/models/voxtral/voxtral_runner"
            PREPROCESSOR="model_artifacts/voxtral_preprocessor.pte"
            TOKENIZER="model_artifacts/tekken.json"
            AUDIO="model_artifacts/poem.wav"
            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path $TOKENIZER --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0"
            MODEL_NAME="voxtral_${{ matrix.quant }}"
            ;;
          openai/whisper-*)
            RUNNER="cmake-out/examples/models/whisper/whisper_runner"
            PREPROCESSOR="model_artifacts/whisper_preprocessor.pte"
            AUDIO="model_artifacts/output.wav"
            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0"
            MODEL_NAME=$(echo "${{ matrix.model }}" | sed 's/openai\///')_${{ matrix.quant }}
            ;;
          google/gemma-3-4b-it)
            RUNNER="cmake-out/examples/models/gemma3/gemma3_e2e_runner"
            IMAGE="docs/source/_static/img/et-logo.png"
            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --image_path $IMAGE --temperature 0"
            MODEL_NAME="gemma3_${{ matrix.quant }}"
            ;;
          *)
            echo "Error: Unsupported model '${{ matrix.model }}'"
            exit 1
            ;;
        esac
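
        # Example expansion (hypothetical whisper-small, non-quantized shard):
        #   RUNNER=cmake-out/examples/models/whisper/whisper_runner
        #   MODEL_NAME=whisper-small_non-quantized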

        # Run benchmark using cuda_benchmark.py
        python .ci/scripts/cuda_benchmark.py \
          --runner_command "$RUNNER_CMD" \
          --model_name "$MODEL_NAME" \
          --num_runs "${{ matrix.num_runs }}" \
          --output_json "$RESULTS_DIR/benchmark_results.json" \
          --output_v3 "$RESULTS_DIR/benchmark_results_v3.json" \
          --model "${{ matrix.model }}" \
          --quantization "${{ matrix.quant }}" \
          --git_sha "${{ github.sha }}" \
          --workflow_run_id "${{ github.run_id }}" \
          --workflow_run_url "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
          --gpu_name "$GPU_NAME" \
          --cuda_driver_version "$CUDA_DRIVER_VERSION"

        # Save additional metadata
        cat > "$RESULTS_DIR/metadata.json" <<EOF
        {
          "model": "${{ matrix.model }}",
          "quantization": "${{ matrix.quant }}",
          "num_runs": ${{ matrix.num_runs }},
          "runner": "$RUNNER",
          "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
          "git_sha": "${{ github.sha }}",
          "workflow_run_id": "${{ github.run_id }}",
          "workflow_run_url": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        }
        EOF

        # Only copy benchmark results to RUNNER_ARTIFACT_DIR for upload (not the entire model).
        # First, clean up the downloaded model artifacts from RUNNER_ARTIFACT_DIR
        rm -rf "${RUNNER_ARTIFACT_DIR}"/*
        # Then copy only the benchmark result JSON files
        cp "$RESULTS_DIR"/*.json "${RUNNER_ARTIFACT_DIR}/"
        echo "Benchmark results prepared for upload:"
        ls -lah "${RUNNER_ARTIFACT_DIR}"
        echo "::endgroup::"
  upload-benchmark-results:
    needs:
      - benchmark-cuda
    if: always()
    runs-on: ubuntu-22.04
    environment: upload-benchmark-results
    permissions:
      id-token: write
      contents: read
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: false
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - name: Download all benchmark results
        uses: actions/download-artifact@v4
        with:
          pattern: results-*
          path: all_results/
      - name: Process and display results
        shell: bash
        run: |
          set -eux

          echo "::group::Benchmark Results Summary"
          for RESULT_DIR in all_results/results-*/; do
            if [ -f "$RESULT_DIR/benchmark_results.json" ]; then
              echo ""
              echo "================================"
              echo "Results from: $(basename "$RESULT_DIR")"
              echo "================================"
              # Display benchmark results (mean performance)
              cat "$RESULT_DIR/benchmark_results.json" | python -m json.tool
              # Display metadata
              if [ -f "$RESULT_DIR/metadata.json" ]; then
                echo ""
                echo "--- Metadata ---"
                cat "$RESULT_DIR/metadata.json" | python -m json.tool
              fi
              echo ""
            fi
          done
          echo "::endgroup::"
      - name: Authenticate with AWS
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
          role-duration-seconds: 18000
          aws-region: us-east-1
      - name: Upload to S3
        shell: bash
        env:
          S3_BUCKET: gha-artifacts
          S3_PREFIX: executorch-cuda-perf/${{ github.run_id }}/${{ github.run_attempt }}
        run: |
          set -eux
          pip install awscli

          echo "Uploading benchmark results to S3..."
          aws s3 sync all_results/ "s3://${S3_BUCKET}/${S3_PREFIX}/" \
            --exclude "*" \
            --include "*.json" \
            --include "*.log"
          echo "Results uploaded to: s3://${S3_BUCKET}/${S3_PREFIX}/"
      - name: Prepare v3 results for dashboard upload
        shell: bash
        run: |
          set -eux
          echo "::group::Prepare v3 results"
          mkdir -p benchmark-results/v3

          # Collect all v3 results into a single directory
          for RESULT_DIR in all_results/results-*/; do
            if [ -f "$RESULT_DIR/benchmark_results_v3.json" ]; then
              # Generate unique filename based on directory name
              FILENAME=$(basename "$RESULT_DIR")
              cp "$RESULT_DIR/benchmark_results_v3.json" "benchmark-results/v3/${FILENAME}.json"
              echo "✓ Copied $FILENAME v3 results"
            fi
          done

          echo "V3 results prepared:"
          ls -lah benchmark-results/v3/
          echo "::endgroup::"
      - name: Upload benchmark results to dashboard
        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
        with:
          benchmark-results-dir: benchmark-results/v3
          dry-run: false
          schema-version: v3
          github-token: ${{ secrets.GITHUB_TOKEN }}