.github/workflows/build-test-distribute-flavor.yml

Add multi-gpu test job #1

Workflow file for this run

.github/workflows/build-test-distribute-flavor.yml at d9192b7

	# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Check failure on line 1 in .github/workflows/build-test-distribute-flavor.yml View workflow run for this annotation GitHub Actions / .github/workflows/build-test-distribute-flavor.yml Invalid workflow file `(Line: 31, Col: 9): Required property is missing: type`
	# SPDX-License-Identifier: Apache-2.0

	# Reusable workflow (see `on.workflow_call` below) that builds a framework
	# image, optionally tests it, and optionally copies it to ACR.
	name: 'Build, Test, and Copy Framework Image'

	on:
	workflow_call:
	inputs:
	  # Inputs accepted from the calling workflow. Every input of a reusable
	  # workflow must declare `type` (boolean, number, or string); GitHub
	  # rejects the whole workflow file when one is missing.

	  # --- What to build ------------------------------------------------------
	  framework:
	    description: 'Framework name (vllm, sglang, trtllm)'
	    required: true
	    type: string
	  target:
	    description: 'Target stage for Docker rendering'
	    required: true
	    type: string
	  platform:
	    description: 'Platform to build (amd64 or arm64)'
	    required: true
	    type: string
	  cuda_version:
	    description: 'CUDA version to build (e.g., 12.9, 13.0)'
	    required: true
	    type: string

	  # --- Which stages run ---------------------------------------------------
	  run_tests:
	    description: 'Whether to run pytest'
	    required: false
	    type: boolean
	    default: true
	  run_multi_gpu_tests:
	    description: 'Whether to run multi-gpu tests'
	    required: false
	    # `type` was missing here, which invalidated the workflow file.
	    type: boolean
	    default: false
	  copy_to_acr:
	    description: 'Whether to copy images to ACR'
	    required: false
	    type: boolean
	    default: true

	  # --- Build options ------------------------------------------------------
	  builder_name:
	    description: 'Buildkit builder name'
	    required: true
	    type: string
	  extra_tags:
	    description: 'Additional tags (newline-separated, -$platform suffix auto-appended)'
	    required: false
	    type: string
	    default: ''
	  build_image:
	    description: 'Whether to build image'
	    required: false
	    type: boolean
	    default: true
	  no_cache:
	    description: 'Disable Docker build cache'
	    required: false
	    type: boolean
	    default: false
	  push_image:
	    description: 'Push image to registry'
	    required: false
	    type: boolean
	    default: true
	  no_load:
	    description: 'Do not load the image into docker (you must have dind installed if you want to load the image)'
	    required: false
	    type: boolean
	    default: true
	  show_summary:
	    description: 'Show summary'
	    required: false
	    type: boolean
	    default: false

	  # --- Step timeouts (minutes) --------------------------------------------
	  build_timeout_minutes:
	    description: 'Timeout in minutes for the build step'
	    required: false
	    type: number
	    default: 60
	  test_gpu_timeout_minutes:
	    description: 'Timeout in minutes for the GPU test step'
	    required: false
	    type: number
	    default: 30
	  test_cpu_timeout_minutes:
	    description: 'Timeout in minutes for the CPU test step'
	    required: false
	    type: number
	    default: 10
	  copy_timeout_minutes:
	    description: 'Timeout in minutes for the copy to ACR step'
	    required: false
	    type: number
	    default: 5
	secrets:
	  # Required: the calling workflow must pass each of these.
	  AWS_DEFAULT_REGION: { required: true }
	  AWS_ACCOUNT_ID: { required: true }
	  AZURE_ACR_HOSTNAME: { required: true }
	  AZURE_ACR_USER: { required: true }
	  AZURE_ACR_PASSWORD: { required: true }
	  # Optional: may be omitted by the caller.
	  CI_TOKEN: { required: false }
	  SCCACHE_S3_BUCKET: { required: false }
	  AWS_ACCESS_KEY_ID: { required: false }
	  AWS_SECRET_ACCESS_KEY: { required: false }
	  HF_TOKEN: { required: false }
	outputs:
	  # Value handed back to the calling workflow:
	  # `<copy-to-acr target_tag_plain>-<platform>`.
	  # NOTE(review): the tag the copy-to-acr job actually writes is
	  # `<target_tag_plain>-cuda<major>-<platform>`; this output has no
	  # `-cuda<major>` segment. Confirm callers expect the shorter form.
	  image_tag:
	    description: Image tag in ACR
	    value: '${{ jobs.copy-to-acr.outputs.target_tag_plain }}-${{ inputs.platform }}'

	jobs:
	# ============================================================================
	# BUILD
	# ============================================================================
	build:
	  # Renders the Dockerfile for the requested framework/target and hands it to
	  # the repo-local docker-remote-build action. Skipped entirely when
	  # `inputs.build_image` is false. Exposes `target_tag_plain`
	  # (`<sha>-<framework>`) so downstream jobs can reconstruct the image URI.
	  if: inputs.build_image
	  name: Build ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
	  runs-on: prod-builder-v2
	  outputs:
	    target_tag_plain: ${{ steps.calculate-target-tag.outputs.target_tag_plain }}
	  env:
	    FRAMEWORK: ${{ inputs.framework }}
	  steps:
	    - name: Checkout repository
	      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
	      with:
	        lfs: true

	    - name: Set up Python
	      uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
	      with:
	        python-version: '3.12'
	        pip-install: jinja2 pyyaml

	    # Expands each non-empty line of `extra_tags` into a full ECR image URI
	    # suffixed with `-cuda<major>-<platform>`, emitted as the multi-line
	    # step output `tags`.
	    # Will get redundant upon multi arch builds support.
	    - name: Calculate extra tags with platform suffix
	      id: extra-tags
	      shell: bash
	      env:
	        EXTRA_TAGS: ${{ inputs.extra_tags }}
	        CUDA_VERSION: ${{ inputs.cuda_version }}
	      run: |
	        CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*}
	        if [ -n "$EXTRA_TAGS" ]; then
	          RESULT=""
	          while IFS= read -r tag; do
	            if [ -n "$tag" ]; then
	              RESULT+="${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${tag}-cuda${CUDA_VERSION_MAJOR}-${{ inputs.platform }}"$'\n'
	            fi
	          done <<< "$EXTRA_TAGS"
	          echo "tags<<EOF" >> "$GITHUB_OUTPUT"
	          echo "$RESULT" >> "$GITHUB_OUTPUT"
	          echo "EOF" >> "$GITHUB_OUTPUT"
	        else
	          echo "tags=" >> "$GITHUB_OUTPUT"
	        fi

	    - name: Docker Login
	      uses: ./.github/actions/docker-login
	      with:
	        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
	        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
	        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
	        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
	        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

	    # Emits three step outputs: the full default image URI, the plain tag
	    # (`<sha>-<framework>`), and the CUDA major version.
	    - name: Calculate target tag
	      id: calculate-target-tag
	      shell: bash
	      run: |
	        CUDA_VERSION_RAW="${{ inputs.cuda_version }}"
	        # Keep only the major component, e.g. 12.9 -> 12.
	        CUDA_VERSION=${CUDA_VERSION_RAW%%.*}
	        TARGET_TAG_PLAIN="${{ github.sha }}-${{ inputs.framework }}"
	        DEFAULT_TARGET_IMAGE_URI="${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${TARGET_TAG_PLAIN}-cuda${CUDA_VERSION}-${{ inputs.platform }}"
	        echo "default_target_image_uri=${DEFAULT_TARGET_IMAGE_URI}" >> "$GITHUB_OUTPUT"
	        echo "target_tag_plain=${TARGET_TAG_PLAIN}" >> "$GITHUB_OUTPUT"
	        echo "cuda_version_plain=${CUDA_VERSION}" >> "$GITHUB_OUTPUT"

	    - name: Initialize Dynamo Builder
	      uses: ./.github/actions/init-dynamo-builder
	      with:
	        builder_name: ${{ inputs.builder_name }}
	        flavor: ${{ inputs.framework }}
	        arch: ${{ inputs.platform }}
	        cuda_version: ${{ inputs.cuda_version }}

	    # Logs the values passed to the Build Container step below.
	    - name: Print Build Container inputs
	      run: |
	        echo "=== Build Container Inputs ==="
	        echo "image_tag: ${{ steps.calculate-target-tag.outputs.default_target_image_uri }}"
	        echo "framework: ${{ inputs.framework }}"
	        # Log the real target; this line used to print a hard-coded "runtime".
	        echo "target: ${{ inputs.target }}"
	        echo "platform: ${{ inputs.platform }}"
	        echo "cuda_version: ${{ inputs.cuda_version }}"
	        echo "no_cache: ${{ inputs.no_cache }}"
	        echo "extra_tags: ${{ steps.extra-tags.outputs.tags }}"
	        echo "push_image: ${{ inputs.push_image }}"
	        echo "no_load: ${{ inputs.no_load }}"

	    - name: Generate Dockerfile
	      shell: bash
	      run: |
	        echo "Generating Dockerfile for target: ${{ inputs.target }} and framework: ${{ inputs.framework }}"
	        python ./container/render.py \
	          --target=${{ inputs.target }} \
	          --framework=${{ inputs.framework }} \
	          --platform=${{ inputs.platform }} \
	          --cuda-version=${{ inputs.cuda_version }} \
	          --show-result \
	          --output-short-filename

	    - name: Build Container
	      id: build-image
	      timeout-minutes: ${{ inputs.build_timeout_minutes }}
	      uses: ./.github/actions/docker-remote-build
	      with:
	        image_tag: ${{ steps.calculate-target-tag.outputs.default_target_image_uri }}
	        framework: ${{ inputs.framework }}
	        target: ${{ inputs.target }}
	        platform: ${{ inputs.platform }}
	        cuda_version: ${{ inputs.cuda_version }}
	        ci_token: ${{ secrets.CI_TOKEN }}
	        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
	        sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
	        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
	        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
	        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
	        no_cache: ${{ inputs.no_cache }}
	        extra_tags: ${{ steps.extra-tags.outputs.tags }}
	        push_image: ${{ inputs.push_image }}
	        no_load: ${{ inputs.no_load }}

	    # Appends a one-row markdown table with the image URI to the job
	    # summary; only when the image was pushed and a summary was requested.
	    - name: Show summary
	      shell: bash
	      if: ${{ inputs.push_image && inputs.show_summary }}
	      run: |
	        echo "### 🐳 ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }} Default Image" >> "$GITHUB_STEP_SUMMARY"
	        echo "" >> "$GITHUB_STEP_SUMMARY"
	        echo "| Image URI |" >> "$GITHUB_STEP_SUMMARY"
	        echo "|-----|" >> "$GITHUB_STEP_SUMMARY"
	        echo "| \`${{ steps.calculate-target-tag.outputs.default_target_image_uri }}\` |" >> "$GITHUB_STEP_SUMMARY"


	# ============================================================================
	# TEST
	# ============================================================================
	test:
	  # Pulls the image produced by `build`, runs a sanity check inside it, then
	  # runs the pre-merge pytest suites: the gpu_0 marks on every platform and
	  # the gpu_1 marks on amd64 only. Runs only when both `run_tests` and
	  # `build_image` are true.
	  if: inputs.run_tests && inputs.build_image
	  needs: [build]
	  name: Test ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
	  # amd64 goes to the GPU tester pool; any other platform to the arm pool.
	  runs-on: ${{ inputs.platform == 'amd64' && 'prod-tester-amd-gpu-v1' || 'prod-tester-arm-v1' }}
	  env:
	    FRAMEWORK: ${{ inputs.framework }}
	  steps:
	    - name: Checkout repository
	      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0

	    # Rebuilds the image URI from the build job's `target_tag_plain` output
	    # plus the CUDA major version and platform.
	    - name: Calculate target tag
	      id: calculate-target-tag
	      shell: bash
	      run: |
	        CUDA_VERSION_RAW="${{ inputs.cuda_version }}"
	        CUDA_VERSION=${CUDA_VERSION_RAW%%.*}
	        echo "cuda_version_plain=${CUDA_VERSION}" >> "$GITHUB_OUTPUT"
	        TEST_IMAGE="${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.target_tag_plain }}-cuda${CUDA_VERSION}-${{ inputs.platform }}"
	        echo "test_image=${TEST_IMAGE}" >> "$GITHUB_OUTPUT"

	    - name: Docker Login
	      uses: ./.github/actions/docker-login
	      with:
	        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
	        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
	        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
	        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
	        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

	    # Pulls the image under test and the minio image, and logs how long
	    # the pulls took.
	    - name: Pull relevant images
	      shell: bash
	      run: |
	        start_time=$(date +%s)
	        docker pull "${{ steps.calculate-target-tag.outputs.test_image }}"
	        docker pull quay.io/minio/minio
	        end_time=$(date +%s)
	        duration=$((end_time - start_time))
	        echo "⏱️ Image pull duration: ${duration}s"

	    - name: Run Sanity Check on Runtime Image
	      shell: bash
	      run: |
	        echo "Running sanity check on image: ${{ steps.calculate-target-tag.outputs.test_image }}"

	        # Run the sanity check script inside the container
	        # The script is located in /workspace/deploy/sanity_check.py in runtime containers
	        export WORKSPACE=/workspace

	        # Disable errexit around the check so the exit code can be captured
	        # and reported with an explicit message.
	        set +e
	        docker run --rm "${{ steps.calculate-target-tag.outputs.test_image }}" python "${WORKSPACE}/deploy/sanity_check.py" --runtime-check --no-gpu-check
	        SANITY_CHECK_EXIT_CODE=$?
	        set -e
	        if [ "${SANITY_CHECK_EXIT_CODE}" -ne 0 ]; then
	          echo "ERROR: Sanity check failed - ai-dynamo packages not properly installed"
	          exit "${SANITY_CHECK_EXIT_CODE}"
	        else
	          echo "✅ Sanity check passed"
	        fi

	    # Run CPU-only tests first (parallelized for speed)
	    # These are unit tests marked with gpu_0 that don't require GPU hardware
	    - name: Run CPU-only tests (parallelized)
	      timeout-minutes: ${{ inputs.test_cpu_timeout_minutes }}
	      uses: ./.github/actions/pytest
	      with:
	        image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
	        pytest_marks: ${{ format('pre_merge and {0} and gpu_0', inputs.framework) }}
	        framework: ${{ inputs.framework }}
	        test_type: "pre_merge_cpu"
	        platform_arch: ${{ inputs.platform }}
	        enable_mypy: 'true'
	        hf_token: ${{ secrets.HF_TOKEN }}
	        parallel_mode: 'auto'
	        dind_as_sidecar: 'true'

	    # Run GPU tests sequentially (only on amd64 runners with GPU)
	    # These are e2e tests marked with gpu_1 that require GPU hardware
	    - name: Run GPU tests (sequential)
	      timeout-minutes: ${{ inputs.test_gpu_timeout_minutes }}
	      if: ${{ inputs.platform == 'amd64' }}  # We only run GPU tests on amd64
	      uses: ./.github/actions/pytest
	      with:
	        image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
	        pytest_marks: ${{ format('pre_merge and {0} and gpu_1', inputs.framework) }}
	        framework: ${{ inputs.framework }}
	        test_type: "pre_merge_gpu"
	        platform_arch: ${{ inputs.platform }}
	        enable_mypy: 'false'  # already covered by CPU tests
	        hf_token: ${{ secrets.HF_TOKEN }}
	        parallel_mode: 'none'
	        dind_as_sidecar: 'true'

	# ============================================================================
	# MULTI-GPU TESTS
	# ============================================================================

	multi-gpu-test:
	  # Pulls the image produced by `build` and runs the tests marked gpu_2 or
	  # gpu_4 on the 4-GPU amd64 runner pool.
	  # Multi-GPU support limited to AMD64 only, so the guard matches amd64
	  # explicitly (same check the single-GPU step uses) instead of "not arm64".
	  if: |
	    inputs.run_multi_gpu_tests &&
	    inputs.build_image &&
	    inputs.platform == 'amd64'
	  needs: [build]
	  name: Multi-gpu Test ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
	  runs-on: prod-tester-amd-gpu-4-v1
	  env:
	    FRAMEWORK: ${{ inputs.framework }}
	  steps:
	    - name: Checkout repository
	      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0

	    # Rebuilds the image URI from the build job's `target_tag_plain` output
	    # plus the CUDA major version and platform.
	    - name: Calculate target tag
	      id: calculate-target-tag
	      shell: bash
	      run: |
	        CUDA_VERSION_RAW="${{ inputs.cuda_version }}"
	        CUDA_VERSION=${CUDA_VERSION_RAW%%.*}
	        echo "cuda_version_plain=${CUDA_VERSION}" >> "$GITHUB_OUTPUT"
	        TEST_IMAGE="${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.target_tag_plain }}-cuda${CUDA_VERSION}-${{ inputs.platform }}"
	        echo "test_image=${TEST_IMAGE}" >> "$GITHUB_OUTPUT"

	    - name: Docker Login
	      uses: ./.github/actions/docker-login
	      with:
	        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
	        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
	        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
	        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
	        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

	    - name: Pull relevant images
	      shell: bash
	      run: |
	        start_time=$(date +%s)
	        docker pull "${{ steps.calculate-target-tag.outputs.test_image }}"
	        docker pull quay.io/minio/minio
	        end_time=$(date +%s)
	        duration=$((end_time - start_time))
	        echo "⏱️ Image pull duration: ${duration}s"

	    # Run multi-GPU tests sequentially.
	    # These are e2e tests marked with gpu_2 or gpu_4 that require GPU hardware
	    # NOTE(review): unlike the single-GPU job, these marks are not limited
	    # to `pre_merge` or to inputs.framework — confirm that is intended.
	    # NOTE(review): test_type repeats the value used by the single-GPU step;
	    # confirm the pytest action does not derive colliding report names.
	    - name: Run multi-GPU tests (sequential)
	      timeout-minutes: ${{ inputs.test_gpu_timeout_minutes }}
	      uses: ./.github/actions/pytest
	      with:
	        image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
	        pytest_marks: 'gpu_2 or gpu_4'
	        framework: ${{ inputs.framework }}
	        test_type: "pre_merge_gpu"
	        platform_arch: ${{ inputs.platform }}
	        enable_mypy: 'false'  # already covered by CPU tests
	        hf_token: ${{ secrets.HF_TOKEN }}
	        parallel_mode: 'none'
	        dind_as_sidecar: 'true'

	# ============================================================================
	# COPY TO ACR
	# ============================================================================
	copy-to-acr:
	  # Copies the built image from the ECR registry to the Azure ACR registry
	  # under the same `<target_tag_plain>-cuda<major>-<platform>` tag, and
	  # re-exports the build job's `target_tag_plain` for the workflow output.
	  # Only `build` and `test` are listed in `needs`; `multi-gpu-test` is not,
	  # so its result does not gate this job.
	  needs: [build, test]
	  # Run if copy_to_acr is true AND build succeeded AND (test succeeded OR test was skipped)
	  # `always()` is required so a skipped `test` does not skip this job too.
	  if: |
	    always() &&
	    inputs.copy_to_acr &&
	    needs.build.result == 'success' &&
	    (needs.test.result == 'success' || needs.test.result == 'skipped')
	  name: copy ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
	  runs-on: prod-default-small-v2
	  outputs:
	    target_tag_plain: ${{ needs.build.outputs.target_tag_plain }}
	  steps:
	    # Needed so the repo-local skopeo-copy action can be resolved.
	    - name: Checkout repository
	      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0

	    # Only the CUDA major version is consumed below; the image URI output
	    # that used to be computed here was never read in this job.
	    - name: Calculate target tag
	      id: calculate-target-tag
	      shell: bash
	      run: |
	        CUDA_VERSION_RAW="${{ inputs.cuda_version }}"
	        CUDA_VERSION=${CUDA_VERSION_RAW%%.*}
	        echo "cuda_version_plain=${CUDA_VERSION}" >> "$GITHUB_OUTPUT"

	    - name: Copy image to target registry
	      timeout-minutes: ${{ inputs.copy_timeout_minutes }}
	      uses: ./.github/actions/skopeo-copy
	      with:
	        source_registry: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
	        source_image: ai-dynamo/dynamo
	        source_tag: ${{ needs.build.outputs.target_tag_plain }}-cuda${{ steps.calculate-target-tag.outputs.cuda_version_plain }}-${{ inputs.platform }}
	        target_registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
	        target_image: ai-dynamo/dynamo
	        target_tag: ${{ needs.build.outputs.target_tag_plain }}-cuda${{ steps.calculate-target-tag.outputs.cuda_version_plain }}-${{ inputs.platform }}
	        source_aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
	        source_aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
	        target_azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
	        target_azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
	        target_azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add multi-gpu test job #1

Workflow file

Add multi-gpu test job #1

Uh oh!

Workflow file for this run

GitHub Actions / .github/workflows/build-test-distribute-flavor.yml