Skip to content

Add multi-gpu test job #1

Add multi-gpu test job

Add multi-gpu test job #1

# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

Check failure on line 1 in .github/workflows/build-test-distribute-flavor.yml

View workflow run for this annotation

GitHub Actions / .github/workflows/build-test-distribute-flavor.yml

Invalid workflow file

(Line: 31, Col: 9): Required property is missing: type
# SPDX-License-Identifier: Apache-2.0
name: Build, Test, and Copy Framework Image
on:
  # workflow_call is the only trigger declared here: this file is a reusable
  # workflow that callers parameterize through the inputs and secrets below.
  workflow_call:
    inputs:
      # ---- Image identity ----
      framework:
        description: 'Framework name (vllm, sglang, trtllm)'
        required: true
        type: string
      target:
        description: 'Target stage for Docker rendering'
        required: true
        type: string
      platform:
        description: 'Platform to build (amd64 or arm64)'
        required: true
        type: string
      cuda_version:
        description: 'CUDA version to build (e.g., 12.9, 13.0)'
        required: true
        type: string
      # ---- Job toggles ----
      run_tests:
        description: 'Whether to run pytest'
        required: false
        type: boolean
        default: true
      run_multi_gpu_tests:
        description: 'Whether to run multi-gpu tests'
        required: false
        # `type` is mandatory for every workflow_call input; without it GitHub
        # rejects the whole file with "Required property is missing: type".
        type: boolean
        default: false
      copy_to_acr:
        description: 'Whether to copy images to ACR'
        required: false
        type: boolean
        default: true
      # ---- Build options ----
      builder_name:
        description: 'Buildkit builder name'
        required: true
        type: string
      extra_tags:
        description: 'Additional tags (newline-separated, -$platform suffix auto-appended)'
        required: false
        type: string
        default: ''
      build_image:
        description: 'Whether to build image'
        required: false
        type: boolean
        default: true
      no_cache:
        description: 'Disable Docker build cache'
        required: false
        type: boolean
        default: false
      push_image:
        description: 'Push image to registry'
        required: false
        type: boolean
        default: true
      no_load:
        description: 'Do not load the image into docker (you must have dind installed if you want to load the image)'
        required: false
        type: boolean
        default: true
      show_summary:
        description: 'Show summary'
        required: false
        type: boolean
        default: false
      # ---- Per-step timeouts (minutes) ----
      build_timeout_minutes:
        description: 'Timeout in minutes for the build step'
        required: false
        type: number
        default: 60
      # Applied to the GPU pytest step of both the test and multi-gpu-test jobs.
      test_gpu_timeout_minutes:
        description: 'Timeout in minutes for the GPU test step'
        required: false
        type: number
        default: 30
      test_cpu_timeout_minutes:
        description: 'Timeout in minutes for the CPU test step'
        required: false
        type: number
        default: 10
      copy_timeout_minutes:
        description: 'Timeout in minutes for the copy to ACR step'
        required: false
        type: number
        default: 5
    secrets:
      # Registry coordinates and credentials: required by every job that logs
      # in to, pulls from, or copies between ECR and ACR.
      AWS_DEFAULT_REGION:
        required: true
      AWS_ACCOUNT_ID:
        required: true
      AZURE_ACR_HOSTNAME:
        required: true
      AZURE_ACR_USER:
        required: true
      AZURE_ACR_PASSWORD:
        required: true
      # Optional: forwarded to the build action (CI_TOKEN, sccache bucket, AWS
      # keys) and to the pytest action (HF_TOKEN).
      CI_TOKEN:
        required: false
      SCCACHE_S3_BUCKET:
        required: false
      AWS_ACCESS_KEY_ID:
        required: false
      AWS_SECRET_ACCESS_KEY:
        required: false
      HF_TOKEN:
        required: false
    outputs:
      image_tag:
        description: 'Image tag in ACR'
        # NOTE(review): the copy-to-acr job pushes the tag
        # `<target_tag_plain>-cuda<major>-<platform>`, while this value has no
        # `-cuda<major>` part - confirm which form callers expect.
        value: ${{ jobs.copy-to-acr.outputs.target_tag_plain }}-${{ inputs.platform }}
jobs:
# ============================================================================
# BUILD
# ============================================================================
build:
  # Renders the Dockerfile and builds the framework image through the local
  # docker-remote-build action. The whole job is skipped when build_image is false.
  if: inputs.build_image
  name: Build ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
  runs-on: prod-builder-v2
  outputs:
    # Only the registry-independent part of the tag (`<sha>-<framework>`) is
    # exported; downstream jobs rebuild the full image URI from it.
    target_tag_plain: ${{ steps.calculate-target-tag.outputs.target_tag_plain }}
  env:
    FRAMEWORK: ${{ inputs.framework }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
      with:
        lfs: true
    - name: Set up Python
      uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 #v6.2.0
      with:
        python-version: '3.12'
        pip-install: jinja2 pyyaml
    # Turns every non-empty line of extra_tags into a full ECR image reference
    # with a `-cuda<major>-<platform>` suffix and exposes the list as the
    # multi-line step output `tags` (empty when no extra tags were given).
    - name: Calculate extra tags with platform suffix # will get redundant upon multi arch builds support
      id: extra-tags
      shell: bash
      env:
        EXTRA_TAGS: ${{ inputs.extra_tags }}
        CUDA_VERSION: ${{ inputs.cuda_version }}
      run: |
        # Keep only the text before the first dot (e.g. 12.9 -> 12).
        CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*}
        if [ -n "$EXTRA_TAGS" ]; then
          RESULT=""
          while IFS= read -r tag; do
            if [ -n "$tag" ]; then
              RESULT+="${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${tag}-cuda${CUDA_VERSION_MAJOR}-${{ inputs.platform }}"$'\n'
            fi
          done <<< "$EXTRA_TAGS"
          # Heredoc-style syntax is required for multi-line step outputs.
          echo "tags<<EOF" >> $GITHUB_OUTPUT
          echo "$RESULT" >> $GITHUB_OUTPUT
          echo "EOF" >> $GITHUB_OUTPUT
        else
          echo "tags=" >> $GITHUB_OUTPUT
        fi
    - name: Docker Login
      uses: ./.github/actions/docker-login
      with:
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
    # Step outputs: default_target_image_uri (full ECR reference),
    # target_tag_plain (`<sha>-<framework>`) and cuda_version_plain (major only).
    - name: Calculate target tag
      id: calculate-target-tag
      shell: bash
      run: |
        CUDA_VERSION_RAW=${{ inputs.cuda_version }}
        CUDA_VERSION=${CUDA_VERSION_RAW%%.*}
        TARGET_TAG_PLAIN="${{ github.sha }}-${{ inputs.framework }}"
        DEFAULT_TARGET_IMAGE_URI="${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${TARGET_TAG_PLAIN}-cuda${CUDA_VERSION}-${{ inputs.platform }}"
        echo "default_target_image_uri=${DEFAULT_TARGET_IMAGE_URI}" >> $GITHUB_OUTPUT
        echo "target_tag_plain=${TARGET_TAG_PLAIN}" >> $GITHUB_OUTPUT
        echo "cuda_version_plain=${CUDA_VERSION}" >> $GITHUB_OUTPUT
    - name: Initialize Dynamo Builder
      uses: ./.github/actions/init-dynamo-builder
      with:
        builder_name: ${{ inputs.builder_name }}
        flavor: ${{ inputs.framework }}
        arch: ${{ inputs.platform }}
        cuda_version: ${{ inputs.cuda_version }}
    # Logs the values handed to the "Build Container" step for debugging.
    - name: Print Build Container inputs
      run: |
        echo "=== Build Container Inputs ==="
        echo "image_tag: ${{ steps.calculate-target-tag.outputs.default_target_image_uri }}"
        echo "framework: ${{ inputs.framework }}"
        # Print the target that is actually passed to the build step instead
        # of a hard-coded stage name, so the log cannot drift from reality.
        echo "target: ${{ inputs.target }}"
        echo "platform: ${{ inputs.platform }}"
        echo "cuda_version: ${{ inputs.cuda_version }}"
        echo "no_cache: ${{ inputs.no_cache }}"
        echo "extra_tags: ${{ steps.extra-tags.outputs.tags }}"
        echo "push_image: ${{ inputs.push_image }}"
        echo "no_load: ${{ inputs.no_load }}"
    - name: Generate Dockerfile
      shell: bash
      run: |
        echo "Generating Dockerfile for target: ${{ inputs.target }} and framework: ${{ inputs.framework }}"
        python ./container/render.py \
          --target=${{ inputs.target }} \
          --framework=${{ inputs.framework }} \
          --platform=${{ inputs.platform }} \
          --cuda-version=${{ inputs.cuda_version }} \
          --show-result \
          --output-short-filename
    - name: Build Container
      id: build-image
      timeout-minutes: ${{ inputs.build_timeout_minutes }}
      uses: ./.github/actions/docker-remote-build
      with:
        image_tag: ${{ steps.calculate-target-tag.outputs.default_target_image_uri }}
        framework: ${{ inputs.framework }}
        target: ${{ inputs.target }}
        platform: ${{ inputs.platform }}
        cuda_version: ${{ inputs.cuda_version }}
        ci_token: ${{ secrets.CI_TOKEN }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        no_cache: ${{ inputs.no_cache }}
        extra_tags: ${{ steps.extra-tags.outputs.tags }}
        push_image: ${{ inputs.push_image }}
        no_load: ${{ inputs.no_load }}
    # Writes a one-row Markdown table with the image URI to the job summary;
    # only when the image was pushed and the caller asked for a summary.
    - name: Show summary
      shell: bash
      if: ${{ inputs.push_image && inputs.show_summary }}
      run: |
        echo "### 🐳 ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }} Default Image" >> $GITHUB_STEP_SUMMARY
        echo "" >> $GITHUB_STEP_SUMMARY
        echo "| Image URI |" >> $GITHUB_STEP_SUMMARY
        echo "|-----|" >> $GITHUB_STEP_SUMMARY
        echo "| \`${{ steps.calculate-target-tag.outputs.default_target_image_uri }}\` |" >> $GITHUB_STEP_SUMMARY
# ============================================================================
# TEST
# ============================================================================
test:
  # Pulls the image produced by the build job and runs the sanity check plus
  # the pre-merge pytest suites against it. Requires both run_tests and
  # build_image to be true.
  name: Test ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
  needs:
    - build
  if: ${{ inputs.run_tests && inputs.build_image }}
  # amd64 goes to the AMD GPU tester pool, every other platform to the ARM pool.
  runs-on: ${{ inputs.platform == 'amd64' && 'prod-tester-amd-gpu-v1' || 'prod-tester-arm-v1' }}
  env:
    FRAMEWORK: ${{ inputs.framework }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0

    # Rebuilds the ECR reference of the image under test from the build job's
    # plain tag, the CUDA major version and the platform.
    - id: calculate-target-tag
      name: Calculate target tag
      shell: bash
      run: |
        cuda_full=${{ inputs.cuda_version }}
        cuda_major=${cuda_full%%.*}
        image_uri=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.target_tag_plain }}-cuda${cuda_major}-${{ inputs.platform }}
        {
          echo "cuda_version_plain=${cuda_major}"
          echo "test_image=${image_uri}"
        } >> $GITHUB_OUTPUT

    - name: Docker Login
      uses: ./.github/actions/docker-login
      with:
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

    # Pulls the image under test and the MinIO image, and reports how long
    # both pulls took together.
    - name: Pull relevant images
      shell: bash
      run: |
        pull_started=$(date +%s)
        docker pull ${{ steps.calculate-target-tag.outputs.test_image }}
        docker pull quay.io/minio/minio
        pull_finished=$(date +%s)
        duration=$((pull_finished - pull_started))
        echo "⏱️ Image pull duration: ${duration}s"

    # Runs deploy/sanity_check.py inside the image (no GPU check) and fails
    # the step with the script's own exit code when it reports a problem.
    - name: Run Sanity Check on Runtime Image
      shell: bash
      run: |
        echo "Running sanity check on image: ${{ steps.calculate-target-tag.outputs.test_image }}"
        # The script lives at /workspace/deploy/sanity_check.py in runtime containers
        export WORKSPACE=/workspace
        # Capture the exit code without letting errexit abort the step early.
        sanity_rc=0
        docker run --rm "${{ steps.calculate-target-tag.outputs.test_image }}" python ${WORKSPACE}/deploy/sanity_check.py --runtime-check --no-gpu-check || sanity_rc=$?
        if [ ${sanity_rc} -eq 0 ]; then
          echo "✅ Sanity check passed"
        else
          echo "ERROR: Sanity check failed - ai-dynamo packages not properly installed"
          exit ${sanity_rc}
        fi

    # CPU-only suite runs first, in parallel: tests marked gpu_0, which do not
    # need GPU hardware. mypy runs here only.
    - name: Run CPU-only tests (parallelized)
      uses: ./.github/actions/pytest
      timeout-minutes: ${{ inputs.test_cpu_timeout_minutes }}
      with:
        framework: ${{ inputs.framework }}
        platform_arch: ${{ inputs.platform }}
        image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
        pytest_marks: ${{ format('pre_merge and {0} and gpu_0', inputs.framework) }}
        test_type: 'pre_merge_cpu'
        parallel_mode: "auto"
        enable_mypy: "true"
        dind_as_sidecar: "true"
        hf_token: ${{ secrets.HF_TOKEN }}

    # Single-GPU e2e suite (tests marked gpu_1), run one at a time. GPU tests
    # are only run for amd64.
    - name: Run GPU tests (sequential)
      if: inputs.platform == 'amd64'
      uses: ./.github/actions/pytest
      timeout-minutes: ${{ inputs.test_gpu_timeout_minutes }}
      with:
        framework: ${{ inputs.framework }}
        platform_arch: ${{ inputs.platform }}
        image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
        pytest_marks: ${{ format('pre_merge and {0} and gpu_1', inputs.framework) }}
        test_type: 'pre_merge_gpu'
        parallel_mode: "none"
        # mypy is already covered by the CPU step above
        enable_mypy: "false"
        dind_as_sidecar: "true"
        hf_token: ${{ secrets.HF_TOKEN }}
# ============================================================================
# MULTI-GPU TESTS
# ============================================================================
multi-gpu-test:
  # Opt-in job that runs the gpu_2 / gpu_4 pytest suites against the image
  # produced by the build job.
  # The runner label below is hard-coded to an AMD64 pool, so the job is gated
  # on an explicit amd64 match (same check the single-GPU step in the test job
  # uses) rather than on "anything that is not arm64".
  if: |
    inputs.run_multi_gpu_tests &&
    inputs.build_image &&
    inputs.platform == 'amd64'
  needs: [build]
  name: Multi-gpu Test ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
  runs-on: prod-tester-amd-gpu-4-v1
  env:
    FRAMEWORK: ${{ inputs.framework }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
    # Rebuilds the ECR reference of the image under test from the build job's
    # plain tag, the CUDA major version and the platform.
    - name: Calculate target tag
      id: calculate-target-tag
      shell: bash
      run: |
        CUDA_VERSION_RAW=${{ inputs.cuda_version }}
        CUDA_VERSION=${CUDA_VERSION_RAW%%.*}
        echo "cuda_version_plain=${CUDA_VERSION}" >> $GITHUB_OUTPUT
        TEST_IMAGE=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.target_tag_plain }}-cuda${CUDA_VERSION}-${{ inputs.platform }}
        echo "test_image=${TEST_IMAGE}" >> $GITHUB_OUTPUT
    - name: Docker Login
      uses: ./.github/actions/docker-login
      with:
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
    - name: Pull relevant images
      shell: bash
      run: |
        start_time=$(date +%s)
        docker pull ${{ steps.calculate-target-tag.outputs.test_image }}
        docker pull quay.io/minio/minio
        end_time=$(date +%s)
        duration=$((end_time - start_time))
        echo "⏱️ Image pull duration: ${duration}s"
    # Multi-GPU e2e suite: tests marked gpu_2 or gpu_4, run one at a time.
    # The marker expression is scoped to the framework, like the pytest steps
    # of the test job, because the image under test contains one framework only.
    # NOTE(review): no `pre_merge` marker is required here - confirm whether
    # the multi-GPU tests carry it before adding that filter.
    # NOTE(review): test_type is the same value the single-GPU step of the test
    # job passes - confirm the pytest action does not derive colliding
    # report/artifact names from it.
    - name: Run multi-GPU tests (sequential)
      timeout-minutes: ${{ inputs.test_gpu_timeout_minutes }}
      uses: ./.github/actions/pytest
      with:
        image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
        pytest_marks: ${{ format('{0} and (gpu_2 or gpu_4)', inputs.framework) }}
        framework: ${{ inputs.framework }}
        test_type: "pre_merge_gpu"
        platform_arch: ${{ inputs.platform }}
        enable_mypy: 'false' # already covered by CPU tests
        hf_token: ${{ secrets.HF_TOKEN }}
        parallel_mode: 'none'
        dind_as_sidecar: 'true'
# ============================================================================
# COPY TO ACR
# ============================================================================
copy-to-acr:
  # Copies the built image from ECR to ACR under the same repository and tag.
  name: copy ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
  needs:
    - build
    - test
  # always() keeps the expression evaluated even when test was skipped or
  # failed; the copy then runs only if it was requested, the build succeeded,
  # and the test job either succeeded or was skipped.
  if: >-
    always() &&
    inputs.copy_to_acr &&
    needs.build.result == 'success' &&
    (needs.test.result == 'success' || needs.test.result == 'skipped')
  runs-on: prod-default-small-v2
  outputs:
    # Re-exported so the workflow-level image_tag output can read it from this job.
    target_tag_plain: ${{ needs.build.outputs.target_tag_plain }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0

    # Step outputs: cuda_version_plain (CUDA major version) and test_image
    # (full ECR reference of the built image).
    - id: calculate-target-tag
      name: Calculate target tag
      shell: bash
      run: |
        cuda_full=${{ inputs.cuda_version }}
        cuda_major=${cuda_full%%.*}
        image_uri=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.target_tag_plain }}-cuda${cuda_major}-${{ inputs.platform }}
        {
          echo "cuda_version_plain=${cuda_major}"
          echo "test_image=${image_uri}"
        } >> $GITHUB_OUTPUT

    - name: Copy image to target registry
      uses: ./.github/actions/skopeo-copy
      timeout-minutes: ${{ inputs.copy_timeout_minutes }}
      with:
        # Source: the image in ECR
        source_registry: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
        source_image: ai-dynamo/dynamo
        source_tag: ${{ needs.build.outputs.target_tag_plain }}-cuda${{ steps.calculate-target-tag.outputs.cuda_version_plain }}-${{ inputs.platform }}
        source_aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        source_aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        # Target: same repository name and tag in ACR
        target_registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
        target_image: ai-dynamo/dynamo
        target_tag: ${{ needs.build.outputs.target_tag_plain }}-cuda${{ steps.calculate-target-tag.outputs.cuda_version_plain }}-${{ inputs.platform }}
        target_azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        target_azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        target_azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}