# Add multi-gpu test job #1
# Workflow file for this run
# Note: this file contains hidden or bidirectional Unicode text that may be interpreted or compiled
# differently than it appears. To review, open the file in an editor that reveals hidden Unicode characters.
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Reusable workflow: builds one framework image for a single platform / CUDA
# version, optionally tests it, and optionally copies the result to ACR.
name: Build, Test, and Copy Framework Image

on:
  workflow_call:
    inputs:
      framework:
        description: 'Framework name (vllm, sglang, trtllm)'
        required: true
        type: string
      target:
        description: 'Target stage for Docker rendering'
        required: true
        type: string
      platform:
        description: 'Platform to build (amd64 or arm64)'
        required: true
        type: string
      cuda_version:
        description: 'CUDA version to build (e.g., 12.9, 13.0)'
        required: true
        type: string
      run_tests:
        description: 'Whether to run pytest'
        required: false
        type: boolean
        default: true
      run_multi_gpu_tests:
        description: 'Whether to run multi-gpu tests'
        required: false
        # `type` is mandatory for workflow_call inputs; without it the workflow
        # file is rejected as invalid.
        type: boolean
        default: false
      copy_to_acr:
        description: 'Whether to copy images to ACR'
        required: false
        type: boolean
        default: true
      builder_name:
        description: 'Buildkit builder name'
        required: true
        type: string
      extra_tags:
        description: 'Additional tags (newline-separated, -$platform suffix auto-appended)'
        required: false
        type: string
        default: ''
      build_image:
        description: 'Whether to build image'
        required: false
        type: boolean
        default: true
      no_cache:
        description: 'Disable Docker build cache'
        required: false
        type: boolean
        default: false
      push_image:
        description: 'Push image to registry'
        required: false
        type: boolean
        default: true
      no_load:
        description: 'Do not load the image into docker (you must have dind installed if you want to load the image)'
        required: false
        type: boolean
        default: true
      show_summary:
        description: 'Show summary'
        required: false
        type: boolean
        default: false
      build_timeout_minutes:
        description: 'Timeout in minutes for the build step'
        required: false
        type: number
        default: 60
      test_gpu_timeout_minutes:
        description: 'Timeout in minutes for the GPU test step'
        required: false
        type: number
        default: 30
      test_cpu_timeout_minutes:
        description: 'Timeout in minutes for the CPU test step'
        required: false
        type: number
        default: 10
      copy_timeout_minutes:
        description: 'Timeout in minutes for the copy to ACR step'
        required: false
        type: number
        default: 5
    secrets:
      AWS_DEFAULT_REGION:
        required: true
      AWS_ACCOUNT_ID:
        required: true
      AZURE_ACR_HOSTNAME:
        required: true
      AZURE_ACR_USER:
        required: true
      AZURE_ACR_PASSWORD:
        required: true
      CI_TOKEN:
        required: false
      SCCACHE_S3_BUCKET:
        required: false
      AWS_ACCESS_KEY_ID:
        required: false
      AWS_SECRET_ACCESS_KEY:
        required: false
      HF_TOKEN:
        required: false
    outputs:
      image_tag:
        description: 'Image tag in ACR'
        # NOTE(review): the copy-to-acr job pushes the tag
        # `<target_tag_plain>-cuda<major>-<platform>`, while this value has no
        # `-cuda<major>` segment. Confirm callers expect this shape before changing it.
        value: ${{ jobs.copy-to-acr.outputs.target_tag_plain }}-${{ inputs.platform }}
jobs:
# ============================================================================
# BUILD
# ============================================================================
  # Renders the Dockerfile for the requested framework/target and builds the
  # image on the remote builder, tagging it in ECR as
  # `<sha>-<framework>-cuda<major>-<platform>` plus any extra tags.
  build:
    if: inputs.build_image
    name: Build ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
    runs-on: prod-builder-v2
    outputs:
      target_tag_plain: ${{ steps.calculate-target-tag.outputs.target_tag_plain }}
    env:
      FRAMEWORK: ${{ inputs.framework }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
        with:
          lfs: true

      - name: Set up Python
        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
        with:
          python-version: '3.12'
          pip-install: jinja2 pyyaml

      - name: Calculate extra tags with platform suffix  # will get redundant upon multi arch builds support
        id: extra-tags
        shell: bash
        # Workflow values are handed to the script through env vars rather than
        # being interpolated into the script text, so they are never parsed as shell.
        env:
          EXTRA_TAGS: ${{ inputs.extra_tags }}
          CUDA_VERSION: ${{ inputs.cuda_version }}
          PLATFORM: ${{ inputs.platform }}
          ECR_REGISTRY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
        run: |
          # Keep only the CUDA major version (e.g. "12.9" -> "12").
          CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*}
          if [ -n "$EXTRA_TAGS" ]; then
            RESULT=""
            # One fully qualified image reference per non-empty input line.
            while IFS= read -r tag; do
              if [ -n "$tag" ]; then
                RESULT+="${ECR_REGISTRY}/ai-dynamo/dynamo:${tag}-cuda${CUDA_VERSION_MAJOR}-${PLATFORM}"$'\n'
              fi
            done <<< "$EXTRA_TAGS"
            # Multi-line step output uses the heredoc-style delimiter syntax.
            echo "tags<<EOF" >> "$GITHUB_OUTPUT"
            echo "$RESULT" >> "$GITHUB_OUTPUT"
            echo "EOF" >> "$GITHUB_OUTPUT"
          else
            echo "tags=" >> "$GITHUB_OUTPUT"
          fi

      - name: Docker Login
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

      - name: Calculate target tag
        id: calculate-target-tag
        shell: bash
        env:
          CUDA_VERSION_RAW: ${{ inputs.cuda_version }}
          PLATFORM: ${{ inputs.platform }}
          TARGET_TAG_PLAIN: ${{ github.sha }}-${{ inputs.framework }}
          ECR_REGISTRY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
        run: |
          # Keep only the CUDA major version (e.g. "12.9" -> "12").
          CUDA_VERSION=${CUDA_VERSION_RAW%%.*}
          DEFAULT_TARGET_IMAGE_URI="${ECR_REGISTRY}/ai-dynamo/dynamo:${TARGET_TAG_PLAIN}-cuda${CUDA_VERSION}-${PLATFORM}"
          echo "default_target_image_uri=${DEFAULT_TARGET_IMAGE_URI}" >> "$GITHUB_OUTPUT"
          echo "target_tag_plain=${TARGET_TAG_PLAIN}" >> "$GITHUB_OUTPUT"
          echo "cuda_version_plain=${CUDA_VERSION}" >> "$GITHUB_OUTPUT"

      - name: Initialize Dynamo Builder
        uses: ./.github/actions/init-dynamo-builder
        with:
          builder_name: ${{ inputs.builder_name }}
          flavor: ${{ inputs.framework }}
          arch: ${{ inputs.platform }}
          cuda_version: ${{ inputs.cuda_version }}

      - name: Print Build Container inputs
        run: |
          echo "=== Build Container Inputs ==="
          echo "image_tag: ${{ steps.calculate-target-tag.outputs.default_target_image_uri }}"
          echo "framework: ${{ inputs.framework }}"
          # Log the target that is actually passed to the build step below.
          echo "target: ${{ inputs.target }}"
          echo "platform: ${{ inputs.platform }}"
          echo "cuda_version: ${{ inputs.cuda_version }}"
          echo "no_cache: ${{ inputs.no_cache }}"
          echo "extra_tags: ${{ steps.extra-tags.outputs.tags }}"
          echo "push_image: ${{ inputs.push_image }}"
          echo "no_load: ${{ inputs.no_load }}"

      - name: Generate Dockerfile
        shell: bash
        # FRAMEWORK comes from the job-level env block.
        env:
          TARGET: ${{ inputs.target }}
          PLATFORM: ${{ inputs.platform }}
          CUDA_VERSION: ${{ inputs.cuda_version }}
        run: |
          echo "Generating Dockerfile for target: ${TARGET} and framework: ${FRAMEWORK}"
          python ./container/render.py \
            --target="${TARGET}" \
            --framework="${FRAMEWORK}" \
            --platform="${PLATFORM}" \
            --cuda-version="${CUDA_VERSION}" \
            --show-result \
            --output-short-filename

      - name: Build Container
        id: build-image
        timeout-minutes: ${{ inputs.build_timeout_minutes }}
        uses: ./.github/actions/docker-remote-build
        with:
          image_tag: ${{ steps.calculate-target-tag.outputs.default_target_image_uri }}
          framework: ${{ inputs.framework }}
          target: ${{ inputs.target }}
          platform: ${{ inputs.platform }}
          cuda_version: ${{ inputs.cuda_version }}
          ci_token: ${{ secrets.CI_TOKEN }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          no_cache: ${{ inputs.no_cache }}
          extra_tags: ${{ steps.extra-tags.outputs.tags }}
          push_image: ${{ inputs.push_image }}
          no_load: ${{ inputs.no_load }}

      - name: Show summary
        shell: bash
        if: ${{ inputs.push_image && inputs.show_summary }}
        run: |
          echo "### 🐳 ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }} Default Image" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "| Image URI |" >> "$GITHUB_STEP_SUMMARY"
          echo "|-----|" >> "$GITHUB_STEP_SUMMARY"
          echo "| \`${{ steps.calculate-target-tag.outputs.default_target_image_uri }}\` |" >> "$GITHUB_STEP_SUMMARY"
# ============================================================================
# TEST
# ============================================================================
  # Pulls the image produced by the build job, sanity-checks it, then runs the
  # CPU-only and (on amd64) single-GPU pre-merge pytest suites against it.
  test:
    if: ${{ inputs.build_image && inputs.run_tests }}
    needs:
      - build
    name: Test ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
    runs-on: ${{ inputs.platform == 'amd64' && 'prod-tester-amd-gpu-v1' || 'prod-tester-arm-v1' }}
    env:
      FRAMEWORK: ${{ inputs.framework }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0

      - name: Calculate target tag
        id: calculate-target-tag
        shell: bash
        run: |
          # Only the CUDA major version is part of the image tag (e.g. "12.9" -> "12").
          cuda_full=${{ inputs.cuda_version }}
          cuda_major=${cuda_full%%.*}
          image_ref=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.target_tag_plain }}-cuda${cuda_major}-${{ inputs.platform }}
          {
            echo "cuda_version_plain=${cuda_major}"
            echo "test_image=${image_ref}"
          } >> $GITHUB_OUTPUT

      - name: Docker Login
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

      - name: Pull relevant images
        shell: bash
        run: |
          # Time both pulls together and report the wall-clock duration.
          pull_started=$(date +%s)
          docker pull ${{ steps.calculate-target-tag.outputs.test_image }}
          docker pull quay.io/minio/minio
          pull_finished=$(date +%s)
          echo "⏱️ Image pull duration: $((pull_finished - pull_started))s"

      - name: Run Sanity Check on Runtime Image
        shell: bash
        run: |
          echo "Running sanity check on image: ${{ steps.calculate-target-tag.outputs.test_image }}"
          # The sanity check script ships inside runtime containers at
          # /workspace/deploy/sanity_check.py and is executed in the container.
          export WORKSPACE=/workspace
          # Capture the exit code instead of letting errexit abort the step,
          # so the failure message below is printed first.
          sanity_rc=0
          docker run --rm "${{ steps.calculate-target-tag.outputs.test_image }}" python ${WORKSPACE}/deploy/sanity_check.py --runtime-check --no-gpu-check || sanity_rc=$?
          if [ "${sanity_rc}" -eq 0 ]; then
            echo "✅ Sanity check passed"
          else
            echo "ERROR: Sanity check failed - ai-dynamo packages not properly installed"
            exit "${sanity_rc}"
          fi

      # CPU-only tests go first and run in parallel for speed.
      # They are the unit tests marked gpu_0, which need no GPU hardware.
      - name: Run CPU-only tests (parallelized)
        timeout-minutes: ${{ inputs.test_cpu_timeout_minutes }}
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
          pytest_marks: pre_merge and ${{ inputs.framework }} and gpu_0
          framework: ${{ inputs.framework }}
          test_type: 'pre_merge_cpu'
          platform_arch: ${{ inputs.platform }}
          enable_mypy: 'true'
          hf_token: ${{ secrets.HF_TOKEN }}
          parallel_mode: 'auto'
          dind_as_sidecar: 'true'

      # GPU tests run one at a time and only on the amd64 GPU runner.
      # They are the e2e tests marked gpu_1, which need GPU hardware.
      - name: Run GPU tests (sequential)
        if: inputs.platform == 'amd64'  # We only run GPU tests on amd64
        timeout-minutes: ${{ inputs.test_gpu_timeout_minutes }}
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
          pytest_marks: pre_merge and ${{ inputs.framework }} and gpu_1
          framework: ${{ inputs.framework }}
          test_type: 'pre_merge_gpu'
          platform_arch: ${{ inputs.platform }}
          enable_mypy: 'false'  # already covered by CPU tests
          hf_token: ${{ secrets.HF_TOKEN }}
          parallel_mode: 'none'
          dind_as_sidecar: 'true'
# ============================================================================
# MULTI-GPU TESTS
# ============================================================================
  # Opt-in job that runs the gpu_2 / gpu_4 marked tests for the built image.
  multi-gpu-test:
    # The job is pinned to the amd64 runner `prod-tester-amd-gpu-4-v1`, so it is
    # enabled only for amd64 images, not merely "anything that is not arm64".
    if: ${{ inputs.run_multi_gpu_tests && inputs.build_image && inputs.platform == 'amd64' }}
    needs: [build]
    name: Multi-gpu Test ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
    runs-on: prod-tester-amd-gpu-4-v1
    env:
      FRAMEWORK: ${{ inputs.framework }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0

      - name: Calculate target tag
        id: calculate-target-tag
        shell: bash
        # Workflow values are handed to the script through env vars rather than
        # being interpolated into the script text.
        env:
          CUDA_VERSION_RAW: ${{ inputs.cuda_version }}
          PLATFORM: ${{ inputs.platform }}
          TARGET_TAG_PLAIN: ${{ needs.build.outputs.target_tag_plain }}
          ECR_REGISTRY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
        run: |
          # Keep only the CUDA major version (e.g. "12.9" -> "12").
          CUDA_VERSION=${CUDA_VERSION_RAW%%.*}
          echo "cuda_version_plain=${CUDA_VERSION}" >> "$GITHUB_OUTPUT"
          TEST_IMAGE="${ECR_REGISTRY}/ai-dynamo/dynamo:${TARGET_TAG_PLAIN}-cuda${CUDA_VERSION}-${PLATFORM}"
          echo "test_image=${TEST_IMAGE}" >> "$GITHUB_OUTPUT"

      - name: Docker Login
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

      - name: Pull relevant images
        shell: bash
        env:
          TEST_IMAGE: ${{ steps.calculate-target-tag.outputs.test_image }}
        run: |
          start_time=$(date +%s)
          docker pull "${TEST_IMAGE}"
          docker pull quay.io/minio/minio
          end_time=$(date +%s)
          duration=$((end_time - start_time))
          echo "⏱️ Image pull duration: ${duration}s"

      # Run the multi-GPU tests sequentially.
      # These are the tests marked gpu_2 or gpu_4. The selection is scoped to the
      # framework under test, like the sibling test steps, because the image
      # is built for a single framework.
      # NOTE(review): `test_type` is the same value the single-GPU step uses;
      # confirm the pytest action does not use it as a unique report/artifact key.
      - name: Run GPU tests (sequential)
        timeout-minutes: ${{ inputs.test_gpu_timeout_minutes }}
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
          pytest_marks: ${{ format('{0} and (gpu_2 or gpu_4)', inputs.framework) }}
          framework: ${{ inputs.framework }}
          test_type: "pre_merge_gpu"
          platform_arch: ${{ inputs.platform }}
          enable_mypy: 'false'  # already covered by CPU tests
          hf_token: ${{ secrets.HF_TOKEN }}
          parallel_mode: 'none'
          dind_as_sidecar: 'true'
# ============================================================================
# COPY TO ACR
# ============================================================================
  # Copies the built image from ECR to ACR under the same
  # `<target_tag_plain>-cuda<major>-<platform>` tag.
  copy-to-acr:
    # NOTE(review): multi-gpu-test is not listed in `needs`, so its result does
    # not gate the copy. Confirm that is intended.
    needs: [build, test]
    # Run if copy_to_acr is true AND build succeeded AND (test succeeded OR test was skipped).
    # always() is required so the job is still evaluated when `test` was skipped.
    if: |
      always() &&
      inputs.copy_to_acr &&
      needs.build.result == 'success' &&
      (needs.test.result == 'success' || needs.test.result == 'skipped')
    name: copy ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
    runs-on: prod-default-small-v2
    outputs:
      target_tag_plain: ${{ needs.build.outputs.target_tag_plain }}
    steps:
      # Checkout provides the local ./.github/actions used below.
      - name: Checkout repository
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0

      - name: Calculate target tag
        id: calculate-target-tag
        shell: bash
        env:
          CUDA_VERSION_RAW: ${{ inputs.cuda_version }}
        run: |
          # Only the CUDA major version is part of the image tag (e.g. "12.9" -> "12").
          # This job reads nothing but cuda_version_plain from this step.
          echo "cuda_version_plain=${CUDA_VERSION_RAW%%.*}" >> "$GITHUB_OUTPUT"

      - name: Copy image to target registry
        timeout-minutes: ${{ inputs.copy_timeout_minutes }}
        uses: ./.github/actions/skopeo-copy
        with:
          source_registry: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
          source_image: ai-dynamo/dynamo
          source_tag: ${{ needs.build.outputs.target_tag_plain }}-cuda${{ steps.calculate-target-tag.outputs.cuda_version_plain }}-${{ inputs.platform }}
          target_registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
          target_image: ai-dynamo/dynamo
          target_tag: ${{ needs.build.outputs.target_tag_plain }}-cuda${{ steps.calculate-target-tag.outputs.cuda_version_plain }}-${{ inputs.platform }}
          source_aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          source_aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          target_azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          target_azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          target_azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}