docs: add docs for DGDR usage -- golden path (#7304) #305
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | |
| # SPDX-License-Identifier: Apache-2.0 | |
name: Release Pipeline

# Triggers: every push to a release/* branch, plus manual dispatch.
# A manual dispatch is only accepted from a release/X.Y.Z branch; the
# prepare-release job validates the branch name and fails the run otherwise.
on:
  push:
    branches:
      - "release/*"
  workflow_dispatch:
    inputs:
      rc_number:
        description: "RC number (e.g., 0 for rc0). Leave empty to auto-increment."
        type: string
        required: false

# Write access to repository contents: the release-publish job pushes an RC tag.
permissions:
  contents: write

# Workflow-wide values. BUILDER_NAME is unique per run attempt.
env:
  REGISTRY_IMAGE: ai-dynamo/dynamo
  BUILDER_NAME: b-${{ github.run_id }}-${{ github.run_attempt }}
| jobs: | |
| # ============================================================================ | |
| # GATE: Approval + Version Extraction | |
| # ============================================================================ | |
# Gate for manually dispatched runs. The job does no work of its own; it only
# attaches the run to the automated-release environment (presumably where the
# approval rule is configured — confirm in the repository settings).
# On push events this job is skipped.
manual-approval:
  name: Approve Manual Run
  runs-on: ubuntu-latest
  environment: automated-release
  if: github.event_name == 'workflow_dispatch'
  steps:
    - name: Manual run approved
      run: |
        echo "Manual workflow run approved via automated-release environment"
# Derives the release version from the branch name (release/X.Y.Z) and
# publishes it, together with the image tag prefix, as job outputs.
#
# Outputs:
#   version       X.Y.Z taken from the branch name
#   image_prefix  release-X.Y.Z, used as the tag prefix for every image
prepare-release:
  name: Prepare Release
  runs-on: ubuntu-latest
  outputs:
    version: ${{ steps.extract.outputs.version }}
    image_prefix: ${{ steps.extract.outputs.image_prefix }}
  steps:
    - name: Extract version from branch
      id: extract
      run: |
        set -euo pipefail

        BRANCH_NAME="${GITHUB_REF#refs/heads/}"
        VERSION="${BRANCH_NAME#release/}"

        # Manual runs get a dedicated message when started from a wrong branch.
        # GITHUB_EVENT_NAME is read from the runner environment instead of
        # interpolating the expression into the script text.
        if [[ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ]]; then
          if [[ ! "${BRANCH_NAME}" =~ ^release/[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
            echo "Error: workflow_dispatch can only be triggered from release/* branches"
            echo "Current branch: $BRANCH_NAME"
            echo "Expected pattern: release/X.Y.Z (e.g., release/0.7.0)"
            exit 1
          fi
        fi

        # Applies to push events too: release/<anything-else> is rejected here.
        if [[ ! "${VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
          echo "Error: Invalid version format: $VERSION"
          echo "Expected format: X.Y.Z (e.g., 0.7.0)"
          exit 1
        fi

        {
          echo "version=${VERSION}"
          echo "image_prefix=release-${VERSION}"
        } >> "${GITHUB_OUTPUT}"
        echo "Detected version: ${VERSION}"
| # ============================================================================ | |
| # FRAMEWORK PIPELINES (Build + Test + Distribute) | |
| # Builds amd64+arm64 images, runs tests, copies amd64 to ACR. | |
| # release-publish then copies both architectures from ECR to NGC. | |
| # | |
| # NOTE: Each job directly depends on [prepare-release, manual-approval] with | |
| # always() instead of going through an intermediate gate job. This avoids a | |
| # GitHub Actions quirk where a skipped ancestor (manual-approval on push | |
| # events) taints the entire dependency chain, causing downstream jobs to skip | |
| # even when the intermediate gate succeeds. | |
| # ============================================================================ | |
# vLLM runtime images: amd64 + arm64, CUDA 12.9 and 13.0, built through the
# shared flavor-matrix workflow with CPU-only, single-GPU and multi-GPU tests.
vllm-pipeline:
  name: vllm builds
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  needs:
    - prepare-release
    - manual-approval
  # always() keeps the job eligible on push events, where manual-approval is
  # skipped; the explicit result checks then decide whether it really runs.
  if: >-
    always() &&
    needs.prepare-release.result == 'success' &&
    (github.event_name == 'push' || needs.manual-approval.result == 'success')
  with:
    framework: vllm
    target: runtime
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["12.9", "13.0"]'
    # Same expression as the workflow-level BUILDER_NAME (the env context is
    # not available in the `with:` block of a reusable-workflow call).
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    extra_tags: |
      ${{ needs.prepare-release.outputs.image_prefix }}-vllm
    build_timeout_minutes: 120
    cpu_only_test_markers: "(pre_merge or post_merge) and vllm and gpu_0"
    cpu_only_test_timeout_minutes: 60
    single_gpu_test_markers: "(pre_merge or post_merge) and vllm and gpu_1"
    single_gpu_test_timeout_minutes: 60
    multi_gpu_test_markers: "(pre_merge or post_merge) and vllm and (gpu_2 or gpu_4)"
    multi_gpu_test_timeout_minutes: 60
# SGLang runtime images: amd64 + arm64, CUDA 12.9 and 13.0, built through the
# shared flavor-matrix workflow with CPU-only, single-GPU and multi-GPU tests.
sglang-pipeline:
  name: sglang builds
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  needs:
    - prepare-release
    - manual-approval
  # always() keeps the job eligible on push events, where manual-approval is
  # skipped; the explicit result checks then decide whether it really runs.
  if: >-
    always() &&
    needs.prepare-release.result == 'success' &&
    (github.event_name == 'push' || needs.manual-approval.result == 'success')
  with:
    framework: sglang
    target: runtime
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["12.9", "13.0"]'
    # Same expression as the workflow-level BUILDER_NAME (the env context is
    # not available in the `with:` block of a reusable-workflow call).
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    extra_tags: |
      ${{ needs.prepare-release.outputs.image_prefix }}-sglang
    build_timeout_minutes: 120
    cpu_only_test_markers: "(pre_merge or post_merge) and sglang and gpu_0"
    cpu_only_test_timeout_minutes: 60
    single_gpu_test_markers: "(pre_merge or post_merge) and sglang and gpu_1"
    single_gpu_test_timeout_minutes: 60
    multi_gpu_test_markers: "(pre_merge or post_merge) and sglang and (gpu_2 or gpu_4)"
    multi_gpu_test_timeout_minutes: 60
# TensorRT-LLM runtime images: amd64 + arm64, CUDA 13.1 only, built through the
# shared flavor-matrix workflow with CPU-only, single-GPU and multi-GPU tests.
trtllm-pipeline:
  name: trtllm builds
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  needs:
    - prepare-release
    - manual-approval
  # always() keeps the job eligible on push events, where manual-approval is
  # skipped; the explicit result checks then decide whether it really runs.
  if: >-
    always() &&
    needs.prepare-release.result == 'success' &&
    (github.event_name == 'push' || needs.manual-approval.result == 'success')
  with:
    framework: trtllm
    target: runtime
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["13.1"]'
    # Same expression as the workflow-level BUILDER_NAME (the env context is
    # not available in the `with:` block of a reusable-workflow call).
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    extra_tags: |
      ${{ needs.prepare-release.outputs.image_prefix }}-trtllm
    build_timeout_minutes: 120
    cpu_only_test_markers: "(pre_merge or post_merge) and trtllm and gpu_0"
    cpu_only_test_timeout_minutes: 60
    single_gpu_test_markers: "(pre_merge or post_merge) and trtllm and gpu_1"
    single_gpu_test_timeout_minutes: 60
    multi_gpu_test_markers: "(pre_merge or post_merge) and trtllm and (gpu_2 or gpu_4)"
    multi_gpu_test_timeout_minutes: 60
| # ============================================================================ | |
| # EFA PIPELINES (Build only, amd64) | |
| # ============================================================================ | |
# vLLM EFA variant: amd64 only, CUDA 12.9. Only the CPU-only tests run; GPU
# tests and the copy to ACR are switched off.
vllm-efa-pipeline:
  name: vllm EFA builds
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  needs:
    - prepare-release
    - manual-approval
  # always() keeps the job eligible on push events, where manual-approval is
  # skipped; the explicit result checks then decide whether it really runs.
  if: >-
    always() &&
    needs.prepare-release.result == 'success' &&
    (github.event_name == 'push' || needs.manual-approval.result == 'success')
  with:
    framework: vllm
    target: runtime
    make_efa: true
    platforms: '["amd64"]'
    cuda_versions: '["12.9"]'
    # Same expression as the workflow-level BUILDER_NAME (the env context is
    # not available in the `with:` block of a reusable-workflow call).
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    extra_tags: |
      ${{ needs.prepare-release.outputs.image_prefix }}-vllm-efa
    build_timeout_minutes: 120
    cpu_only_test_markers: "(pre_merge or post_merge) and vllm and gpu_0"
    cpu_only_test_timeout_minutes: 60
    run_single_gpu_tests: false
    run_multi_gpu_tests: false
    copy_to_acr: false
# TensorRT-LLM EFA variant: amd64 only, CUDA 13.1. Only the CPU-only tests run;
# GPU tests and the copy to ACR are switched off.
trtllm-efa-pipeline:
  name: trtllm EFA builds
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  needs:
    - prepare-release
    - manual-approval
  # always() keeps the job eligible on push events, where manual-approval is
  # skipped; the explicit result checks then decide whether it really runs.
  if: >-
    always() &&
    needs.prepare-release.result == 'success' &&
    (github.event_name == 'push' || needs.manual-approval.result == 'success')
  with:
    framework: trtllm
    target: runtime
    make_efa: true
    platforms: '["amd64"]'
    cuda_versions: '["13.1"]'
    # Same expression as the workflow-level BUILDER_NAME (the env context is
    # not available in the `with:` block of a reusable-workflow call).
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    extra_tags: |
      ${{ needs.prepare-release.outputs.image_prefix }}-trtllm-efa
    build_timeout_minutes: 120
    cpu_only_test_markers: "(pre_merge or post_merge) and trtllm and gpu_0"
    cpu_only_test_timeout_minutes: 60
    run_single_gpu_tests: false
    run_multi_gpu_tests: false
    copy_to_acr: false
| # ============================================================================ | |
| # RELEASE-SPECIFIC BUILDS | |
| # ============================================================================ | |
# Lints, tests and builds the operator image from deploy/operator, then pushes
# a multi-arch (amd64 + arm64) image to both ECR and ACR under two tags:
#   <commit sha>-operator  and  <image_prefix>-operator
#
# Outputs:
#   operator_tag  the <image_prefix>-operator tag
operator-build:
  name: Build Operator Image
  needs:
    - prepare-release
    - manual-approval
  # always() keeps the job eligible on push events, where manual-approval is
  # skipped; the explicit result checks then decide whether it really runs.
  if: |
    always() &&
    needs.prepare-release.result == 'success' &&
    (github.event_name == 'push' || needs.manual-approval.result == 'success')
  runs-on: prod-default-v2
  env:
    IMAGE_REGISTRY: ai-dynamo
    IMAGE_REPOSITORY: dynamo
    ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
  outputs:
    operator_tag: ${{ steps.build-and-push.outputs.operator_tag }}
  steps:
    - uses: actions/checkout@v4

    - name: Initialize Dynamo Builder
      uses: ./.github/actions/init-dynamo-builder
      with:
        builder_name: ${{ env.BUILDER_NAME }}
        flavor: general
        all_arch: 'true'

    - name: Docker Login
      uses: ./.github/actions/docker-login
      with:
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

    # NOTE(review): the linter and tester stages are built for linux/arm64 only
    # while the published image covers amd64 as well — confirm this is intended.
    - name: Linter
      working-directory: ./deploy/operator
      run: |
        docker buildx build --platform linux/arm64 --target linter --progress=plain \
          --build-arg "DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/" .

    - name: Tester
      working-directory: ./deploy/operator
      run: |
        docker buildx build --platform linux/arm64 --target tester --progress=plain \
          --build-arg "DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/" .

    - name: Build and push Container
      id: build-and-push
      working-directory: ./deploy/operator
      env:
        # Passed through the environment so the values are never spliced into
        # the script text.
        ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }}
        IMAGE_PREFIX: ${{ needs.prepare-release.outputs.image_prefix }}
      run: |
        set -euo pipefail

        ECR_BASE="${ECR_HOSTNAME}/${IMAGE_REGISTRY}/${IMAGE_REPOSITORY}"
        ACR_BASE="${ACR_HOSTNAME}/${IMAGE_REGISTRY}/${IMAGE_REPOSITORY}"
        SHA_TAG="${GITHUB_SHA}-operator"
        PREFIX_TAG="${IMAGE_PREFIX}-operator"

        echo "operator_tag=${PREFIX_TAG}" >> "${GITHUB_OUTPUT}"

        # One "-t <uri>" pair per registry/tag combination, kept in an array so
        # no unquoted word splitting is needed when the flags are expanded.
        TAG_ARGS=()
        for BASE in "${ECR_BASE}" "${ACR_BASE}"; do
          for TAG in "${SHA_TAG}" "${PREFIX_TAG}"; do
            TAG_ARGS+=(-t "${BASE}:${TAG}")
          done
        done

        docker buildx build --push --platform linux/amd64,linux/arm64 \
          --build-arg "DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/" \
          "${TAG_ARGS[@]}" -f Dockerfile .
# Frontend images, built by the dedicated frontend workflow. Change detection
# is skipped so the images are always built for a release.
frontend-build:
  name: Build Frontend Images
  uses: ./.github/workflows/build-frontend-image.yaml
  secrets: inherit
  needs:
    - prepare-release
    - manual-approval
  # always() keeps the job eligible on push events, where manual-approval is
  # skipped; the explicit result checks then decide whether it really runs.
  if: >-
    always() &&
    needs.prepare-release.result == 'success' &&
    (github.event_name == 'push' || needs.manual-approval.result == 'success')
  with:
    image_prefix: ${{ needs.prepare-release.outputs.image_prefix }}
    skip_change_detection: true
| # ============================================================================ | |
| # BUILDER CLEANUP | |
| # ============================================================================ | |
# Removes the per-run buildx builder once all framework and EFA pipelines have
# finished, whatever their outcome (if: always()).
# NOTE(review): operator-build initializes a builder from env.BUILDER_NAME,
# which expands to the same name, but is not listed in `needs` — confirm the
# cleanup cannot race with that job.
clean-k8s-builder:
  name: Clean K8s builder if exists
  runs-on: prod-default-small-v2
  needs:
    - vllm-pipeline
    - sglang-pipeline
    - trtllm-pipeline
    - vllm-efa-pipeline
    - trtllm-efa-pipeline
  if: always()
  steps:
    - uses: actions/checkout@v4

    # Best effort: a failure here must not prevent the removal step below.
    - name: Create K8s builders (skip bootstrap)
      continue-on-error: true
      uses: ./.github/actions/bootstrap-buildkit
      with:
        builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
        buildkit_worker_addresses: ""
        skip_bootstrap: true

    - name: Builder Cleanup
      run: |
        docker buildx rm b-${{ github.run_id }}-${{ github.run_attempt }} || true
| # ============================================================================ | |
| # DEPLOYMENT TESTS | |
| # ============================================================================ | |
# Creates a throw-away namespace on the CI AKS cluster and installs the
# dynamo-platform Helm chart there, using the operator image produced by
# operator-build. Waits for the framework pipelines to finish, but only
# requires operator-build to have succeeded.
#
# Outputs:
#   NAMESPACE  name of the namespace created for this run
deploy-operator:
  name: Deploy Operator
  runs-on: prod-default-small-v2
  needs:
    - prepare-release
    - vllm-pipeline
    - sglang-pipeline
    - trtllm-pipeline
    - operator-build
  if: |
    always() &&
    needs.operator-build.result == 'success'
  outputs:
    NAMESPACE: ${{ steps.deploy.outputs.namespace }}
  steps:
    - uses: actions/checkout@v4

    - name: Deploy Operator
      id: deploy
      env:
        # Everything below reaches the script through the environment instead
        # of being spliced into the script text: the branch name is
        # user-controlled, and secret values may contain shell metacharacters.
        BRANCH: ${{ github.ref_name }}
        KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }}
        ACR_USER: ${{ secrets.AZURE_ACR_USER }}
        ACR_PASSWORD: ${{ secrets.AZURE_ACR_PASSWORD }}
        OPERATOR_TAG: ${{ needs.prepare-release.outputs.image_prefix }}-operator
      run: |
        set -x

        # Namespace name: gh-ci-<run id>-<branch, "/" and "." -> "-", max 10 chars>-dt
        BRANCH_SANITIZED="${BRANCH//\//-}"
        BRANCH_SANITIZED="${BRANCH_SANITIZED//./-}"
        BRANCH_SANITIZED="${BRANCH_SANITIZED:0:10}"
        NAMESPACE="gh-ci-${GITHUB_RUN_ID}-${BRANCH_SANITIZED}-dt"
        echo "namespace=${NAMESPACE}" >> "${GITHUB_OUTPUT}"

        echo "${KUBECONFIG_B64}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG="${PWD}/.kubeconfig"

        kubectl create namespace "${NAMESPACE}"
        kubectl label namespaces "${NAMESPACE}" nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true
        kubectl config set-context --current --namespace="${NAMESPACE}"

        kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN="${HF_TOKEN}" -n "${NAMESPACE}" || true
        kubectl create secret docker-registry docker-imagepullsecret \
          --docker-server="${ACR_HOSTNAME}" \
          --docker-username="${ACR_USER}" \
          --docker-password="${ACR_PASSWORD}" \
          --namespace="${NAMESPACE}"

        helm repo add bitnami https://charts.bitnami.com/bitnami
        cd deploy/helm/charts/platform/
        helm dep build .
        # The --set arguments are quoted: "[0]" is a glob pattern to the shell.
        helm upgrade --install dynamo-platform . --namespace "${NAMESPACE}" \
          --set dynamo-operator.namespaceRestriction.enabled=true \
          --set "dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE}" \
          --set "dynamo-operator.controllerManager.manager.image.repository=${ACR_HOSTNAME}/ai-dynamo/dynamo" \
          --set "dynamo-operator.controllerManager.manager.image.tag=${OPERATOR_TAG}" \
          --set "dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret" \
          --set dynamo-operator.gpuDiscovery.enabled=false \
          --set dynamo-operator.upgradeCRD=false \
          --debug

        timeout 300s kubectl rollout status deployment -n "${NAMESPACE}" --watch
# vLLM deployment tests, one matrix entry per profile, run one at a time in the
# namespace created by deploy-operator. Runs whenever deploy-operator
# succeeded, regardless of the vllm-pipeline result.
deploy-test-vllm:
  name: deploy-test-vllm (${{ matrix.profile }})
  runs-on: prod-default-small-v2
  needs:
    - deploy-operator
    - vllm-pipeline
  if: always() && needs.deploy-operator.result == 'success'
  strategy:
    fail-fast: false
    max-parallel: 1
    matrix:
      profile:
        - agg
        - agg_router
        - disagg
        - disagg_router
  steps:
    - uses: actions/checkout@v4

    - name: Run Dynamo Deploy Test
      uses: ./.github/actions/dynamo-deploy-test
      with:
        framework: vllm
        profile: ${{ matrix.profile }}
        platform_arch: amd64
        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-vllm-cuda12-amd64
        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
# SGLang deployment tests, one matrix entry per profile, run one at a time in
# the namespace created by deploy-operator. Runs whenever deploy-operator
# succeeded, regardless of the sglang-pipeline result.
deploy-test-sglang:
  name: deploy-test-sglang (${{ matrix.profile }})
  runs-on: prod-default-small-v2
  needs:
    - deploy-operator
    - sglang-pipeline
  if: always() && needs.deploy-operator.result == 'success'
  strategy:
    fail-fast: false
    max-parallel: 1
    matrix:
      profile:
        - agg
        - agg_router
  steps:
    - uses: actions/checkout@v4

    - name: Run Dynamo Deploy Test
      uses: ./.github/actions/dynamo-deploy-test
      with:
        framework: sglang
        profile: ${{ matrix.profile }}
        platform_arch: amd64
        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-sglang-cuda12-amd64
        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
# TensorRT-LLM deployment tests, one matrix entry per profile, run one at a
# time in the namespace created by deploy-operator. Runs whenever
# deploy-operator succeeded, regardless of the trtllm-pipeline result.
deploy-test-trtllm:
  name: deploy-test-trtllm (${{ matrix.profile }})
  runs-on: prod-default-small-v2
  needs:
    - deploy-operator
    - trtllm-pipeline
  if: always() && needs.deploy-operator.result == 'success'
  strategy:
    fail-fast: false
    max-parallel: 1
    matrix:
      profile:
        - agg
        - agg_router
        - disagg
        - disagg_router
  steps:
    - uses: actions/checkout@v4

    - name: Run Dynamo Deploy Test
      uses: ./.github/actions/dynamo-deploy-test
      with:
        framework: trtllm
        profile: ${{ matrix.profile }}
        platform_arch: amd64
        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-trtllm-cuda13-amd64
        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
# Tears down whatever deploy-operator created: graph deployments, the Helm
# release and finally the namespace. Runs after all deployment tests,
# whatever their outcome, and does nothing when no namespace was created.
deploy-cleanup:
  name: Cleanup AKS resources
  runs-on: prod-default-small-v2
  needs:
    - deploy-operator
    - deploy-test-sglang
    - deploy-test-trtllm
    - deploy-test-vllm
  if: always()
  steps:
    - uses: actions/checkout@v4

    - name: Cleanup
      # NOTE(review): the step budget (5 min) is shorter than the 10m timeout
      # given to `helm uninstall` below; if the uninstall hangs, the step is
      # killed before the namespace is deleted — confirm which limit is intended.
      timeout-minutes: 5
      env:
        NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        # Passed through the environment rather than spliced into the script.
        KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
      run: |
        # Empty when deploy-operator was skipped or failed before creating it.
        if [ -z "${NAMESPACE}" ]; then
          echo "No namespace to clean up"
          exit 0
        fi

        echo "${KUBECONFIG_B64}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG="${PWD}/.kubeconfig"
        kubectl config set-context --current --namespace="${NAMESPACE}" --kubeconfig "${KUBECONFIG}"

        # Every command is best effort so one failure does not block the rest.
        kubectl get dynamographdeployments || true
        kubectl get all || true
        kubectl delete dynamographdeployments --all -n "${NAMESPACE}" || true
        helm uninstall dynamo-platform --namespace "${NAMESPACE}" --timeout 10m || true
        kubectl delete namespace "${NAMESPACE}" || true
| # ============================================================================ | |
| # NGC PUBLISH: RC tag, crane copy to NGC, Helm chart push | |
| # Runs after framework builds + operator + frontend complete. | |
| # Tests may fail but builds must have produced images for publishing. | |
| # ============================================================================ | |
# Tags the release candidate, mirrors the release images from ECR to NGC and
# pushes the platform Helm chart.
#
# Runs when prepare-release succeeded and at least one of the three framework
# pipelines succeeded, so some source images may legitimately be missing:
# individual copy failures are recorded and reported, and the copy step only
# fails when nothing at all could be copied.
release-publish:
  name: Tag RC & Publish to NGC
  needs:
    - prepare-release
    - vllm-pipeline
    - sglang-pipeline
    - trtllm-pipeline
    - vllm-efa-pipeline
    - trtllm-efa-pipeline
    - operator-build
    - frontend-build
  if: |
    always() && !cancelled() &&
    needs.prepare-release.result == 'success' &&
    (needs.vllm-pipeline.result == 'success' || needs.sglang-pipeline.result == 'success' || needs.trtllm-pipeline.result == 'success')
  runs-on: cpu-amd-m5-4xlarge
  environment: automated-release
  env:
    VERSION: ${{ needs.prepare-release.outputs.version }}
    IMAGE_PREFIX: ${{ needs.prepare-release.outputs.image_prefix }}
    REGISTRY_IMAGE: ai-dynamo/dynamo
    AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
  steps:
    # Full history and tags are needed to find the existing RC tags.
    - name: Checkout
      uses: actions/checkout@v4
      with:
        fetch-depth: 0
        fetch-tags: true

    # Outputs: rc_tag (vX.Y.Z-rcN), rc_number (N), ngc_version_tag (X.Y.ZrcN),
    # helm_chart_version (X.Y.Z-rcN).
    - name: Determine next RC tag
      id: rc_tag
      env:
        INPUT_RC_NUMBER: ${{ github.event.inputs.rc_number }}
      run: |
        set -euo pipefail

        if [ -n "${INPUT_RC_NUMBER}" ]; then
          if ! [[ "${INPUT_RC_NUMBER}" =~ ^[0-9]+$ ]]; then
            echo "Error: rc_number must be a non-negative integer (got: ${INPUT_RC_NUMBER})"
            exit 1
          fi
          NEXT_RC="${INPUT_RC_NUMBER}"
          echo "Using provided RC number: ${NEXT_RC}"
        else
          echo "No RC number provided. Auto-incrementing..."
          RC_PATTERN="v${VERSION}-rc"
          EXISTING_RCS=$(git tag -l "${RC_PATTERN}*" | grep -E "^v${VERSION}-rc[0-9]+$" | sort -V || true)
          if [ -z "$EXISTING_RCS" ]; then
            NEXT_RC=0
            echo "No existing RC tags found. Starting with rc0."
          else
            LAST_RC=$(echo "$EXISTING_RCS" | tail -1)
            LAST_RC_NUM="${LAST_RC#"v${VERSION}-rc"}"
            # 10# forces base 10: a zero-padded tag such as rc08 would
            # otherwise be parsed as (invalid) octal and abort the step.
            NEXT_RC=$((10#${LAST_RC_NUM} + 1))
            echo "Found existing RC tags:"
            echo "$EXISTING_RCS"
            echo "Last RC: ${LAST_RC}, Next RC number: ${NEXT_RC}"
          fi
        fi

        RC_TAG="v${VERSION}-rc${NEXT_RC}"
        {
          echo "rc_tag=${RC_TAG}"
          echo "rc_number=${NEXT_RC}"
          echo "ngc_version_tag=${VERSION}rc${NEXT_RC}"
          echo "helm_chart_version=${VERSION}-rc${NEXT_RC}"
        } >> "${GITHUB_OUTPUT}"
        echo "Will create tag: ${RC_TAG}"

    - name: Create RC tag
      env:
        RC_TAG: ${{ steps.rc_tag.outputs.rc_tag }}
      run: |
        set -euo pipefail
        git config user.name "github-actions[bot]"
        git config user.email "github-actions[bot]@users.noreply.github.com"
        git tag -a "${RC_TAG}" -m "Release candidate ${RC_TAG}"
        git push origin "${RC_TAG}"
        echo "Created and pushed tag: ${RC_TAG}"

    - name: Setup crane
      env:
        CRANE_VERSION: v0.20.2
      run: |
        set -euo pipefail
        # -f: fail on an HTTP error instead of piping an error page into tar.
        curl -fsSL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \
          | tar -xzf - crane
        sudo mv crane /usr/local/bin/
        crane version

    - name: Login to ECR
      run: |
        set -euo pipefail
        ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
        ECR_HOSTNAME="${ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"
        aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" | docker login --username AWS --password-stdin "${ECR_HOSTNAME}"

    # Both docker (manifest push) and crane (image copy) need NGC credentials.
    - name: Login to NGC
      env:
        NGC_TOKEN: ${{ secrets.NGC_PUBLISH_TOKEN }}
      run: |
        echo "${NGC_TOKEN}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
        echo "${NGC_TOKEN}" | crane auth login nvcr.io -u '$oauthtoken' --password-stdin

    # Outputs: successful_count, failed_count.
    - name: Copy images to NGC
      id: copy_images
      env:
        NGC_REGISTRY: nvcr.io
        NGC_ORG: ${{ secrets.NGC_PUBLISH_ORG }}
        NGC_VERSION_TAG: ${{ steps.rc_tag.outputs.ngc_version_tag }}
      run: |
        set -euo pipefail

        SUCCESSFUL_COPIES=()
        FAILED_COPIES=()

        ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
        ECR_HOSTNAME="${ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"
        ARCHITECTURES=("amd64" "arm64")
        NGC_BASE="${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo"

        echo "========================================"
        echo "Copying images from ECR to NGC (registry-to-registry)"
        echo "NGC Version Tag: ${NGC_VERSION_TAG}"
        echo "========================================"

        # copy_image SRC DST LABEL
        # Copies one image with crane and records the outcome under LABEL.
        # Returns 1 on failure. Because errexit is active, a caller that wants
        # to carry on after a failed copy must consume the status itself
        # ("|| true" or an "if"); a bare call would abort the whole step.
        copy_image() {
          local SRC="$1" DST="$2" LABEL="$3"
          echo "----------------------------------------"
          echo "Copying: ${LABEL}"
          if crane copy "${SRC}" "${DST}"; then
            echo " Copied: ${LABEL}"
            SUCCESSFUL_COPIES+=("${LABEL}")
            return 0
          else
            echo " Warning: Failed to copy ${LABEL}, skipping..."
            FAILED_COPIES+=("${LABEL}")
            return 1
          fi
        }

        # create_manifest MANIFEST AMD64_IMG ARM64_IMG LABEL
        # Creates and pushes a two-architecture manifest list and records the
        # outcome under "LABEL (multi-arch)". Never fails the step.
        create_manifest() {
          local MANIFEST="$1" AMD64_IMG="$2" ARM64_IMG="$3" LABEL="$4"
          echo "Creating manifest: ${MANIFEST}"
          docker manifest create "${MANIFEST}" "${AMD64_IMG}" "${ARM64_IMG}" || true
          if docker manifest push "${MANIFEST}"; then
            echo " Created multi-arch: ${LABEL}"
            SUCCESSFUL_COPIES+=("${LABEL} (multi-arch)")
          else
            echo " Failed to create multi-arch: ${LABEL}"
            FAILED_COPIES+=("${LABEL} (multi-arch)")
          fi
        }

        # ---- CUDA 12 runtime images (vllm and sglang) ----
        echo ""
        echo "=== CUDA 12 Runtime Images (vllm, sglang) ==="
        CUDA12_FRAMEWORKS=("vllm" "sglang")
        for FRAMEWORK in "${CUDA12_FRAMEWORKS[@]}"; do
          NGC_NAME="${FRAMEWORK}-runtime"
          for ARCH in "${ARCHITECTURES[@]}"; do
            SOURCE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${IMAGE_PREFIX}-${FRAMEWORK}-cuda12-${ARCH}"
            TARGET="${NGC_BASE}/${NGC_NAME}:${NGC_VERSION_TAG}-${ARCH}"
            # A failed copy is already recorded in FAILED_COPIES; keep going.
            copy_image "${SOURCE}" "${TARGET}" "${NGC_NAME}:${NGC_VERSION_TAG}-${ARCH}" || true
          done
          create_manifest \
            "${NGC_BASE}/${NGC_NAME}:${NGC_VERSION_TAG}" \
            "${NGC_BASE}/${NGC_NAME}:${NGC_VERSION_TAG}-amd64" \
            "${NGC_BASE}/${NGC_NAME}:${NGC_VERSION_TAG}-arm64" \
            "${NGC_NAME}:${NGC_VERSION_TAG}"
        done

        # ---- CUDA 13 runtime images (vllm, sglang, trtllm) ----
        echo ""
        echo "=== CUDA 13 Runtime Images (vllm, sglang, trtllm) ==="
        CUDA13_FRAMEWORKS=("vllm" "sglang" "trtllm")
        for FRAMEWORK in "${CUDA13_FRAMEWORKS[@]}"; do
          # trtllm is published on NGC under the tensorrtllm-runtime name.
          if [ "${FRAMEWORK}" = "trtllm" ]; then
            NGC_NAME="tensorrtllm-runtime"
          else
            NGC_NAME="${FRAMEWORK}-runtime"
          fi
          for ARCH in "${ARCHITECTURES[@]}"; do
            SOURCE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${IMAGE_PREFIX}-${FRAMEWORK}-cuda13-${ARCH}"
            TARGET="${NGC_BASE}/${NGC_NAME}:${NGC_VERSION_TAG}-cuda13-${ARCH}"
            copy_image "${SOURCE}" "${TARGET}" "${NGC_NAME}:${NGC_VERSION_TAG}-cuda13-${ARCH}" || true
          done
          create_manifest \
            "${NGC_BASE}/${NGC_NAME}:${NGC_VERSION_TAG}-cuda13" \
            "${NGC_BASE}/${NGC_NAME}:${NGC_VERSION_TAG}-cuda13-amd64" \
            "${NGC_BASE}/${NGC_NAME}:${NGC_VERSION_TAG}-cuda13-arm64" \
            "${NGC_NAME}:${NGC_VERSION_TAG}-cuda13"
        done

        # ---- EFA runtime images (amd64 only, no multi-arch manifest needed) ----
        echo ""
        echo "=== EFA Runtime Images ==="
        # vllm EFA (CUDA 12, amd64 only)
        SOURCE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${IMAGE_PREFIX}-vllm-efa-cuda12-amd64"
        TARGET="${NGC_BASE}/vllm-runtime:${NGC_VERSION_TAG}-efa"
        copy_image "${SOURCE}" "${TARGET}" "vllm-runtime:${NGC_VERSION_TAG}-efa" || true
        # trtllm EFA (CUDA 13, amd64 only)
        SOURCE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${IMAGE_PREFIX}-trtllm-efa-cuda13-amd64"
        TARGET="${NGC_BASE}/tensorrtllm-runtime:${NGC_VERSION_TAG}-efa"
        copy_image "${SOURCE}" "${TARGET}" "tensorrtllm-runtime:${NGC_VERSION_TAG}-efa" || true

        # ---- Frontend images ----
        echo ""
        echo "=== Frontend Images ==="
        FRONTEND_IMAGES=()
        for ARCH in "${ARCHITECTURES[@]}"; do
          SOURCE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${IMAGE_PREFIX}-frontend-${ARCH}"
          TARGET="${NGC_BASE}/dynamo-frontend:${NGC_VERSION_TAG}-${ARCH}"
          if copy_image "${SOURCE}" "${TARGET}" "dynamo-frontend:${NGC_VERSION_TAG}-${ARCH}"; then
            FRONTEND_IMAGES+=("${TARGET}")
          fi
        done
        # The manifest is only attempted when both architectures were copied.
        if [ ${#FRONTEND_IMAGES[@]} -eq 2 ]; then
          create_manifest \
            "${NGC_BASE}/dynamo-frontend:${NGC_VERSION_TAG}" \
            "${FRONTEND_IMAGES[0]}" "${FRONTEND_IMAGES[1]}" \
            "dynamo-frontend:${NGC_VERSION_TAG}"
        else
          echo "Warning: Not all frontend architectures available, skipping multi-arch manifest"
          FAILED_COPIES+=("dynamo-frontend:${NGC_VERSION_TAG} (multi-arch - missing archs)")
        fi

        # ---- Operator image (multi-arch manifest already built by operator-build) ----
        echo ""
        echo "=== Operator Image ==="
        OPERATOR_SOURCE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${IMAGE_PREFIX}-operator"
        OPERATOR_TARGET="${NGC_BASE}/kubernetes-operator:${NGC_VERSION_TAG}"
        copy_image "${OPERATOR_SOURCE}" "${OPERATOR_TARGET}" "kubernetes-operator:${NGC_VERSION_TAG}" || true

        # ---- Summary ----
        {
          echo "successful_count=${#SUCCESSFUL_COPIES[@]}"
          echo "failed_count=${#FAILED_COPIES[@]}"
        } >> "${GITHUB_OUTPUT}"
        printf '%s\n' "${SUCCESSFUL_COPIES[@]}" > /tmp/successful_copies.txt
        printf '%s\n' "${FAILED_COPIES[@]}" > /tmp/failed_copies.txt 2>/dev/null || true

        echo "========================================"
        echo "NGC Publishing Summary:"
        echo " Successful: ${#SUCCESSFUL_COPIES[@]}"
        echo " Failed: ${#FAILED_COPIES[@]}"
        echo "========================================"

        # Partial success is accepted; only a completely empty result fails.
        if [ ${#SUCCESSFUL_COPIES[@]} -eq 0 ]; then
          echo "ERROR: No images were successfully copied to NGC!"
          exit 1
        fi

    - name: Package and push Helm charts to NGC
      env:
        NGC_HELM_REPO: https://helm.ngc.nvidia.com/${{ secrets.NGC_PUBLISH_ORG }}/ai-dynamo
        NGC_TOKEN: ${{ secrets.NGC_PUBLISH_TOKEN }}
        HELM_CHART_VERSION: ${{ steps.rc_tag.outputs.helm_chart_version }}
      run: |
        set -euo pipefail

        REPO_ALIAS="ngc-staging-dynamo"
        helm plugin install https://github.com/chartmuseum/helm-push || true
        # Output is discarded so the credentials never reach the log.
        helm repo add "${REPO_ALIAS}" \
          --username='$oauthtoken' \
          --password="${NGC_TOKEN}" \
          "${NGC_HELM_REPO}" > /dev/null 2>&1
        helm repo add nats https://nats-io.github.io/k8s/helm/charts/ || true
        helm repo add bitnami https://charts.bitnami.com/bitnami || true

        {
          echo ""
          echo "### Helm Charts"
        } >> "${GITHUB_STEP_SUMMARY}"

        PLATFORM_CHART_DIR="deploy/helm/charts/platform"
        CHART_NAME=$(awk '/^name:/ {print $2}' "${PLATFORM_CHART_DIR}/Chart.yaml")

        pushd "${PLATFORM_CHART_DIR}"
        helm dep build .
        popd

        echo "Packaging ${CHART_NAME} with version ${HELM_CHART_VERSION}..."
        helm package \
          --version "${HELM_CHART_VERSION}" \
          --app-version "${HELM_CHART_VERSION}" \
          "${PLATFORM_CHART_DIR}"

        CHART_FILE="${CHART_NAME}-${HELM_CHART_VERSION}.tgz"
        echo "Pushing ${CHART_FILE} to NGC Helm registry..."
        helm cm-push "${CHART_FILE}" "${REPO_ALIAS}"
        echo "- \`${CHART_NAME}:${HELM_CHART_VERSION}\` pushed to NGC Helm registry" >> "${GITHUB_STEP_SUMMARY}"

        helm repo remove "${REPO_ALIAS}"

    - name: Create release summary
      env:
        RC_TAG: ${{ steps.rc_tag.outputs.rc_tag }}
        NGC_VERSION_TAG: ${{ steps.rc_tag.outputs.ngc_version_tag }}
        HELM_CHART_VERSION: ${{ steps.rc_tag.outputs.helm_chart_version }}
        SUCCESSFUL_COUNT: ${{ steps.copy_images.outputs.successful_count }}
        FAILED_COUNT: ${{ steps.copy_images.outputs.failed_count }}
      run: |
        # Commit and branch come from the runner-provided GITHUB_SHA and
        # GITHUB_REF_NAME variables, so the branch name is never spliced into
        # the script text. The whole report is appended in one redirect.
        {
          echo "## Release Summary"
          echo ""
          echo "| Property | Value |"
          echo "|----------|-------|"
          echo "| Version | ${VERSION} |"
          echo "| Git Tag | ${RC_TAG} |"
          echo "| NGC Version Tag | ${NGC_VERSION_TAG} |"
          echo "| Commit | ${GITHUB_SHA} |"
          echo "| Branch | ${GITHUB_REF_NAME} |"
          echo ""
          echo "### NGC Publishing Results"
          echo ""
          echo "- **Successful copies**: ${SUCCESSFUL_COUNT}"
          echo "- **Failed copies**: ${FAILED_COUNT}"
          echo ""
          echo "### Expected Images"
          echo ""
          echo "Runtime images (CUDA 12 - default):"
          echo "- \`vllm-runtime:${NGC_VERSION_TAG}\` (multi-arch)"
          echo "- \`sglang-runtime:${NGC_VERSION_TAG}\` (multi-arch)"
          echo ""
          echo "Runtime images (CUDA 13):"
          echo "- \`vllm-runtime:${NGC_VERSION_TAG}-cuda13\` (multi-arch)"
          echo "- \`sglang-runtime:${NGC_VERSION_TAG}-cuda13\` (multi-arch)"
          echo "- \`tensorrtllm-runtime:${NGC_VERSION_TAG}-cuda13\` (multi-arch)"
          echo ""
          echo "EFA runtime images (amd64 only):"
          echo "- \`vllm-runtime:${NGC_VERSION_TAG}-efa\`"
          echo "- \`tensorrtllm-runtime:${NGC_VERSION_TAG}-efa\`"
          echo ""
          echo "Operator image:"
          echo "- \`kubernetes-operator:${NGC_VERSION_TAG}\`"
          echo ""
          echo "Frontend images:"
          echo "- \`dynamo-frontend:${NGC_VERSION_TAG}\` (multi-arch)"
          echo ""
          echo "Helm chart:"
          echo "- \`dynamo-platform:${HELM_CHART_VERSION}\` (pushed to NGC Helm registry)"
        } >> "${GITHUB_STEP_SUMMARY}"