Skip to content

docs: add docs for DGDR usage -- golden path (#7304) #305

docs: add docs for DGDR usage -- golden path (#7304)

docs: add docs for DGDR usage -- golden path (#7304) #305

Workflow file for this run

# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
name: Release Pipeline

# Triggers: any push to a release/* branch, or a manual dispatch.
on:
  push:
    branches: ['release/*']
  workflow_dispatch:
    # Manual runs must also start from a release/* branch; the prepare-release
    # job validates the branch name and fails the run otherwise.
    inputs:
      rc_number:
        type: string
        required: false
        description: 'RC number (e.g., 0 for rc0). Leave empty to auto-increment.'

# Write access to repository contents (release-publish pushes an RC tag).
permissions:
  contents: write

env:
  REGISTRY_IMAGE: ai-dynamo/dynamo
  # Builder name derived from the run id and attempt number.
  BUILDER_NAME: b-${{ github.run_id }}-${{ github.run_attempt }}
jobs:
# ============================================================================
# GATE: Approval + Version Extraction
# ============================================================================
manual-approval:
  # Only exists for manually dispatched runs. The job is bound to the
  # automated-release environment (presumably configured with required
  # reviewers - confirm in the repository settings); its single step just logs.
  name: Approve Manual Run
  environment: automated-release
  runs-on: ubuntu-latest
  if: ${{ github.event_name == 'workflow_dispatch' }}
  steps:
    - name: Manual run approved
      run: |
        echo "Manual workflow run approved via automated-release environment"
prepare-release:
  # Derives the release version (X.Y.Z) and the image tag prefix from the
  # branch name and exposes both as job outputs.
  name: Prepare Release
  runs-on: ubuntu-latest
  outputs:
    version: ${{ steps.extract.outputs.version }}
    image_prefix: ${{ steps.extract.outputs.image_prefix }}
  steps:
    - name: Extract version from branch
      id: extract
      run: |
        # refs/heads/release/0.7.0 -> release/0.7.0 -> 0.7.0
        branch="${GITHUB_REF#refs/heads/}"
        semver="${branch#release/}"
        semver_re='^[0-9]+\.[0-9]+\.[0-9]+$'

        # Manual dispatch gets its own, more specific error message.
        if [[ "${{ github.event_name }}" == "workflow_dispatch" && ! "$branch" =~ ^release/[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
          echo "Error: workflow_dispatch can only be triggered from release/* branches"
          echo "Current branch: $branch"
          echo "Expected pattern: release/X.Y.Z (e.g., release/0.7.0)"
          exit 1
        fi

        # Applies to every trigger: the suffix after release/ must be X.Y.Z.
        if [[ ! "$semver" =~ $semver_re ]]; then
          echo "Error: Invalid version format: $semver"
          echo "Expected format: X.Y.Z (e.g., 0.7.0)"
          exit 1
        fi

        {
          echo "version=${semver}"
          echo "image_prefix=release-${semver}"
        } >> $GITHUB_OUTPUT
        echo "Detected version: ${semver}"
# ============================================================================
# FRAMEWORK PIPELINES (Build + Test + Distribute)
# Builds amd64+arm64 images, runs tests, copies amd64 to ACR.
# release-publish then copies both architectures from ECR to NGC.
#
# NOTE: Each job directly depends on [prepare-release, manual-approval] with
# always() instead of going through an intermediate gate job. This avoids a
# GitHub Actions quirk where a skipped ancestor (manual-approval on push
# events) taints the entire dependency chain, causing downstream jobs to skip
# even when the intermediate gate succeeds.
# ============================================================================
vllm-pipeline:
  # vLLM runtime images, delegated to the shared flavor-matrix workflow.
  name: vllm builds
  needs: [prepare-release, manual-approval]
  # Proceed when version extraction succeeded and either this is a push or the
  # manual-approval gate passed. always() prevents the skipped manual-approval
  # job (push events) from skipping this job as well.
  if: >-
    always()
    && needs.prepare-release.result == 'success'
    && (github.event_name == 'push' || needs.manual-approval.result == 'success')
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  with:
    framework: vllm
    target: runtime
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["12.9", "13.0"]'
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    extra_tags: |
      ${{ needs.prepare-release.outputs.image_prefix }}-vllm
    build_timeout_minutes: 120
    # Pytest marker expressions and timeouts per test tier.
    cpu_only_test_markers: '(pre_merge or post_merge) and vllm and gpu_0'
    cpu_only_test_timeout_minutes: 60
    single_gpu_test_markers: '(pre_merge or post_merge) and vllm and gpu_1'
    single_gpu_test_timeout_minutes: 60
    multi_gpu_test_markers: '(pre_merge or post_merge) and vllm and (gpu_2 or gpu_4)'
    multi_gpu_test_timeout_minutes: 60
sglang-pipeline:
  # SGLang runtime images, delegated to the shared flavor-matrix workflow.
  name: sglang builds
  needs: [prepare-release, manual-approval]
  # Proceed when version extraction succeeded and either this is a push or the
  # manual-approval gate passed. always() prevents the skipped manual-approval
  # job (push events) from skipping this job as well.
  if: >-
    always()
    && needs.prepare-release.result == 'success'
    && (github.event_name == 'push' || needs.manual-approval.result == 'success')
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  with:
    framework: sglang
    target: runtime
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["12.9", "13.0"]'
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    extra_tags: |
      ${{ needs.prepare-release.outputs.image_prefix }}-sglang
    build_timeout_minutes: 120
    # Pytest marker expressions and timeouts per test tier.
    cpu_only_test_markers: '(pre_merge or post_merge) and sglang and gpu_0'
    cpu_only_test_timeout_minutes: 60
    single_gpu_test_markers: '(pre_merge or post_merge) and sglang and gpu_1'
    single_gpu_test_timeout_minutes: 60
    multi_gpu_test_markers: '(pre_merge or post_merge) and sglang and (gpu_2 or gpu_4)'
    multi_gpu_test_timeout_minutes: 60
trtllm-pipeline:
  # TensorRT-LLM runtime images, delegated to the shared flavor-matrix workflow.
  name: trtllm builds
  needs: [prepare-release, manual-approval]
  # Proceed when version extraction succeeded and either this is a push or the
  # manual-approval gate passed. always() prevents the skipped manual-approval
  # job (push events) from skipping this job as well.
  if: >-
    always()
    && needs.prepare-release.result == 'success'
    && (github.event_name == 'push' || needs.manual-approval.result == 'success')
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  with:
    framework: trtllm
    target: runtime
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["13.1"]'
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    extra_tags: |
      ${{ needs.prepare-release.outputs.image_prefix }}-trtllm
    build_timeout_minutes: 120
    # Pytest marker expressions and timeouts per test tier.
    cpu_only_test_markers: '(pre_merge or post_merge) and trtllm and gpu_0'
    cpu_only_test_timeout_minutes: 60
    single_gpu_test_markers: '(pre_merge or post_merge) and trtllm and gpu_1'
    single_gpu_test_timeout_minutes: 60
    multi_gpu_test_markers: '(pre_merge or post_merge) and trtllm and (gpu_2 or gpu_4)'
    multi_gpu_test_timeout_minutes: 60
# ============================================================================
# EFA PIPELINES (amd64 only; single/multi-GPU tests and ACR copy disabled)
# ============================================================================
vllm-efa-pipeline:
  # EFA flavor of the vLLM runtime image: amd64 / CUDA 12.9 only, with the
  # single- and multi-GPU test tiers and the ACR copy switched off.
  name: vllm EFA builds
  needs: [prepare-release, manual-approval]
  # Same gate as the framework pipelines: push, or an approved manual run.
  if: >-
    always()
    && needs.prepare-release.result == 'success'
    && (github.event_name == 'push' || needs.manual-approval.result == 'success')
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  with:
    framework: vllm
    target: runtime
    make_efa: true
    platforms: '["amd64"]'
    cuda_versions: '["12.9"]'
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    extra_tags: |
      ${{ needs.prepare-release.outputs.image_prefix }}-vllm-efa
    build_timeout_minutes: 120
    cpu_only_test_markers: '(pre_merge or post_merge) and vllm and gpu_0'
    cpu_only_test_timeout_minutes: 60
    run_single_gpu_tests: false
    run_multi_gpu_tests: false
    copy_to_acr: false
trtllm-efa-pipeline:
  # EFA flavor of the TensorRT-LLM runtime image: amd64 / CUDA 13.1 only, with
  # the single- and multi-GPU test tiers and the ACR copy switched off.
  name: trtllm EFA builds
  needs: [prepare-release, manual-approval]
  # Same gate as the framework pipelines: push, or an approved manual run.
  if: >-
    always()
    && needs.prepare-release.result == 'success'
    && (github.event_name == 'push' || needs.manual-approval.result == 'success')
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  with:
    framework: trtllm
    target: runtime
    make_efa: true
    platforms: '["amd64"]'
    cuda_versions: '["13.1"]'
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    extra_tags: |
      ${{ needs.prepare-release.outputs.image_prefix }}-trtllm-efa
    build_timeout_minutes: 120
    cpu_only_test_markers: '(pre_merge or post_merge) and trtllm and gpu_0'
    cpu_only_test_timeout_minutes: 60
    run_single_gpu_tests: false
    run_multi_gpu_tests: false
    copy_to_acr: false
# ============================================================================
# RELEASE-SPECIFIC BUILDS
# ============================================================================
operator-build:
  # Lints, tests, then builds the operator image for amd64+arm64 and pushes it
  # to both ECR and ACR under a commit-SHA tag and a release-prefix tag.
  name: Build Operator Image
  runs-on: prod-default-v2
  needs: [prepare-release, manual-approval]
  # Same gate as the framework pipelines: push, or an approved manual run.
  if: >-
    always()
    && needs.prepare-release.result == 'success'
    && (github.event_name == 'push' || needs.manual-approval.result == 'success')
  env:
    IMAGE_REGISTRY: ai-dynamo
    IMAGE_REPOSITORY: dynamo
    ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
  outputs:
    operator_tag: ${{ steps.build-and-push.outputs.operator_tag }}
  steps:
    - uses: actions/checkout@v4

    - name: Initialize Dynamo Builder
      uses: ./.github/actions/init-dynamo-builder
      with:
        builder_name: ${{ env.BUILDER_NAME }}
        flavor: general
        all_arch: 'true'

    - name: Docker Login
      uses: ./.github/actions/docker-login
      with:
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

    # The linter and tester are Dockerfile targets; building them runs them.
    - name: Linter
      working-directory: ./deploy/operator
      run: |
        docker buildx build --platform linux/arm64 --target linter --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ .

    - name: Tester
      working-directory: ./deploy/operator
      run: |
        docker buildx build --platform linux/arm64 --target tester --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ .

    - name: Build and push Container
      id: build-and-push
      working-directory: ./deploy/operator
      run: |
        ecr_repo="${ECR_HOSTNAME}/${IMAGE_REGISTRY}/${IMAGE_REPOSITORY}"
        acr_repo="${{ secrets.AZURE_ACR_HOSTNAME }}/${IMAGE_REGISTRY}/${IMAGE_REPOSITORY}"
        sha_tag="${{ github.sha }}-operator"
        prefix_tag="${{ needs.prepare-release.outputs.image_prefix }}-operator"

        # Downstream jobs reference the release-prefix tag.
        echo "operator_tag=${prefix_tag}" >> $GITHUB_OUTPUT

        # One -t flag per registry/tag combination.
        tag_args=()
        for repo in "${ecr_repo}" "${acr_repo}"; do
          for tag in "${sha_tag}" "${prefix_tag}"; do
            tag_args+=(-t "${repo}:${tag}")
          done
        done

        docker buildx build --push --platform linux/amd64,linux/arm64 \
          --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \
          "${tag_args[@]}" -f Dockerfile .
frontend-build:
  # Frontend images, delegated to the dedicated frontend workflow. Change
  # detection is skipped and the release prefix is passed for tagging.
  name: Build Frontend Images
  needs: [prepare-release, manual-approval]
  # Same gate as the framework pipelines: push, or an approved manual run.
  if: >-
    always()
    && needs.prepare-release.result == 'success'
    && (github.event_name == 'push' || needs.manual-approval.result == 'success')
  uses: ./.github/workflows/build-frontend-image.yaml
  secrets: inherit
  with:
    image_prefix: ${{ needs.prepare-release.outputs.image_prefix }}
    skip_change_detection: true
# ============================================================================
# BUILDER CLEANUP
# ============================================================================
clean-k8s-builder:
  # Removes the per-run buildx builder once every job that uses it is done.
  # Runs regardless of upstream results (always()).
  name: Clean K8s builder if exists
  runs-on: prod-default-small-v2
  if: always()
  # operator-build initializes a builder under the same BUILDER_NAME, so it
  # must finish before the builder is removed; otherwise cleanup can race with
  # an in-flight operator build.
  # NOTE(review): frontend-build is a reusable workflow whose builder name is
  # not visible here - add it to this list if it shares the builder.
  needs:
    - vllm-pipeline
    - sglang-pipeline
    - trtllm-pipeline
    - vllm-efa-pipeline
    - trtllm-efa-pipeline
    - operator-build
  steps:
    - uses: actions/checkout@v4

    # Attach to the existing builder without bootstrapping new workers, so the
    # removal below has a builder context to act on. Best effort.
    - name: Create K8s builders (skip bootstrap)
      uses: ./.github/actions/bootstrap-buildkit
      continue-on-error: true
      with:
        builder_name: ${{ env.BUILDER_NAME }}
        buildkit_worker_addresses: ''
        skip_bootstrap: true

    # "|| true": the builder may not exist (e.g. builds never started).
    - name: Builder Cleanup
      run: docker buildx rm "${BUILDER_NAME}" || true
# ============================================================================
# DEPLOYMENT TESTS
# ============================================================================
deploy-operator:
  # Creates a throwaway namespace on the CI AKS cluster, installs the
  # dynamo-platform Helm chart with the freshly built operator image, and waits
  # for the rollout. The namespace name is exported for the deploy-test jobs.
  name: Deploy Operator
  runs-on: prod-default-small-v2
  needs: [prepare-release, vllm-pipeline, sglang-pipeline, trtllm-pipeline, operator-build]
  # Only the operator build has to succeed; always() lets this run even when a
  # framework pipeline did not succeed.
  if: |
    always() &&
    needs.operator-build.result == 'success'
  outputs:
    NAMESPACE: ${{ steps.deploy.outputs.namespace }}
  steps:
    - uses: actions/checkout@v4
    - name: Deploy Operator
      id: deploy
      # Secrets and expression values are passed through the environment and
      # expanded quoted in the script, so values containing spaces or shell
      # metacharacters cannot break or alter the commands.
      env:
        KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }}
        ACR_USER: ${{ secrets.AZURE_ACR_USER }}
        ACR_PASSWORD: ${{ secrets.AZURE_ACR_PASSWORD }}
        OPERATOR_TAG: ${{ needs.prepare-release.outputs.image_prefix }}-operator
      run: |
        set -x

        # Namespace: gh-ci-<run id>-<branch, "/" and "." -> "-", max 10 chars>-dt
        BRANCH_SANITIZED="${GITHUB_REF_NAME//\//-}"
        BRANCH_SANITIZED="${BRANCH_SANITIZED//./-}"
        BRANCH_SANITIZED="${BRANCH_SANITIZED:0:10}"
        NAMESPACE="gh-ci-${GITHUB_RUN_ID}-${BRANCH_SANITIZED}-dt"
        echo "namespace=${NAMESPACE}" >> "$GITHUB_OUTPUT"

        # Materialize the kubeconfig with owner-only permissions.
        echo "${KUBECONFIG_B64}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG="$(pwd)/.kubeconfig"

        kubectl config set-context --current --namespace="${NAMESPACE}"
        kubectl create namespace "${NAMESPACE}"
        kubectl label namespaces "${NAMESPACE}" nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true

        # Best effort: a failure to create the HF token secret does not abort.
        kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN="${HF_TOKEN}" -n "${NAMESPACE}" || true
        kubectl create secret docker-registry docker-imagepullsecret \
          --docker-server="${ACR_HOSTNAME}" \
          --docker-username="${ACR_USER}" \
          --docker-password="${ACR_PASSWORD}" \
          --namespace="${NAMESPACE}"

        helm repo add bitnami https://charts.bitnami.com/bitnami
        cd deploy/helm/charts/platform/
        helm dep build .
        # The "[0]" arguments are quoted so the shell never treats them as globs.
        helm upgrade --install dynamo-platform . --namespace "${NAMESPACE}" \
          --set dynamo-operator.namespaceRestriction.enabled=true \
          --set "dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE}" \
          --set dynamo-operator.controllerManager.manager.image.repository="${ACR_HOSTNAME}/ai-dynamo/dynamo" \
          --set dynamo-operator.controllerManager.manager.image.tag="${OPERATOR_TAG}" \
          --set "dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret" \
          --set dynamo-operator.gpuDiscovery.enabled=false \
          --set dynamo-operator.upgradeCRD=false \
          --debug

        # Give the deployments in the namespace up to 5 minutes to roll out.
        timeout 300s kubectl rollout status deployment -n "${NAMESPACE}" --watch
deploy-test-vllm:
  # Runs the vLLM deployment test once per profile, one profile at a time,
  # in the namespace created by deploy-operator.
  name: deploy-test-vllm (${{ matrix.profile }})
  runs-on: prod-default-small-v2
  needs: [deploy-operator, vllm-pipeline]
  if: ${{ always() && needs.deploy-operator.result == 'success' }}
  strategy:
    # A failing profile does not cancel the remaining ones.
    fail-fast: false
    max-parallel: 1
    matrix:
      profile:
        - agg
        - agg_router
        - disagg
        - disagg_router
  steps:
    - uses: actions/checkout@v4
    - name: Run Dynamo Deploy Test
      uses: ./.github/actions/dynamo-deploy-test
      with:
        framework: vllm
        profile: ${{ matrix.profile }}
        platform_arch: amd64
        image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-vllm-cuda12-amd64
        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
deploy-test-sglang:
  # Runs the SGLang deployment test once per profile, one profile at a time,
  # in the namespace created by deploy-operator.
  name: deploy-test-sglang (${{ matrix.profile }})
  runs-on: prod-default-small-v2
  needs: [deploy-operator, sglang-pipeline]
  if: ${{ always() && needs.deploy-operator.result == 'success' }}
  strategy:
    # A failing profile does not cancel the remaining ones.
    fail-fast: false
    max-parallel: 1
    matrix:
      profile:
        - agg
        - agg_router
  steps:
    - uses: actions/checkout@v4
    - name: Run Dynamo Deploy Test
      uses: ./.github/actions/dynamo-deploy-test
      with:
        framework: sglang
        profile: ${{ matrix.profile }}
        platform_arch: amd64
        image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-sglang-cuda12-amd64
        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
deploy-test-trtllm:
  # Runs the TensorRT-LLM deployment test once per profile, one profile at a
  # time, in the namespace created by deploy-operator.
  name: deploy-test-trtllm (${{ matrix.profile }})
  runs-on: prod-default-small-v2
  needs: [deploy-operator, trtllm-pipeline]
  if: ${{ always() && needs.deploy-operator.result == 'success' }}
  strategy:
    # A failing profile does not cancel the remaining ones.
    fail-fast: false
    max-parallel: 1
    matrix:
      profile:
        - agg
        - agg_router
        - disagg
        - disagg_router
  steps:
    - uses: actions/checkout@v4
    - name: Run Dynamo Deploy Test
      uses: ./.github/actions/dynamo-deploy-test
      with:
        framework: trtllm
        profile: ${{ matrix.profile }}
        platform_arch: amd64
        image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-trtllm-cuda13-amd64
        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
deploy-cleanup:
  # Tears down whatever deploy-operator and the deploy tests left behind.
  # Runs regardless of upstream results; every teardown command is best effort.
  name: Cleanup AKS resources
  runs-on: prod-default-small-v2
  needs: [deploy-operator, deploy-test-sglang, deploy-test-trtllm, deploy-test-vllm]
  if: ${{ always() }}
  steps:
    - uses: actions/checkout@v4
    - name: Cleanup
      # NOTE(review): this 5-minute step limit is shorter than the 10m helm
      # uninstall timeout below - confirm which of the two is intended.
      timeout-minutes: 5
      env:
        NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
      run: |
        # deploy-operator produced no namespace output: nothing to remove.
        if [ -z "$NAMESPACE" ]; then
          echo "No namespace to clean up"
          exit 0
        fi

        echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"

        # Log the current state before deleting anything.
        kubectl get dynamographdeployments || true
        kubectl get all || true

        # Custom resources first, then the chart, then the namespace itself.
        kubectl delete dynamographdeployments --all -n $NAMESPACE || true
        helm uninstall dynamo-platform --namespace $NAMESPACE --timeout 10m || true
        kubectl delete namespace $NAMESPACE || true
# ============================================================================
# NGC PUBLISH: RC tag, crane copy to NGC, Helm chart push
# Runs after framework builds + operator + frontend complete.
# Tests may fail but builds must have produced images for publishing.
# ============================================================================
release-publish:
  # Tags the release candidate and publishes images and the Helm chart to NGC.
  name: Tag RC & Publish to NGC
  runs-on: cpu-amd-m5-4xlarge
  environment: automated-release
  needs:
    - prepare-release
    - vllm-pipeline
    - sglang-pipeline
    - trtllm-pipeline
    - vllm-efa-pipeline
    - trtllm-efa-pipeline
    - operator-build
    - frontend-build
  # Run unless the workflow was cancelled, provided the version was extracted
  # and at least one of the three framework pipelines succeeded.
  if: >-
    always() && !cancelled()
    && needs.prepare-release.result == 'success'
    && (needs.vllm-pipeline.result == 'success'
    || needs.sglang-pipeline.result == 'success'
    || needs.trtllm-pipeline.result == 'success')
  env:
    VERSION: ${{ needs.prepare-release.outputs.version }}
    IMAGE_PREFIX: ${{ needs.prepare-release.outputs.image_prefix }}
    REGISTRY_IMAGE: ai-dynamo/dynamo
    AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
  steps:
# Full history and all tags are fetched; the RC numbering step lists tags.
- name: Checkout
  uses: actions/checkout@v4
  with:
    fetch-tags: true
    fetch-depth: 0
# Chooses the RC number - the rc_number input when given, otherwise one more
# than the highest existing v<VERSION>-rcN tag (rc0 when none exists) - and
# exports the derived git tag, NGC tag and Helm chart version as step outputs.
- name: Determine next RC tag
  id: rc_tag
  env:
    INPUT_RC_NUMBER: ${{ github.event.inputs.rc_number }}
  run: |
    set -euo pipefail
    if [ -n "${INPUT_RC_NUMBER}" ]; then
      if ! [[ "${INPUT_RC_NUMBER}" =~ ^[0-9]+$ ]]; then
        echo "Error: rc_number must be a non-negative integer (got: ${INPUT_RC_NUMBER})"
        exit 1
      fi
      NEXT_RC="${INPUT_RC_NUMBER}"
      echo "Using provided RC number: ${NEXT_RC}"
    else
      echo "No RC number provided. Auto-incrementing..."
      RC_PATTERN="v${VERSION}-rc"
      # Escape the dots so the version is matched literally by grep -E.
      VERSION_RE="${VERSION//./\\.}"
      # "|| true": grep exits non-zero when no tag matches, which is not an error.
      EXISTING_RCS=$(git tag -l "${RC_PATTERN}*" | grep -E "^v${VERSION_RE}-rc[0-9]+$" | sort -V || true)
      if [ -z "$EXISTING_RCS" ]; then
        NEXT_RC=0
        echo "No existing RC tags found. Starting with rc0."
      else
        LAST_RC=$(echo "$EXISTING_RCS" | tail -1)
        LAST_RC_NUM=${LAST_RC#v${VERSION}-rc}
        # Force base 10: a zero-padded number such as "08" would otherwise be
        # parsed as octal and abort the arithmetic expansion.
        NEXT_RC=$((10#${LAST_RC_NUM} + 1))
        echo "Found existing RC tags:"
        echo "$EXISTING_RCS"
        echo "Last RC: ${LAST_RC}, Next RC number: ${NEXT_RC}"
      fi
    fi
    RC_TAG="v${VERSION}-rc${NEXT_RC}"
    {
      echo "rc_tag=${RC_TAG}"
      echo "rc_number=${NEXT_RC}"
      # The NGC tag has no dash before "rc"; the Helm chart version keeps it.
      echo "ngc_version_tag=${VERSION}rc${NEXT_RC}"
      echo "helm_chart_version=${VERSION}-rc${NEXT_RC}"
    } >> "$GITHUB_OUTPUT"
    echo "Will create tag: ${RC_TAG}"
# Creates the annotated RC tag on the checked-out commit and pushes it.
- name: Create RC tag
  env:
    RC_TAG: ${{ steps.rc_tag.outputs.rc_tag }}
  run: |
    set -euo pipefail

    # Identity used for the annotated tag.
    git config user.email "github-actions[bot]@users.noreply.github.com"
    git config user.name "github-actions[bot]"

    git tag -a "${RC_TAG}" -m "Release candidate ${RC_TAG}"
    git push origin "${RC_TAG}"
    echo "Created and pushed tag: ${RC_TAG}"
# Installs the pinned crane release into /usr/local/bin.
- name: Setup crane
  env:
    CRANE_VERSION: v0.20.2
  run: |
    # pipefail + curl -f: a failed download (HTTP error, network problem) now
    # fails the step at the curl stage instead of piping an error page to tar.
    set -euo pipefail
    curl -fsSL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \
      | tar -xzf - crane
    sudo mv crane /usr/local/bin/
    crane version
# Logs docker in to the account's ECR registry; the hostname is derived from
# the caller identity and AWS_DEFAULT_REGION.
- name: Login to ECR
  run: |
    aws_account="$(aws sts get-caller-identity --query Account --output text)"
    ecr_host="${aws_account}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"
    aws ecr get-login-password --region ${AWS_DEFAULT_REGION} \
      | docker login --username AWS --password-stdin "${ecr_host}"
# Authenticates both docker and crane against nvcr.io with the publish token.
- name: Login to NGC
  env:
    NGC_TOKEN: ${{ secrets.NGC_PUBLISH_TOKEN }}
  run: |
    # '$oauthtoken' is the literal user name, hence the single quotes.
    echo "${NGC_TOKEN}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
    echo "${NGC_TOKEN}" | crane auth login nvcr.io -u '$oauthtoken' --password-stdin
# Copies every release image from ECR to NGC and publishes multi-arch manifests.
# Individual failures are recorded and skipped; the step fails only when
# nothing at all could be copied. Outputs: successful_count, failed_count.
- name: Copy images to NGC
  id: copy_images
  env:
    NGC_REGISTRY: nvcr.io
    NGC_ORG: ${{ secrets.NGC_PUBLISH_ORG }}
    NGC_VERSION_TAG: ${{ steps.rc_tag.outputs.ngc_version_tag }}
  run: |
    set -euo pipefail
    SUCCESSFUL_COPIES=()
    FAILED_COPIES=()
    ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
    ECR_HOSTNAME="${ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"
    ARCHITECTURES=("amd64" "arm64")
    ECR_REPO="${ECR_HOSTNAME}/${REGISTRY_IMAGE}"
    NGC_BASE="${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo"

    echo "========================================"
    echo "Copying images from ECR to NGC (registry-to-registry)"
    echo "NGC Version Tag: ${NGC_VERSION_TAG}"
    echo "========================================"

    # copy_image SRC DST LABEL
    # Copies one image and records the outcome. Returns 0 on success, 1 on
    # failure. Because of "set -e", callers MUST invoke it in a conditional
    # context ("if ...", "... || true"); a bare call would abort the whole
    # step on the first failed copy.
    copy_image() {
      local SRC="$1" DST="$2" LABEL="$3"
      echo "----------------------------------------"
      echo "Copying: ${LABEL}"
      if crane copy "${SRC}" "${DST}"; then
        echo "  Copied: ${LABEL}"
        SUCCESSFUL_COPIES+=("${LABEL}")
        return 0
      else
        echo "  Warning: Failed to copy ${LABEL}, skipping..."
        FAILED_COPIES+=("${LABEL}")
        return 1
      fi
    }

    # create_manifest MANIFEST AMD64_IMG ARM64_IMG LABEL
    # Creates and pushes a multi-arch manifest and records the outcome.
    create_manifest() {
      local MANIFEST="$1" AMD64_IMG="$2" ARM64_IMG="$3" LABEL="$4"
      echo "Creating manifest: ${MANIFEST}"
      docker manifest create "${MANIFEST}" "${AMD64_IMG}" "${ARM64_IMG}" || true
      if docker manifest push "${MANIFEST}"; then
        echo "  Created multi-arch: ${LABEL}"
        SUCCESSFUL_COPIES+=("${LABEL} (multi-arch)")
      else
        echo "  Failed to create multi-arch: ${LABEL}"
        FAILED_COPIES+=("${LABEL} (multi-arch)")
      fi
    }

    # copy_arch_set SRC_BASE DST_BASE LABEL
    # Copies "<SRC_BASE>-<arch>" to "<DST_BASE>-<arch>" for every architecture
    # and publishes the manifest "<DST_BASE>" only when all copies succeeded.
    copy_arch_set() {
      local SRC_BASE="$1" DST_BASE="$2" LABEL="$3" ARCH
      local COPIED=()
      for ARCH in "${ARCHITECTURES[@]}"; do
        if copy_image "${SRC_BASE}-${ARCH}" "${DST_BASE}-${ARCH}" "${LABEL}-${ARCH}"; then
          COPIED+=("${DST_BASE}-${ARCH}")
        fi
      done
      if [ "${#COPIED[@]}" -eq "${#ARCHITECTURES[@]}" ]; then
        create_manifest "${DST_BASE}" "${COPIED[0]}" "${COPIED[1]}" "${LABEL}"
      else
        echo "Warning: Not all architectures available for ${LABEL}, skipping multi-arch manifest"
        FAILED_COPIES+=("${LABEL} (multi-arch - missing archs)")
      fi
    }

    # ---- CUDA 12 runtime images (vllm and sglang) ----
    echo ""
    echo "=== CUDA 12 Runtime Images (vllm, sglang) ==="
    CUDA12_FRAMEWORKS=("vllm" "sglang")
    for FRAMEWORK in "${CUDA12_FRAMEWORKS[@]}"; do
      NGC_NAME="${FRAMEWORK}-runtime"
      copy_arch_set \
        "${ECR_REPO}:${IMAGE_PREFIX}-${FRAMEWORK}-cuda12" \
        "${NGC_BASE}/${NGC_NAME}:${NGC_VERSION_TAG}" \
        "${NGC_NAME}:${NGC_VERSION_TAG}"
    done

    # ---- CUDA 13 runtime images (vllm, sglang, trtllm) ----
    echo ""
    echo "=== CUDA 13 Runtime Images (vllm, sglang, trtllm) ==="
    CUDA13_FRAMEWORKS=("vllm" "sglang" "trtllm")
    for FRAMEWORK in "${CUDA13_FRAMEWORKS[@]}"; do
      # trtllm is published on NGC under the name tensorrtllm-runtime.
      if [ "${FRAMEWORK}" = "trtllm" ]; then
        NGC_NAME="tensorrtllm-runtime"
      else
        NGC_NAME="${FRAMEWORK}-runtime"
      fi
      copy_arch_set \
        "${ECR_REPO}:${IMAGE_PREFIX}-${FRAMEWORK}-cuda13" \
        "${NGC_BASE}/${NGC_NAME}:${NGC_VERSION_TAG}-cuda13" \
        "${NGC_NAME}:${NGC_VERSION_TAG}-cuda13"
    done

    # ---- EFA runtime images (amd64 only, no multi-arch manifest needed) ----
    echo ""
    echo "=== EFA Runtime Images ==="
    # vllm EFA (CUDA 12, amd64 only)
    copy_image \
      "${ECR_REPO}:${IMAGE_PREFIX}-vllm-efa-cuda12-amd64" \
      "${NGC_BASE}/vllm-runtime:${NGC_VERSION_TAG}-efa" \
      "vllm-runtime:${NGC_VERSION_TAG}-efa" || true
    # trtllm EFA (CUDA 13, amd64 only)
    copy_image \
      "${ECR_REPO}:${IMAGE_PREFIX}-trtllm-efa-cuda13-amd64" \
      "${NGC_BASE}/tensorrtllm-runtime:${NGC_VERSION_TAG}-efa" \
      "tensorrtllm-runtime:${NGC_VERSION_TAG}-efa" || true

    # ---- Frontend images ----
    echo ""
    echo "=== Frontend Images ==="
    copy_arch_set \
      "${ECR_REPO}:${IMAGE_PREFIX}-frontend" \
      "${NGC_BASE}/dynamo-frontend:${NGC_VERSION_TAG}" \
      "dynamo-frontend:${NGC_VERSION_TAG}"

    # ---- Operator image (multi-arch manifest already built by operator-build) ----
    echo ""
    echo "=== Operator Image ==="
    copy_image \
      "${ECR_REPO}:${IMAGE_PREFIX}-operator" \
      "${NGC_BASE}/kubernetes-operator:${NGC_VERSION_TAG}" \
      "kubernetes-operator:${NGC_VERSION_TAG}" || true

    # ---- Summary ----
    echo "successful_count=${#SUCCESSFUL_COPIES[@]}" >> "$GITHUB_OUTPUT"
    echo "failed_count=${#FAILED_COPIES[@]}" >> "$GITHUB_OUTPUT"
    # Guard the expansions: an empty array trips "set -u" on older bash.
    : > /tmp/successful_copies.txt
    : > /tmp/failed_copies.txt
    if [ "${#SUCCESSFUL_COPIES[@]}" -gt 0 ]; then
      printf '%s\n' "${SUCCESSFUL_COPIES[@]}" > /tmp/successful_copies.txt
    fi
    if [ "${#FAILED_COPIES[@]}" -gt 0 ]; then
      printf '%s\n' "${FAILED_COPIES[@]}" > /tmp/failed_copies.txt
    fi
    echo "========================================"
    echo "NGC Publishing Summary:"
    echo "  Successful: ${#SUCCESSFUL_COPIES[@]}"
    echo "  Failed: ${#FAILED_COPIES[@]}"
    if [ "${#FAILED_COPIES[@]}" -gt 0 ]; then
      printf '    - %s\n' "${FAILED_COPIES[@]}"
    fi
    echo "========================================"
    if [ "${#SUCCESSFUL_COPIES[@]}" -eq 0 ]; then
      echo "ERROR: No images were successfully copied to NGC!"
      exit 1
    fi
# Packages the platform chart at the RC version and pushes it to the NGC Helm
# repository through the helm-push (cm-push) plugin.
- name: Package and push Helm charts to NGC
  env:
    NGC_HELM_REPO: https://helm.ngc.nvidia.com/${{ secrets.NGC_PUBLISH_ORG }}/ai-dynamo
    NGC_TOKEN: ${{ secrets.NGC_PUBLISH_TOKEN }}
    HELM_CHART_VERSION: ${{ steps.rc_tag.outputs.helm_chart_version }}
  run: |
    set -euo pipefail
    ngc_repo="ngc-staging-dynamo"
    chart_dir="deploy/helm/charts/platform"

    # A failed plugin install is tolerated here.
    helm plugin install https://github.com/chartmuseum/helm-push || true

    # Output is discarded for the authenticated repo add.
    helm repo add "${ngc_repo}" \
      --username='$oauthtoken' \
      --password="${NGC_TOKEN}" \
      "${NGC_HELM_REPO}" > /dev/null 2>&1

    # Repositories the chart's dependencies are resolved from.
    helm repo add nats https://nats-io.github.io/k8s/helm/charts/ || true
    helm repo add bitnami https://charts.bitnami.com/bitnami || true

    echo "" >> $GITHUB_STEP_SUMMARY
    echo "### Helm Charts" >> $GITHUB_STEP_SUMMARY

    # Chart name is read from the "name:" line of Chart.yaml.
    chart_name=$(awk '/^name:/ {print $2}' "${chart_dir}/Chart.yaml")

    pushd "${chart_dir}"
    helm dep build .
    popd

    echo "Packaging ${chart_name} with version ${HELM_CHART_VERSION}..."
    helm package \
      --version "${HELM_CHART_VERSION}" \
      --app-version "${HELM_CHART_VERSION}" \
      "${chart_dir}"

    chart_archive="${chart_name}-${HELM_CHART_VERSION}.tgz"
    echo "Pushing ${chart_archive} to NGC Helm registry..."
    helm cm-push "${chart_archive}" "${ngc_repo}"
    echo "- \`${chart_name}:${HELM_CHART_VERSION}\` pushed to NGC Helm registry" >> $GITHUB_STEP_SUMMARY

    # Remove the authenticated repo entry again.
    helm repo remove "${ngc_repo}"
# Appends a Markdown report to the run's step summary: release identifiers,
# copy counters from the copy_images step, and the list of expected artifacts.
- name: Create release summary
  env:
    RC_TAG: ${{ steps.rc_tag.outputs.rc_tag }}
    NGC_VERSION_TAG: ${{ steps.rc_tag.outputs.ngc_version_tag }}
    HELM_CHART_VERSION: ${{ steps.rc_tag.outputs.helm_chart_version }}
    SUCCESSFUL_COUNT: ${{ steps.copy_images.outputs.successful_count }}
    FAILED_COUNT: ${{ steps.copy_images.outputs.failed_count }}
  run: |
    # All lines are emitted in one group with a single redirection.
    {
      echo "## Release Summary"
      echo ""
      echo "| Property | Value |"
      echo "|----------|-------|"
      echo "| Version | ${VERSION} |"
      echo "| Git Tag | ${RC_TAG} |"
      echo "| NGC Version Tag | ${NGC_VERSION_TAG} |"
      echo "| Commit | ${{ github.sha }} |"
      echo "| Branch | ${{ github.ref_name }} |"
      echo ""
      echo "### NGC Publishing Results"
      echo ""
      echo "- **Successful copies**: ${SUCCESSFUL_COUNT}"
      echo "- **Failed copies**: ${FAILED_COUNT}"
      echo ""
      echo "### Expected Images"
      echo ""
      echo "Runtime images (CUDA 12 - default):"
      echo "- \`vllm-runtime:${NGC_VERSION_TAG}\` (multi-arch)"
      echo "- \`sglang-runtime:${NGC_VERSION_TAG}\` (multi-arch)"
      echo ""
      echo "Runtime images (CUDA 13):"
      echo "- \`vllm-runtime:${NGC_VERSION_TAG}-cuda13\` (multi-arch)"
      echo "- \`sglang-runtime:${NGC_VERSION_TAG}-cuda13\` (multi-arch)"
      echo "- \`tensorrtllm-runtime:${NGC_VERSION_TAG}-cuda13\` (multi-arch)"
      echo ""
      echo "EFA runtime images (amd64 only):"
      echo "- \`vllm-runtime:${NGC_VERSION_TAG}-efa\`"
      echo "- \`tensorrtllm-runtime:${NGC_VERSION_TAG}-efa\`"
      echo ""
      echo "Operator image:"
      echo "- \`kubernetes-operator:${NGC_VERSION_TAG}\`"
      echo ""
      echo "Frontend images:"
      echo "- \`dynamo-frontend:${NGC_VERSION_TAG}\` (multi-arch)"
      echo ""
      echo "Helm chart:"
      echo "- \`dynamo-platform:${HELM_CHART_VERSION}\` (pushed to NGC Helm registry)"
    } >> $GITHUB_STEP_SUMMARY