Skip to content

Release Pipeline

Release Pipeline #312

Workflow file for this run

# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
name: Release Pipeline

# Manual trigger only. Which branches may start a run is not restricted here;
# the prepare-release job validates the branch name and fails otherwise.
on:
  workflow_dispatch:
    inputs:
      commit_sha:
        type: string
        required: true
        description: 'Git commit SHA whose post-merge CI images to publish (full 40-char SHA).'
      rc_number:
        type: string
        required: false
        description: 'RC number (e.g., 0 for rc0). Leave empty to auto-increment.'

permissions:
  contents: write

env:
  # Image repository path; the publish job prefixes it with the ECR hostname
  # to build the source image references.
  REGISTRY_IMAGE: ai-dynamo/dynamo
jobs:
# ============================================================================
# GATE: Version Extraction
# ============================================================================
prepare-release:
  name: Prepare Release
  runs-on: prod-default-small-v2
  outputs:
    version: ${{ steps.extract.outputs.version }}
    commit_sha: ${{ steps.extract.outputs.commit_sha }}
  steps:
    # Validates the dispatch inputs and derives the release version from the
    # branch name. Emits `version` and `commit_sha` as step outputs.
    - name: Extract version and validate inputs
      id: extract
      env:
        COMMIT_SHA: ${{ github.event.inputs.commit_sha }}
        BRANCH_NAME: ${{ github.ref_name }}
      run: |
        set -euo pipefail

        # Only a full lowercase 40-hex SHA is accepted; it is later embedded
        # verbatim in the source image tags.
        if ! [[ "${COMMIT_SHA}" =~ ^[0-9a-f]{40}$ ]]; then
          echo "Error: commit_sha must be a full 40-character hex SHA (got: '${COMMIT_SHA}')"
          exit 1
        fi

        # NOTE(review): hard-coded personal branch that bypasses the release/*
        # requirement and publishes a test version — confirm this is still
        # needed before the next real release.
        if [[ "$BRANCH_NAME" == "pvijayakrish/use-pat-for-rc-tagging" ]]; then
          VERSION="0.0.0test1"
          echo "Devel branch detected — using test version: ${VERSION}"
        # The version is everything after "release/". It ends up in git tags,
        # image tags and the Helm chart version, so the optional suffix after
        # X.Y.Z is limited to tag-safe characters and the match is anchored at
        # the end of the branch name.
        elif [[ "$BRANCH_NAME" =~ ^release/[0-9]+\.[0-9]+\.[0-9]+[A-Za-z0-9._-]*$ ]]; then
          VERSION="${BRANCH_NAME#release/}"
        else
          echo "Error: workflow_dispatch must be triggered from a release/* branch"
          echo "Current branch: $BRANCH_NAME"
          exit 1
        fi

        echo "version=${VERSION}" >> "${GITHUB_OUTPUT}"
        echo "commit_sha=${COMMIT_SHA}" >> "${GITHUB_OUTPUT}"
        echo "Detected version: ${VERSION}"
        echo "Source commit SHA: ${COMMIT_SHA}"
# ============================================================================
# NGC PUBLISH: RC tag, crane copy to NGC, Helm chart push
# Sources images from ECR using SHA-based tags produced by post-merge CI.
# ============================================================================
release-publish:
  name: Tag RC & Publish to NGC
  runs-on: prod-builder-amd-v1
  environment: automated-release
  needs:
    - prepare-release
  if: needs.prepare-release.result == 'success'
  # Job-wide variables read by the shell steps below.
  env:
    # Validated values handed over from the prepare-release job.
    VERSION: ${{ needs.prepare-release.outputs.version }}
    COMMIT_SHA: ${{ needs.prepare-release.outputs.commit_sha }}
    AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
    REGISTRY_IMAGE: ai-dynamo/dynamo
  steps:
# Check out the exact commit being released; the RC tag created later in this
# job is placed on whatever is checked out here.
- name: Checkout at source commit
  uses: actions/checkout@v4
  with:
    # NOTE(review): presumably the PAT is what authorizes the later
    # `git push` of the RC tag — confirm.
    token: ${{ secrets.RC_GITHUB_PAT }}
    ref: ${{ needs.prepare-release.outputs.commit_sha }}
    # Existing v<version>-rc* tags must be available locally for the
    # auto-increment logic in the next step.
    fetch-tags: true
    fetch-depth: 0
# Chooses the RC number (explicit input, or highest existing RC + 1) and
# publishes the derived names as step outputs:
#   rc_tag             v<VERSION>-rc<N>   (git tag)
#   rc_number          <N>
#   ngc_version_tag    <VERSION>rc<N>     (image tag)
#   helm_chart_version <VERSION>-rc<N>    (chart version)
- name: Determine next RC tag
  id: rc_tag
  env:
    INPUT_RC_NUMBER: ${{ github.event.inputs.rc_number }}
  run: |
    set -euo pipefail

    if [ -n "${INPUT_RC_NUMBER}" ]; then
      if ! [[ "${INPUT_RC_NUMBER}" =~ ^[0-9]+$ ]]; then
        echo "Error: rc_number must be a non-negative integer (got: ${INPUT_RC_NUMBER})"
        exit 1
      fi
      # Force base 10 so a leading zero ("08") is neither rejected as invalid
      # octal nor carried into the tag name, where it would later break the
      # auto-increment arithmetic.
      NEXT_RC=$((10#${INPUT_RC_NUMBER}))
      echo "Using provided RC number: ${NEXT_RC}"
    else
      echo "No RC number provided. Auto-incrementing..."
      RC_PATTERN="v${VERSION}-rc"
      # grep exits 1 when nothing matches; `|| true` keeps pipefail/errexit
      # from treating "no RC tags yet" as an error.
      EXISTING_RCS=$(git tag -l "${RC_PATTERN}*" | grep -E "^v${VERSION}-rc[0-9]+$" | sort -V || true)
      if [ -z "$EXISTING_RCS" ]; then
        NEXT_RC=0
        echo "No existing RC tags found. Starting with rc0."
      else
        # sort -V puts the highest RC last.
        LAST_RC=$(echo "$EXISTING_RCS" | tail -1)
        LAST_RC_NUM=${LAST_RC#v${VERSION}-rc}
        # 10# prevents an existing tag such as rc08 from being parsed as octal.
        NEXT_RC=$((10#${LAST_RC_NUM} + 1))
        echo "Found existing RC tags:"
        echo "$EXISTING_RCS"
        echo "Last RC: ${LAST_RC}, Next RC number: ${NEXT_RC}"
      fi
    fi

    RC_TAG="v${VERSION}-rc${NEXT_RC}"
    {
      echo "rc_tag=${RC_TAG}"
      echo "rc_number=${NEXT_RC}"
      echo "ngc_version_tag=${VERSION}rc${NEXT_RC}"
      echo "helm_chart_version=${VERSION}-rc${NEXT_RC}"
    } >> "${GITHUB_OUTPUT}"
    echo "Will create tag: ${RC_TAG}"
# Creates an annotated tag on the checked-out commit and pushes it to origin.
- name: Create RC tag
  env:
    RC_TAG: ${{ steps.rc_tag.outputs.rc_tag }}
  run: |
    set -euo pipefail

    # An annotated tag records a tagger, so an identity has to be configured.
    git config user.email "github-actions[bot]@users.noreply.github.com"
    git config user.name "github-actions[bot]"

    TAG_MESSAGE="Release candidate ${RC_TAG}"
    git tag -a "${RC_TAG}" -m "${TAG_MESSAGE}"
    git push origin "${RC_TAG}"
    printf 'Created and pushed tag: %s\n' "${RC_TAG}"
# Downloads the pinned crane release, extracts only the `crane` binary and
# installs it into /usr/local/bin.
- name: Setup crane
  env:
    CRANE_VERSION: v0.20.2
  run: |
    set -euo pipefail
    # -f makes curl fail on an HTTP error instead of piping the error page
    # into tar; -S still prints the reason despite -s. pipefail (above)
    # surfaces a curl failure even though tar is the last pipeline stage.
    # NOTE(review): the archive is not checksum-verified — consider pinning
    # the published SHA256 for this version.
    curl -fsSL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \
      | tar -xzf - crane
    sudo mv crane /usr/local/bin/
    crane version
# Logs the Docker client in to the account's ECR registry. The hostname is
# derived from the caller's AWS account id and AWS_DEFAULT_REGION (job env).
- name: Login to ECR
  run: |
    # Same strict mode as the other steps: with pipefail a failing
    # `aws ecr get-login-password` fails the step itself rather than
    # surfacing only as a confusing docker login error.
    set -euo pipefail
    ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
    ECR_HOSTNAME="${ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"
    aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" \
      | docker login --username AWS --password-stdin "${ECR_HOSTNAME}"
# Authenticates both the Docker client and crane against nvcr.io. The token is
# fed on stdin so it never appears on a command line.
- name: Login to NGC
  env:
    NGC_USERNAME: ${{ secrets.NGC_PUBLISH_USERNAME }}
    NGC_TOKEN: ${{ secrets.NGC_PUBLISH_TOKEN }}
  run: |
    printf '%s\n' "${NGC_TOKEN}" | docker login nvcr.io -u "${NGC_USERNAME}" --password-stdin
    printf '%s\n' "${NGC_TOKEN}" | crane auth login nvcr.io -u "${NGC_USERNAME}" --password-stdin
# Copies the SHA-tagged images for COMMIT_SHA from ECR to NGC under the RC
# version tag and assembles multi-arch manifests where both architectures are
# published. Individual failures are recorded and skipped; the step fails only
# if nothing at all was copied.
# Outputs: successful_count, failed_count.
- name: Copy images to NGC
  id: copy_images
  env:
    NGC_REGISTRY: nvcr.io
    NGC_ORG: ${{ secrets.NGC_PUBLISH_ORG }}
    NGC_VERSION_TAG: ${{ steps.rc_tag.outputs.ngc_version_tag }}
  run: |
    set -euo pipefail
    SUCCESSFUL_COPIES=()
    FAILED_COPIES=()
    ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
    ECR_HOSTNAME="${ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"
    ARCHITECTURES=("amd64" "arm64")
    echo "========================================"
    echo "Copying images from ECR to NGC (registry-to-registry)"
    echo "Source commit SHA: ${COMMIT_SHA}"
    echo "NGC Version Tag: ${NGC_VERSION_TAG}"
    echo "========================================"

    # copy_image SRC DST LABEL
    # Copies one image with crane and records LABEL in SUCCESSFUL_COPIES or
    # FAILED_COPIES. Returns 0 on success, 1 on failure.
    # Because the script runs under `set -e`, a caller that wants to continue
    # after a failure must invoke this in a tested context (`if ...` or
    # `... || true`); a bare call would abort the whole step.
    copy_image() {
      local SRC="$1" DST="$2" LABEL="$3"
      echo "----------------------------------------"
      echo "Copying: ${LABEL}"
      if crane copy "${SRC}" "${DST}"; then
        echo " Copied: ${LABEL}"
        SUCCESSFUL_COPIES+=("${LABEL}")
        return 0
      else
        echo " Warning: Failed to copy ${LABEL}, skipping..."
        FAILED_COPIES+=("${LABEL}")
        return 1
      fi
    }

    # create_manifest MANIFEST AMD64_IMG ARM64_IMG LABEL
    # Creates and pushes a manifest list referencing the two per-arch images,
    # recording the outcome under "LABEL (multi-arch)". Never fails the
    # script: a `manifest create` error is ignored and a failed push is only
    # recorded in FAILED_COPIES.
    create_manifest() {
      local MANIFEST="$1" AMD64_IMG="$2" ARM64_IMG="$3" LABEL="$4"
      echo "Creating manifest: ${MANIFEST}"
      docker manifest create "${MANIFEST}" "${AMD64_IMG}" "${ARM64_IMG}" || true
      if docker manifest push "${MANIFEST}"; then
        echo " Created multi-arch: ${LABEL}"
        SUCCESSFUL_COPIES+=("${LABEL} (multi-arch)")
      else
        echo " Failed to create multi-arch: ${LABEL}"
        FAILED_COPIES+=("${LABEL} (multi-arch)")
      fi
    }

    # ---- CUDA 12 runtime images (vllm and sglang) ----
    echo ""
    echo "=== CUDA 12 Runtime Images (vllm, sglang) ==="
    CUDA12_FRAMEWORKS=("vllm" "sglang")
    for FRAMEWORK in "${CUDA12_FRAMEWORKS[@]}"; do
      NGC_NAME="${FRAMEWORK}-runtime"
      for ARCH in "${ARCHITECTURES[@]}"; do
        SOURCE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${COMMIT_SHA}-${FRAMEWORK}-cuda12-${ARCH}"
        TARGET="${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${NGC_NAME}:${NGC_VERSION_TAG}-${ARCH}"
        # `|| true`: the failure is already recorded; keep going (see copy_image).
        copy_image "${SOURCE}" "${TARGET}" "${NGC_NAME}:${NGC_VERSION_TAG}-${ARCH}" || true
      done
      create_manifest \
        "${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${NGC_NAME}:${NGC_VERSION_TAG}" \
        "${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${NGC_NAME}:${NGC_VERSION_TAG}-amd64" \
        "${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${NGC_NAME}:${NGC_VERSION_TAG}-arm64" \
        "${NGC_NAME}:${NGC_VERSION_TAG}"
    done

    # ---- CUDA 13 runtime images (vllm, sglang, trtllm) ----
    echo ""
    echo "=== CUDA 13 Runtime Images (vllm, sglang, trtllm) ==="
    CUDA13_FRAMEWORKS=("vllm" "sglang" "trtllm")
    for FRAMEWORK in "${CUDA13_FRAMEWORKS[@]}"; do
      # trtllm is published on NGC under the name "tensorrtllm-runtime".
      if [ "${FRAMEWORK}" = "trtllm" ]; then
        NGC_NAME="tensorrtllm-runtime"
      else
        NGC_NAME="${FRAMEWORK}-runtime"
      fi
      for ARCH in "${ARCHITECTURES[@]}"; do
        SOURCE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${COMMIT_SHA}-${FRAMEWORK}-cuda13-${ARCH}"
        TARGET="${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${NGC_NAME}:${NGC_VERSION_TAG}-cuda13-${ARCH}"
        copy_image "${SOURCE}" "${TARGET}" "${NGC_NAME}:${NGC_VERSION_TAG}-cuda13-${ARCH}" || true
      done
      create_manifest \
        "${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${NGC_NAME}:${NGC_VERSION_TAG}-cuda13" \
        "${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${NGC_NAME}:${NGC_VERSION_TAG}-cuda13-amd64" \
        "${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${NGC_NAME}:${NGC_VERSION_TAG}-cuda13-arm64" \
        "${NGC_NAME}:${NGC_VERSION_TAG}-cuda13"
    done

    # ---- EFA runtime images (amd64 only, no multi-arch manifest needed) ----
    echo ""
    echo "=== EFA Runtime Images ==="
    # vllm EFA (CUDA 12, amd64 only)
    SOURCE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${COMMIT_SHA}-vllm-efa-cuda12-amd64"
    TARGET="${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/vllm-runtime:${NGC_VERSION_TAG}-efa"
    copy_image "${SOURCE}" "${TARGET}" "vllm-runtime:${NGC_VERSION_TAG}-efa" || true
    # trtllm EFA (CUDA 13, amd64 only)
    SOURCE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${COMMIT_SHA}-trtllm-efa-cuda13-amd64"
    TARGET="${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/tensorrtllm-runtime:${NGC_VERSION_TAG}-efa"
    copy_image "${SOURCE}" "${TARGET}" "tensorrtllm-runtime:${NGC_VERSION_TAG}-efa" || true

    # ---- Frontend images ----
    echo ""
    echo "=== Frontend Images ==="
    # Only build the multi-arch manifest when both per-arch copies succeeded.
    FRONTEND_IMAGES=()
    for ARCH in "${ARCHITECTURES[@]}"; do
      SOURCE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${COMMIT_SHA}-frontend-${ARCH}"
      TARGET="${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/dynamo-frontend:${NGC_VERSION_TAG}-${ARCH}"
      if copy_image "${SOURCE}" "${TARGET}" "dynamo-frontend:${NGC_VERSION_TAG}-${ARCH}"; then
        FRONTEND_IMAGES+=("${TARGET}")
      fi
    done
    if [ ${#FRONTEND_IMAGES[@]} -eq 2 ]; then
      create_manifest \
        "${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/dynamo-frontend:${NGC_VERSION_TAG}" \
        "${FRONTEND_IMAGES[0]}" "${FRONTEND_IMAGES[1]}" \
        "dynamo-frontend:${NGC_VERSION_TAG}"
    else
      echo "Warning: Not all frontend architectures available, skipping multi-arch manifest"
      FAILED_COPIES+=("dynamo-frontend:${NGC_VERSION_TAG} (multi-arch - missing archs)")
    fi

    # ---- Operator image (multi-arch manifest already built by post-merge operator-build) ----
    echo ""
    echo "=== Operator Image ==="
    OPERATOR_SOURCE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${COMMIT_SHA}-operator"
    OPERATOR_TARGET="${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/kubernetes-operator:${NGC_VERSION_TAG}"
    copy_image "${OPERATOR_SOURCE}" "${OPERATOR_TARGET}" "kubernetes-operator:${NGC_VERSION_TAG}" || true

    # ---- Summary ----
    echo "successful_count=${#SUCCESSFUL_COPIES[@]}" >> "${GITHUB_OUTPUT}"
    echo "failed_count=${#FAILED_COPIES[@]}" >> "${GITHUB_OUTPUT}"
    printf '%s\n' "${SUCCESSFUL_COPIES[@]}" > /tmp/successful_copies.txt
    printf '%s\n' "${FAILED_COPIES[@]}" > /tmp/failed_copies.txt 2>/dev/null || true
    echo "========================================"
    echo "NGC Publishing Summary:"
    echo " Successful: ${#SUCCESSFUL_COPIES[@]}"
    echo " Failed: ${#FAILED_COPIES[@]}"
    echo "========================================"
    # Partial failures are tolerated; a run that published nothing is not.
    if [ ${#SUCCESSFUL_COPIES[@]} -eq 0 ]; then
      echo "ERROR: No images were successfully copied to NGC!"
      exit 1
    fi
# Packages the platform chart with the RC version (used for both the chart
# version and the app version) and pushes it to the NGC Helm repository via the
# helm-push plugin. Appends a "Helm Charts" section to the step summary.
- name: Package and push Helm charts to NGC
  env:
    NGC_HELM_REPO: https://helm.ngc.nvidia.com/${{ secrets.NGC_PUBLISH_ORG }}/ai-dynamo
    NGC_TOKEN: ${{ secrets.NGC_PUBLISH_TOKEN }}
    NGC_USERNAME: ${{ secrets.NGC_PUBLISH_USERNAME }}
    HELM_CHART_VERSION: ${{ steps.rc_tag.outputs.helm_chart_version }}
  run: |
    set -euo pipefail
    REPO_ALIAS="ngc-staging-dynamo"

    # Tolerate "plugin already exists".
    helm plugin install https://github.com/chartmuseum/helm-push || true

    # The repo entry below stores credentials in the Helm config. Remove it
    # even when a later command fails, so nothing is left behind if the
    # runner's home directory outlives this job.
    trap 'helm repo remove "${REPO_ALIAS}" > /dev/null 2>&1 || true' EXIT

    # Output is discarded so credentials/URL details never reach the log.
    echo "${NGC_TOKEN}" | helm repo add "${REPO_ALIAS}" \
      --username="${NGC_USERNAME}" \
      --password-stdin \
      "${NGC_HELM_REPO}" > /dev/null 2>&1

    # Repositories added before the chart's dependencies are built below.
    helm repo add nats https://nats-io.github.io/k8s/helm/charts/ || true
    helm repo add bitnami https://charts.bitnami.com/bitnami || true

    echo "" >> "${GITHUB_STEP_SUMMARY}"
    echo "### Helm Charts" >> "${GITHUB_STEP_SUMMARY}"

    PLATFORM_CHART_DIR="deploy/helm/charts/platform"
    # Chart name is read from the top-level `name:` field of Chart.yaml.
    CHART_NAME=$(awk '/^name:/ {print $2}' "${PLATFORM_CHART_DIR}/Chart.yaml")

    pushd "${PLATFORM_CHART_DIR}"
    helm dep build .
    popd

    echo "Packaging ${CHART_NAME} with version ${HELM_CHART_VERSION}..."
    helm package \
      --version "${HELM_CHART_VERSION}" \
      --app-version "${HELM_CHART_VERSION}" \
      "${PLATFORM_CHART_DIR}"

    # `helm package` writes <name>-<version>.tgz into the current directory.
    CHART_FILE="${CHART_NAME}-${HELM_CHART_VERSION}.tgz"
    echo "Pushing ${CHART_FILE} to NGC Helm registry..."
    helm cm-push "${CHART_FILE}" "${REPO_ALIAS}"
    echo "- \`${CHART_NAME}:${HELM_CHART_VERSION}\` pushed to NGC Helm registry" >> "${GITHUB_STEP_SUMMARY}"

    # Success path: remove the repo explicitly, then disarm the trap.
    helm repo remove "${REPO_ALIAS}"
    trap - EXIT
# Writes the Markdown release summary (version, tags, copy counts and the list
# of images/charts the release is expected to contain) to the job summary.
- name: Create release summary
  env:
    RC_TAG: ${{ steps.rc_tag.outputs.rc_tag }}
    NGC_VERSION_TAG: ${{ steps.rc_tag.outputs.ngc_version_tag }}
    HELM_CHART_VERSION: ${{ steps.rc_tag.outputs.helm_chart_version }}
    SUCCESSFUL_COUNT: ${{ steps.copy_images.outputs.successful_count }}
    FAILED_COUNT: ${{ steps.copy_images.outputs.failed_count }}
    # Passed through the environment instead of being expanded inside the
    # script: a branch name is user-controlled text and must not be spliced
    # into shell source.
    BRANCH_NAME: ${{ github.ref_name }}
  run: |
    set -euo pipefail
    {
      echo "## Release Summary"
      echo ""
      echo "| Property | Value |"
      echo "|----------|-------|"
      echo "| Version | ${VERSION} |"
      echo "| Git Tag | ${RC_TAG} |"
      echo "| NGC Version Tag | ${NGC_VERSION_TAG} |"
      echo "| Source Commit SHA | ${COMMIT_SHA} |"
      echo "| Branch | ${BRANCH_NAME} |"
      echo ""
      echo "### NGC Publishing Results"
      echo ""
      echo "- **Successful copies**: ${SUCCESSFUL_COUNT}"
      echo "- **Failed copies**: ${FAILED_COUNT}"
      echo ""
      echo "### Expected Images"
      echo ""
      echo "Runtime images (CUDA 12 - default):"
      echo "- \`vllm-runtime:${NGC_VERSION_TAG}\` (multi-arch)"
      echo "- \`sglang-runtime:${NGC_VERSION_TAG}\` (multi-arch)"
      echo ""
      echo "Runtime images (CUDA 13):"
      echo "- \`vllm-runtime:${NGC_VERSION_TAG}-cuda13\` (multi-arch)"
      echo "- \`sglang-runtime:${NGC_VERSION_TAG}-cuda13\` (multi-arch)"
      echo "- \`tensorrtllm-runtime:${NGC_VERSION_TAG}-cuda13\` (multi-arch)"
      echo ""
      echo "EFA runtime images (amd64 only):"
      echo "- \`vllm-runtime:${NGC_VERSION_TAG}-efa\`"
      echo "- \`tensorrtllm-runtime:${NGC_VERSION_TAG}-efa\`"
      echo ""
      echo "Operator image:"
      echo "- \`kubernetes-operator:${NGC_VERSION_TAG}\`"
      echo ""
      echo "Frontend images:"
      echo "- \`dynamo-frontend:${NGC_VERSION_TAG}\` (multi-arch)"
      echo ""
      echo "Helm chart:"
      echo "- \`dynamo-platform:${HELM_CHART_VERSION}\` (pushed to NGC Helm registry)"
    } >> "${GITHUB_STEP_SUMMARY}"