Skip to content

docs: use GitHub admonitions instead of Fern-native callouts (#7370) #696

docs: use GitHub admonitions instead of Fern-native callouts (#7370)

docs: use GitHub admonitions instead of Fern-native callouts (#7370) #696

Workflow file for this run

# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
name: Post-Merge CI Pipeline

# Triggered by pushes to main and to branches matching release/*.*.*
on:
  push:
    branches: [main, 'release/*.*.*']

# Workflow-level token permissions; individual jobs below may declare their own.
permissions:
  contents: read

jobs:
# ============================================================================
# FRAMEWORK PIPELINES (Build → Test → Copy)
# ============================================================================
# ============================================================================
# VLLM PIPELINE
# ============================================================================
vllm-pipeline:
  # vLLM runtime images: delegates to the shared flavor-matrix reusable workflow
  # for amd64 + arm64 across CUDA 12.9 and 13.0.
  name: vllm
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  with:
    # --- what to build ---
    framework: vllm
    target: runtime
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["12.9", "13.0"]'
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    # Both expressions evaluate to an empty string unless the pushed ref is main.
    extra_tags: |
      ${{ github.ref_name == 'main' && 'main-vllm' || '' }}
      ${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }}
    # --- test selection, one marker expression per tier ---
    # (presumably consumed as pytest -m expressions; confirm in the called workflow)
    cpu_only_test_markers: '(pre_merge or post_merge) and vllm and gpu_0'
    single_gpu_test_markers: '(pre_merge or post_merge) and vllm and gpu_1'
    multi_gpu_test_markers: '(pre_merge or post_merge) and vllm and (gpu_2 or gpu_4)'
    # --- timeouts (minutes) ---
    build_timeout_minutes: 120
    copy_timeout_minutes: 20
    cpu_only_test_timeout_minutes: 60
    single_gpu_test_timeout_minutes: 60
    multi_gpu_test_timeout_minutes: 60
# ============================================================================
# SGLANG PIPELINE
# ============================================================================
sglang-pipeline:
  # SGLang runtime images: delegates to the shared flavor-matrix reusable workflow
  # for amd64 + arm64 across CUDA 12.9 and 13.0.
  name: sglang
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  with:
    # --- what to build ---
    framework: sglang
    target: runtime
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["12.9", "13.0"]'
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    # Both expressions evaluate to an empty string unless the pushed ref is main.
    extra_tags: |
      ${{ github.ref_name == 'main' && 'main-sglang' || '' }}
      ${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }}
    # --- test selection, one marker expression per tier ---
    # (presumably consumed as pytest -m expressions; confirm in the called workflow)
    cpu_only_test_markers: '(pre_merge or post_merge) and sglang and gpu_0'
    single_gpu_test_markers: '(pre_merge or post_merge) and sglang and gpu_1'
    multi_gpu_test_markers: '(pre_merge or post_merge) and sglang and (gpu_2 or gpu_4)'
    # --- timeouts (minutes) ---
    build_timeout_minutes: 120
    copy_timeout_minutes: 20
    cpu_only_test_timeout_minutes: 60
    single_gpu_test_timeout_minutes: 60
    multi_gpu_test_timeout_minutes: 60
# ============================================================================
# TRTLLM PIPELINE
# ============================================================================
trtllm-pipeline:
  # TensorRT-LLM runtime images: delegates to the shared flavor-matrix reusable
  # workflow for amd64 + arm64 on CUDA 13.1 only.
  name: trtllm
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  with:
    # --- what to build ---
    framework: trtllm
    target: runtime
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["13.1"]'
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    # Both expressions evaluate to an empty string unless the pushed ref is main.
    extra_tags: |
      ${{ github.ref_name == 'main' && 'main-trtllm' || '' }}
      ${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }}
    # --- test selection, one marker expression per tier ---
    # (presumably consumed as pytest -m expressions; confirm in the called workflow)
    cpu_only_test_markers: '(pre_merge or post_merge) and trtllm and gpu_0'
    single_gpu_test_markers: '(pre_merge or post_merge) and trtllm and gpu_1'
    multi_gpu_test_markers: '(pre_merge or post_merge) and trtllm and (gpu_2 or gpu_4)'
    # --- timeouts (minutes) ---
    build_timeout_minutes: 120
    copy_timeout_minutes: 20
    cpu_only_test_timeout_minutes: 60
    single_gpu_test_timeout_minutes: 60
    multi_gpu_test_timeout_minutes: 60
# ============================================================================
# DEV PIPELINES
# ============================================================================
vllm-dev-pipeline:
  # vLLM dev-target images; every test tier is switched off for this job.
  name: vllm-dev
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  with:
    framework: vllm
    target: dev
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["12.9", "13.0"]'
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    build_timeout_minutes: 60
    # No tests are requested for dev images.
    run_cpu_only_tests: false
    run_single_gpu_tests: false
    run_multi_gpu_tests: false
sglang-dev-pipeline:
  # SGLang dev-target images; every test tier is switched off for this job.
  name: sglang-dev
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  with:
    framework: sglang
    target: dev
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["12.9", "13.0"]'
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    build_timeout_minutes: 60
    # No tests are requested for dev images.
    run_cpu_only_tests: false
    run_single_gpu_tests: false
    run_multi_gpu_tests: false
trtllm-dev-pipeline:
  # TensorRT-LLM dev-target images (CUDA 13.1); every test tier is switched off.
  name: trtllm-dev
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  with:
    framework: trtllm
    target: dev
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["13.1"]'
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    build_timeout_minutes: 60
    # No tests are requested for dev images.
    run_cpu_only_tests: false
    run_single_gpu_tests: false
    run_multi_gpu_tests: false
# ============================================================================
# EFA PIPELINES (Build only, amd64)
# ============================================================================
# ============================================================================
# VLLM EFA PIPELINE
# ============================================================================
vllm-efa-pipeline:
  # vLLM runtime image with make_efa enabled: amd64 / CUDA 12.9 only.
  # GPU test tiers are disabled and copy_to_acr is off.
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  with:
    # --- what to build ---
    framework: vllm
    target: runtime
    make_efa: true
    platforms: '["amd64"]'
    cuda_versions: '["12.9"]'
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    # Both expressions evaluate to an empty string unless the pushed ref is main.
    extra_tags: |
      ${{ github.ref_name == 'main' && 'main-vllm-efa' || '' }}
      ${{ github.ref_name == 'main' && format('main-vllm-efa-{0}', github.sha) || '' }}
    # --- tests: only the gpu_0 tier has markers; GPU tiers are turned off ---
    cpu_only_test_markers: '(pre_merge or post_merge) and vllm and gpu_0'
    run_single_gpu_tests: false
    run_multi_gpu_tests: false
    # --- distribution ---
    copy_to_acr: false
    # --- timeouts (minutes) ---
    build_timeout_minutes: 120
    copy_timeout_minutes: 20
    cpu_only_test_timeout_minutes: 60
# ============================================================================
# TRTLLM EFA PIPELINE
# ============================================================================
trtllm-efa-pipeline:
  # TensorRT-LLM runtime image with make_efa enabled: amd64 / CUDA 13.1 only.
  # GPU test tiers are disabled and copy_to_acr is off.
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  with:
    # --- what to build ---
    framework: trtllm
    target: runtime
    make_efa: true
    platforms: '["amd64"]'
    cuda_versions: '["13.1"]'
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    # Both expressions evaluate to an empty string unless the pushed ref is main.
    extra_tags: |
      ${{ github.ref_name == 'main' && 'main-trtllm-efa' || '' }}
      ${{ github.ref_name == 'main' && format('main-trtllm-efa-{0}', github.sha) || '' }}
    # --- tests: only the gpu_0 tier has markers; GPU tiers are turned off ---
    cpu_only_test_markers: '(pre_merge or post_merge) and trtllm and gpu_0'
    run_single_gpu_tests: false
    run_multi_gpu_tests: false
    # --- distribution ---
    copy_to_acr: false
    # --- timeouts (minutes) ---
    build_timeout_minutes: 120
    copy_timeout_minutes: 20
    cpu_only_test_timeout_minutes: 60
# ============================================================================
# Operator
# ============================================================================
operator:
  # Lints and tests the operator via Docker build targets, verifies generated
  # code is committed (`make check`), then builds and pushes the multi-arch
  # operator image to ECR and ACR. Exposes the pushed tag as `operator_default_tag`.
  name: Operator
  runs-on: prod-default-v2
  env:
    IMAGE_REGISTRY: ai-dynamo
    IMAGE_REPOSITORY: dynamo
    ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
  outputs:
    operator_default_tag: ${{ steps.build-and-push-image.outputs.operator_default_tag }}
  steps:
    - name: Checkout code
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
    - name: Initialize Dynamo Builder
      uses: ./.github/actions/init-dynamo-builder
      with:
        builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
        flavor: general
        all_arch: 'true'
    - name: Docker Login
      uses: ./.github/actions/docker-login
      with:
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
    - name: Linter
      shell: bash
      working-directory: ./deploy/operator
      run: |
        docker buildx build --platform linux/arm64 --target linter --progress=plain --build-arg "DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/" .
    - name: Tester
      shell: bash
      working-directory: ./deploy/operator
      run: |
        docker buildx build --platform linux/arm64 --target tester --progress=plain --build-arg "DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/" .
    - name: Set up Go
      uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00  # v6.0.0
      with:
        go-version: '1.25'
    - name: Set up Python
      # NOTE(review): referenced by a moving tag, unlike the SHA-pinned actions
      # elsewhere in this job; consider pinning to a commit SHA.
      uses: actions/setup-python@v5
      with:
        python-version: "3.11"
    - name: Install Python dependencies for operator codegen
      shell: bash
      working-directory: ./deploy/operator
      run: |
        python -m pip install --upgrade pip
        python -m pip install "pydantic>=2,<3" "black==23.1.0" "pyyaml>=6.0"
    - name: Check for uncommitted changes
      shell: bash
      working-directory: ./deploy/operator
      run: |
        make check
    - name: Build and push Container
      id: build-and-push-image
      shell: bash
      working-directory: ./deploy/operator
      env:
        NO_CACHE_FLAG: ''  # placeholder for future logic to add no cache flag if needed
        # Context values are passed through the environment instead of being
        # interpolated into the script text, so they are never parsed as shell code.
        REF_NAME: ${{ github.ref_name }}
        COMMIT_SHA: ${{ github.sha }}
        ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }}
      run: |
        ECR_DEFAULT_IMAGE_BASE="${ECR_HOSTNAME}/${IMAGE_REGISTRY}/${IMAGE_REPOSITORY}"
        ACR_IMAGE_BASE="${ACR_HOSTNAME}/${IMAGE_REGISTRY}/${IMAGE_REPOSITORY}"
        DEFAULT_TAG="${COMMIT_SHA}-operator"

        # Every build is pushed under the commit-specific tag to both registries.
        IMAGE_URIS=(
          "${ECR_DEFAULT_IMAGE_BASE}:${DEFAULT_TAG}"
          "${ACR_IMAGE_BASE}:${DEFAULT_TAG}"
        )
        # Builds from main additionally move the floating main-operator tag.
        if [[ "${REF_NAME}" == "main" ]]; then
          IMAGE_URIS+=(
            "${ECR_DEFAULT_IMAGE_BASE}:main-operator"
            "${ACR_IMAGE_BASE}:main-operator"
          )
        fi
        echo "operator_default_tag=${DEFAULT_TAG}" >> "$GITHUB_OUTPUT"

        # Keep the -t arguments in an array so each URI stays a single word.
        TAG_ARGS=()
        for image_uri in "${IMAGE_URIS[@]}"; do
          TAG_ARGS+=(-t "${image_uri}")
        done
        echo "flags for docker buildx: ${TAG_ARGS[*]}"

        if [[ "$NO_CACHE_FLAG" == "true" ]]; then
          NO_CACHE_FLAG="--no-cache"
        fi
        # NO_CACHE_FLAG is deliberately unquoted: an empty value must add no argument.
        docker buildx build --push ${NO_CACHE_FLAG} \
          --platform linux/amd64,linux/arm64 \
          --build-arg "DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/" \
          "${TAG_ARGS[@]}" -f Dockerfile .

        # Publish the pushed image URIs as a table in the job summary.
        {
          echo "### 🐳 Operator Container Images"
          echo ""
          echo "| Image URI |"
          echo "|-----|"
          for image_uri in "${IMAGE_URIS[@]}"; do
            echo "| \`${image_uri}\` |"
          done
        } >> "$GITHUB_STEP_SUMMARY"
# ============================================================================
# DEPLOYMENT JOBS
# Deploy operator and run end-to-end tests on Kubernetes cluster
# ============================================================================
deploy-operator:
  # Chooses the operator image tag, derives a per-run Kubernetes namespace name,
  # and hands both to the setup-deploy-namespace action. Downstream jobs read the
  # NAMESPACE and OPERATOR_TAG outputs.
  runs-on: prod-default-small-v2
  needs: [operator]
  outputs:
    NAMESPACE: ${{ steps.namespace.outputs.namespace }}
    OPERATOR_TAG: ${{ steps.operator-tag.outputs.tag }}
  steps:
    # Pinned to the same commit as every other checkout in this workflow.
    - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
    - name: Determine operator tag
      id: operator-tag
      env:
        # Passed via env so the values are never parsed as shell code.
        OPERATOR_RESULT: ${{ needs.operator.result }}
        OPERATOR_DEFAULT_TAG: ${{ needs.operator.outputs.operator_default_tag }}
      run: |
        # NOTE(review): this job has no `if:` override, so it only runs when
        # `operator` succeeded; the main-operator fallback is reachable only if
        # that changes.
        if [ "${OPERATOR_RESULT}" = "success" ]; then
          TAG="${OPERATOR_DEFAULT_TAG}"
        else
          TAG="main-operator"
        fi
        echo "tag=${TAG}" >> "$GITHUB_OUTPUT"
        echo "Using operator tag: ${TAG}"
    - name: Generate namespace name
      id: namespace
      env:
        BRANCH: ${{ github.ref_name }}
        RUN_ID: ${{ github.run_id }}
      run: |
        # Sanitize branch name for k8s namespace
        # Invalid patterns: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
        BRANCH_SANITIZED="${BRANCH//\//-}"
        BRANCH_SANITIZED="${BRANCH_SANITIZED/pull-request/pr}"
        BRANCH_SANITIZED="${BRANCH_SANITIZED//./-}"
        # Namespace names must be lowercase alphanumerics or '-': fold case and
        # replace anything else (e.g. '_' or '+') that the rules above missed.
        BRANCH_SANITIZED="${BRANCH_SANITIZED,,}"
        BRANCH_SANITIZED="${BRANCH_SANITIZED//[^a-z0-9-]/-}"
        # Keep only the first 10 characters of the branch part.
        BRANCH_SANITIZED="${BRANCH_SANITIZED:0:10}"
        NAMESPACE="gh-id-${RUN_ID}-${BRANCH_SANITIZED}-dt"
        echo "namespace=${NAMESPACE}" >> "$GITHUB_OUTPUT"
    - name: Setup namespace and operator
      uses: ./.github/actions/setup-deploy-namespace
      with:
        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
        namespace: ${{ steps.namespace.outputs.namespace }}
        registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
        operator_tag: ${{ steps.operator-tag.outputs.tag }}
        hf_token: ${{ secrets.HF_TOKEN }}
# ============================================================================
# End-to-end tests for each framework with various deployment profiles
# ============================================================================
deploy-test-vllm:
  # Deployment test for vLLM: one matrix job per profile, at most two at a time,
  # against the namespace and operator tag published by deploy-operator.
  name: deploy-test-vllm (${{ matrix.profile }})
  needs: [deploy-operator, vllm-pipeline]
  runs-on: prod-default-small-v2
  timeout-minutes: 25
  permissions:
    contents: read
  env:
    FRAMEWORK: vllm
  strategy:
    fail-fast: false
    max-parallel: 2
    matrix:
      profile: [agg, agg_router, disagg, disagg_router]
  steps:
    - name: Checkout code
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
    - name: Run Dynamo Deploy Test
      id: deploy-test
      uses: ./.github/actions/dynamo-deploy-test
      with:
        # What is being tested.
        framework: ${{ env.FRAMEWORK }}
        profile: ${{ matrix.profile }}
        platform_arch: amd64
        image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-vllm-runtime-cuda12-amd64
        # Where it is deployed.
        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
        operator_tag: ${{ needs.deploy-operator.outputs.OPERATOR_TAG }}
        # Credentials.
        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
        hf_token: ${{ secrets.HF_TOKEN }}
deploy-test-sglang:
  # Deployment test for SGLang: one matrix job per profile, at most two at a time,
  # against the namespace and operator tag published by deploy-operator.
  name: deploy-test-sglang (${{ matrix.profile }})
  needs: [deploy-operator, sglang-pipeline]
  runs-on: prod-default-small-v2
  timeout-minutes: 25
  permissions:
    contents: read
  env:
    FRAMEWORK: sglang
  strategy:
    fail-fast: false
    max-parallel: 2
    matrix:
      profile: [agg, agg_router]
  steps:
    - name: Checkout code
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
    - name: Run Dynamo Deploy Test
      id: deploy-test
      uses: ./.github/actions/dynamo-deploy-test
      with:
        # What is being tested.
        framework: ${{ env.FRAMEWORK }}
        profile: ${{ matrix.profile }}
        platform_arch: amd64
        image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-sglang-runtime-cuda12-amd64
        # Where it is deployed.
        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
        operator_tag: ${{ needs.deploy-operator.outputs.OPERATOR_TAG }}
        # Credentials.
        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
        hf_token: ${{ secrets.HF_TOKEN }}
deploy-test-trtllm:
  # Deployment test for TensorRT-LLM: one matrix job per enabled profile, at most
  # two at a time, against the namespace and operator tag from deploy-operator.
  name: deploy-test-trtllm (${{ matrix.profile }})
  needs: [deploy-operator, trtllm-pipeline]
  runs-on: prod-default-small-v2
  timeout-minutes: 25
  permissions:
    contents: read
  env:
    FRAMEWORK: trtllm
  strategy:
    fail-fast: false
    max-parallel: 2
    matrix:
      profile:
        - agg
        - agg_router
        # Disabled: trtllm disagg profiles consistently time out (~32 min) with a 0% success rate.
        # Re-enable once the underlying disagg deployment issue is resolved.
        # - disagg
        # - disagg_router
  steps:
    - name: Checkout code
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
    - name: Run Dynamo Deploy Test
      id: deploy-test
      uses: ./.github/actions/dynamo-deploy-test
      with:
        # What is being tested.
        framework: ${{ env.FRAMEWORK }}
        profile: ${{ matrix.profile }}
        platform_arch: amd64
        image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-trtllm-runtime-cuda13-amd64
        # Where it is deployed.
        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
        operator_tag: ${{ needs.deploy-operator.outputs.OPERATOR_TAG }}
        # Credentials.
        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
        hf_token: ${{ secrets.HF_TOKEN }}
deploy-status-check:
  # Single aggregate gate for the deploy jobs: always runs, and fails unless
  # every job listed in `needs` finished as "success" or "skipped".
  runs-on: ubuntu-latest
  needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm]
  if: always()
  steps:
    - name: "Check all deploy test jobs"
      env:
        # Passed through the environment rather than pasted between single quotes
        # in the script: a quote inside any job output would otherwise break or
        # inject into the command line.
        NEEDS_JSON: ${{ toJson(needs) }}
      run: |
        # jq -e exits non-zero when the expression evaluates to false.
        printf '%s' "$NEEDS_JSON" | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'
# ============================================================================
# CLEANUP JOBS
# ============================================================================
clean-k8s-builder:
  # Runs after all build jobs, whatever their outcome, and removes the buildx
  # builder named after this run id and attempt.
  name: Clean K8s builder if exists
  if: always()
  needs:
    - vllm-pipeline
    - sglang-pipeline
    - trtllm-pipeline
    - vllm-dev-pipeline
    - sglang-dev-pipeline
    - trtllm-dev-pipeline
    - vllm-efa-pipeline
    - trtllm-efa-pipeline
    - operator
  runs-on: prod-default-small-v2
  steps:
    - name: Checkout repository
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
    - name: Create K8s builders (skip bootstrap)
      # A failure here must not prevent the removal step below.
      continue-on-error: true
      uses: ./.github/actions/bootstrap-buildkit
      with:
        builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
        buildkit_worker_addresses: ''  # k8s builder
        skip_bootstrap: true
    - name: Builder Cleanup in case of k8s builder
      shell: bash
      # `|| true` keeps the step green when the builder does not exist.
      run: |
        docker buildx rm b-${{ github.run_id }}-${{ github.run_attempt }} || true
cleanup:
  # Runs after the deploy jobs regardless of their outcome and tears down the
  # namespace that deploy-operator reported.
  name: Cleanup AKS resources
  if: always()
  needs:
    - deploy-operator
    - deploy-test-vllm
    - deploy-test-sglang
    - deploy-test-trtllm
  runs-on: prod-default-small-v2
  steps:
    - name: Checkout code
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
    - name: Teardown namespace
      # Skipped when deploy-operator produced no NAMESPACE output.
      if: needs.deploy-operator.outputs.NAMESPACE != ''
      uses: ./.github/actions/teardown-deploy-namespace
      with:
        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
############################## SLACK NOTIFICATION ##############################
notify-slack:
  # Posts a Slack message listing the failed jobs of this run whenever any of
  # the jobs in `needs` has failed.
  name: Notify Slack
  runs-on: prod-builder-amd-v1
  if: always() && failure()
  needs: [ vllm-pipeline, sglang-pipeline, trtllm-pipeline, vllm-dev-pipeline, sglang-dev-pipeline, trtllm-dev-pipeline, vllm-efa-pipeline, trtllm-efa-pipeline, operator, deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm ]
  permissions:
    contents: read
    # Needed by the "list jobs for a workflow run" REST call below.
    actions: read
  steps:
    - name: Get Failed jobs
      shell: bash
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        JOBS_JSON=$(mktemp)
        # GITHUB_API_URL, GITHUB_REPOSITORY and GITHUB_RUN_ID are default runner
        # environment variables. --fail turns HTTP errors into a non-zero exit so
        # the fallback text is used and the Slack step still runs.
        if curl --fail -sSL \
          -H "Authorization: Bearer ${GITHUB_TOKEN}" \
          -H "Accept: application/vnd.github+json" \
          "${GITHUB_API_URL}/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100" \
          -o "$JOBS_JSON"; then
          # One line per failed job: the last " / "-separated segment of the job
          # name, followed by a literal backslash-n that the double-quoted YAML
          # string in the Slack payload turns into a line break.
          FAILED_JOBS=$(jq -r '.jobs[] | select(.conclusion == "failure") | ":failed: " + (.name | split(" / ") | .[-1]) + "\\n"' "$JOBS_JSON")
        else
          FAILED_JOBS=":failed: could not list failed jobs (GitHub API request failed)"
        fi
        echo "$FAILED_JOBS"
        # Multi-line value: use the heredoc form of the GITHUB_ENV file.
        {
          echo "FAILED_JOBS<<EOF"
          echo "$FAILED_JOBS"
          echo "EOF"
        } >> "$GITHUB_ENV"
    - name: Notify Slack
      uses: slackapi/slack-github-action@91efab103c0de0a537f72a35f6b8cda0ee76bf0a  # v2.1.1
      with:
        webhook: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }}
        webhook-type: incoming-webhook
        payload: |
          blocks:
            - type: "section"
              text:
                type: mrkdwn
                text: ":alert: *Github Post-merge Pipeline Failure*"
            - type: "section"
              text:
                type: mrkdwn
                text: "<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|Workflow Summary>"
            - type: "section"
              text:
                type: mrkdwn
                text: "${{ env.FAILED_JOBS }}"
            - type: "section"
              text:
                type: mrkdwn
                text: "@ops-support Please investigate the failures above."