docs: use GitHub admonitions instead of Fern-native callouts (#7370) #696

Workflow file for this run

.github/workflows/post-merge-ci.yml at 34f13a1

	# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	# SPDX-License-Identifier: Apache-2.0

	name: Post-Merge CI Pipeline

	# Runs on every push to main and to release branches.
	on:
	  push:
	    branches:
	      - main
	      # NOTE(review): the captured pattern was 'release/..*'. Git ref names
	      # cannot contain "..", so that filter could never match a branch.
	      # 'release/*.*.*' (e.g. release/1.2.3) is the assumed intent; confirm
	      # against the repository's release-branch naming.
	      - 'release/*.*.*'

	# Workflow-wide default for the GITHUB_TOKEN: read-only repository contents.
	permissions:
	  contents: read

	jobs:
	# ============================================================================
	# FRAMEWORK PIPELINES (Build → Test → Copy)
	# ============================================================================
	# ============================================================================
	# VLLM PIPELINE
	# ============================================================================
	vllm-pipeline:
	  name: vllm
	  # Calls the shared build-test-distribute-flavor-matrix reusable workflow
	  # for the vllm runtime image on both architectures and two CUDA versions.
	  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
	  with:
	    framework: vllm
	    target: runtime
	    platforms: '["amd64", "arm64"]'
	    cuda_versions: '["12.9", "13.0"]'
	    # One tag per line. Both expressions evaluate to an empty string unless
	    # the push is to main.
	    extra_tags: |
	      ${{ github.ref_name == 'main' && 'main-vllm' || '' }}
	      ${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }}
	    # Same builder name is used by every job in this run (see clean-k8s-builder).
	    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
	    build_timeout_minutes: 120
	    copy_timeout_minutes: 20
	    # Marker expressions select tests by merge stage, framework and GPU count.
	    cpu_only_test_markers: '(pre_merge or post_merge) and vllm and gpu_0'
	    single_gpu_test_markers: '(pre_merge or post_merge) and vllm and gpu_1'
	    multi_gpu_test_markers: '(pre_merge or post_merge) and vllm and (gpu_2 or gpu_4)'
	    cpu_only_test_timeout_minutes: 60
	    single_gpu_test_timeout_minutes: 60
	    multi_gpu_test_timeout_minutes: 60
	  secrets: inherit

	# ============================================================================
	# SGLANG PIPELINE
	# ============================================================================
	sglang-pipeline:
	  name: sglang
	  # Calls the shared build-test-distribute-flavor-matrix reusable workflow
	  # for the sglang runtime image on both architectures and two CUDA versions.
	  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
	  with:
	    framework: sglang
	    target: runtime
	    platforms: '["amd64", "arm64"]'
	    cuda_versions: '["12.9", "13.0"]'
	    # One tag per line. Both expressions evaluate to an empty string unless
	    # the push is to main.
	    extra_tags: |
	      ${{ github.ref_name == 'main' && 'main-sglang' || '' }}
	      ${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }}
	    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
	    build_timeout_minutes: 120
	    copy_timeout_minutes: 20
	    # Marker expressions select tests by merge stage, framework and GPU count.
	    cpu_only_test_markers: '(pre_merge or post_merge) and sglang and gpu_0'
	    single_gpu_test_markers: '(pre_merge or post_merge) and sglang and gpu_1'
	    multi_gpu_test_markers: '(pre_merge or post_merge) and sglang and (gpu_2 or gpu_4)'
	    cpu_only_test_timeout_minutes: 60
	    single_gpu_test_timeout_minutes: 60
	    multi_gpu_test_timeout_minutes: 60
	  secrets: inherit

	# ============================================================================
	# TRTLLM PIPELINE
	# ============================================================================
	trtllm-pipeline:
	  name: trtllm
	  # Calls the shared build-test-distribute-flavor-matrix reusable workflow
	  # for the trtllm runtime image; unlike vllm/sglang only CUDA 13.1 is built.
	  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
	  with:
	    framework: trtllm
	    target: runtime
	    platforms: '["amd64", "arm64"]'
	    cuda_versions: '["13.1"]'
	    # One tag per line. Both expressions evaluate to an empty string unless
	    # the push is to main.
	    extra_tags: |
	      ${{ github.ref_name == 'main' && 'main-trtllm' || '' }}
	      ${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }}
	    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
	    build_timeout_minutes: 120
	    copy_timeout_minutes: 20
	    # Marker expressions select tests by merge stage, framework and GPU count.
	    cpu_only_test_markers: '(pre_merge or post_merge) and trtllm and gpu_0'
	    single_gpu_test_markers: '(pre_merge or post_merge) and trtllm and gpu_1'
	    multi_gpu_test_markers: '(pre_merge or post_merge) and trtllm and (gpu_2 or gpu_4)'
	    cpu_only_test_timeout_minutes: 60
	    single_gpu_test_timeout_minutes: 60
	    multi_gpu_test_timeout_minutes: 60
	  secrets: inherit

	# ============================================================================
	# DEV PIPELINES
	# ============================================================================
	vllm-dev-pipeline:
	  # vllm image for the "dev" target; all three test stages are disabled.
	  name: vllm-dev
	  secrets: inherit
	  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
	  with:
	    framework: vllm
	    target: dev
	    cuda_versions: '["12.9", "13.0"]'
	    platforms: '["amd64", "arm64"]'
	    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
	    build_timeout_minutes: 60
	    # No CPU, single-GPU or multi-GPU tests for this target.
	    run_cpu_only_tests: false
	    run_single_gpu_tests: false
	    run_multi_gpu_tests: false

	sglang-dev-pipeline:
	  # sglang image for the "dev" target; all three test stages are disabled.
	  name: sglang-dev
	  secrets: inherit
	  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
	  with:
	    framework: sglang
	    target: dev
	    cuda_versions: '["12.9", "13.0"]'
	    platforms: '["amd64", "arm64"]'
	    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
	    build_timeout_minutes: 60
	    # No CPU, single-GPU or multi-GPU tests for this target.
	    run_cpu_only_tests: false
	    run_single_gpu_tests: false
	    run_multi_gpu_tests: false

	trtllm-dev-pipeline:
	  # trtllm image for the "dev" target; all three test stages are disabled.
	  name: trtllm-dev
	  secrets: inherit
	  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
	  with:
	    framework: trtllm
	    target: dev
	    cuda_versions: '["13.1"]'
	    platforms: '["amd64", "arm64"]'
	    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
	    build_timeout_minutes: 60
	    # No CPU, single-GPU or multi-GPU tests for this target.
	    run_cpu_only_tests: false
	    run_single_gpu_tests: false
	    run_multi_gpu_tests: false

	# ============================================================================
	# EFA PIPELINES (Build only, amd64)
	# ============================================================================
	# ============================================================================
	# VLLM EFA PIPELINE
	# ============================================================================
	vllm-efa-pipeline:
	  # EFA flavor of the vllm runtime image: amd64 / CUDA 12.9 only, CPU-only
	  # tests, and no copy to ACR.
	  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
	  with:
	    framework: vllm
	    target: runtime
	    platforms: '["amd64"]'
	    cuda_versions: '["12.9"]'
	    make_efa: true
	    # One tag per line. Both expressions evaluate to an empty string unless
	    # the push is to main.
	    extra_tags: |
	      ${{ github.ref_name == 'main' && 'main-vllm-efa' || '' }}
	      ${{ github.ref_name == 'main' && format('main-vllm-efa-{0}', github.sha) || '' }}
	    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
	    build_timeout_minutes: 120
	    copy_timeout_minutes: 20
	    cpu_only_test_markers: '(pre_merge or post_merge) and vllm and gpu_0'
	    cpu_only_test_timeout_minutes: 60
	    run_single_gpu_tests: false
	    run_multi_gpu_tests: false
	    copy_to_acr: false
	  secrets: inherit

	# ============================================================================
	# TRTLLM EFA PIPELINE
	# ============================================================================
	trtllm-efa-pipeline:
	  # EFA flavor of the trtllm runtime image: amd64 / CUDA 13.1 only, CPU-only
	  # tests, and no copy to ACR.
	  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
	  with:
	    framework: trtllm
	    target: runtime
	    platforms: '["amd64"]'
	    cuda_versions: '["13.1"]'
	    make_efa: true
	    # One tag per line. Both expressions evaluate to an empty string unless
	    # the push is to main.
	    extra_tags: |
	      ${{ github.ref_name == 'main' && 'main-trtllm-efa' || '' }}
	      ${{ github.ref_name == 'main' && format('main-trtllm-efa-{0}', github.sha) || '' }}
	    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
	    build_timeout_minutes: 120
	    copy_timeout_minutes: 20
	    cpu_only_test_markers: '(pre_merge or post_merge) and trtllm and gpu_0'
	    cpu_only_test_timeout_minutes: 60
	    run_single_gpu_tests: false
	    run_multi_gpu_tests: false
	    copy_to_acr: false
	  secrets: inherit

	# ============================================================================
	# Operator
	# ============================================================================

	operator:
	  # Lints, tests and codegen-checks the operator under ./deploy/operator,
	  # then builds a multi-arch image and pushes it to ECR and ACR.
	  name: Operator
	  runs-on: prod-default-v2
	  env:
	    IMAGE_REGISTRY: ai-dynamo
	    IMAGE_REPOSITORY: dynamo
	    ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
	  outputs:
	    # Per-commit tag ("<sha>-operator") consumed by deploy-operator.
	    operator_default_tag: ${{ steps.build-and-push-image.outputs.operator_default_tag }}
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
	    - name: Initialize Dynamo Builder
	      uses: ./.github/actions/init-dynamo-builder
	      with:
	        builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
	        flavor: general
	        all_arch: 'true'
	    - name: Docker Login
	      uses: ./.github/actions/docker-login
	      with:
	        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
	        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
	        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
	        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
	        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
	    # Linter and Tester are Dockerfile build targets; both run for arm64 only.
	    - name: Linter
	      shell: bash
	      working-directory: ./deploy/operator
	      run: |
	        docker buildx build --platform linux/arm64 --target linter --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ .
	    - name: Tester
	      shell: bash
	      working-directory: ./deploy/operator
	      run: |
	        docker buildx build --platform linux/arm64 --target tester --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ .
	    - name: Set up Go
	      uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00  # v6.0.0
	      with:
	        go-version: '1.25'
	    # NOTE(review): referenced by mutable tag, unlike the SHA-pinned actions
	    # elsewhere in this job; consider pinning to a full commit SHA.
	    - name: Set up Python
	      uses: actions/setup-python@v5
	      with:
	        python-version: "3.11"
	    - name: Install Python dependencies for operator codegen
	      shell: bash
	      working-directory: ./deploy/operator
	      run: |
	        python -m pip install --upgrade pip
	        python -m pip install "pydantic>=2,<3" "black==23.1.0" "pyyaml>=6.0"
	    - name: Check for uncommitted changes
	      shell: bash
	      working-directory: ./deploy/operator
	      run: |
	        make check
	    - name: Build and push Container
	      id: build-and-push-image
	      shell: bash
	      working-directory: ./deploy/operator
	      env:
	        NO_CACHE_FLAG: ''  # placeholder for future logic to add no cache flag if needed
	        # Passed through the environment instead of being expanded into the
	        # script text, so the branch name is never parsed as shell code.
	        REF_NAME: ${{ github.ref_name }}
	      run: |
	        ECR_DEFAULT_IMAGE_BASE="${ECR_HOSTNAME}/${IMAGE_REGISTRY}/${IMAGE_REPOSITORY}"
	        DEFAULT_TAG="${{ github.sha }}-operator"
	        ACR_IMAGE_BASE="${{ secrets.AZURE_ACR_HOSTNAME }}/${IMAGE_REGISTRY}/${IMAGE_REPOSITORY}"
	        IMAGE_URIS=(
	          "${ECR_DEFAULT_IMAGE_BASE}:${DEFAULT_TAG}"
	          "${ACR_IMAGE_BASE}:${DEFAULT_TAG}"
	        )

	        # The floating "main-operator" tag is only moved by pushes to main.
	        if [[ "${REF_NAME}" == "main" ]]; then
	          IMAGE_URIS+=(
	            "${ECR_DEFAULT_IMAGE_BASE}:main-operator"
	            "${ACR_IMAGE_BASE}:main-operator"
	          )
	        fi

	        echo "operator_default_tag=${DEFAULT_TAG}" >> "$GITHUB_OUTPUT"
	        TAGGING_FLAGS=$(printf -- '-t %s ' "${IMAGE_URIS[@]}")
	        echo "flags for docker buildx: ${TAGGING_FLAGS}"

	        if [[ "$NO_CACHE_FLAG" == "true" ]]; then
	          NO_CACHE_FLAG="--no-cache"
	        fi
	        # NO_CACHE_FLAG and TAGGING_FLAGS are intentionally unquoted: they must
	        # word-split into separate arguments (or vanish when empty).
	        docker buildx build --push ${NO_CACHE_FLAG} \
	          --platform linux/amd64,linux/arm64 \
	          --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \
	          ${TAGGING_FLAGS} -f Dockerfile .

	        # Markdown table of every pushed image URI for the run summary page.
	        echo "### 🐳 Operator Container Images" >> "$GITHUB_STEP_SUMMARY"
	        echo "" >> "$GITHUB_STEP_SUMMARY"
	        echo "| Image URI |" >> "$GITHUB_STEP_SUMMARY"
	        echo "|-----|" >> "$GITHUB_STEP_SUMMARY"
	        for image_uri in "${IMAGE_URIS[@]}"; do
	          echo "| \`${image_uri}\` |" >> "$GITHUB_STEP_SUMMARY"
	        done

	# ============================================================================
	# DEPLOYMENT JOBS
	# Deploy operator and run end-to-end tests on Kubernetes cluster
	# ============================================================================

	deploy-operator:
	  # Picks the operator image tag, derives a per-run namespace name and hands
	  # both to the setup-deploy-namespace action. The two outputs are read by
	  # the deploy-test-* jobs and by cleanup.
	  runs-on: prod-default-small-v2
	  needs: [operator]
	  outputs:
	    NAMESPACE: ${{ steps.namespace.outputs.namespace }}
	    OPERATOR_TAG: ${{ steps.operator-tag.outputs.tag }}
	  steps:
	    # Pinned to the same commit as every other checkout in this workflow.
	    - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
	    - name: Determine operator tag
	      id: operator-tag
	      env:
	        # Job context values are passed via env rather than expanded into
	        # the script text.
	        OPERATOR_RESULT: ${{ needs.operator.result }}
	        OPERATOR_DEFAULT_TAG: ${{ needs.operator.outputs.operator_default_tag }}
	      run: |
	        # NOTE(review): this job has no `if:` override, so it presumably only
	        # runs when `operator` succeeded; the fallback looks purely defensive.
	        if [ "${OPERATOR_RESULT}" == "success" ]; then
	          TAG="${OPERATOR_DEFAULT_TAG}"
	        else
	          TAG="main-operator"
	        fi
	        echo "tag=${TAG}" >> "$GITHUB_OUTPUT"
	        echo "Using operator tag: ${TAG}"
	    - name: Generate namespace name
	      id: namespace
	      env:
	        BRANCH: ${{ github.ref_name }}
	      run: |
	        # Sanitize branch name for k8s namespace
	        # Invalid patterns: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
	        # "/" and "." become "-", "pull-request" is shortened to "pr", and the
	        # result is truncated to 10 characters.
	        BRANCH_SANITIZED="${BRANCH//\//-}"
	        BRANCH_SANITIZED="${BRANCH_SANITIZED/pull-request/pr}"
	        BRANCH_SANITIZED="${BRANCH_SANITIZED//./-}"
	        BRANCH_SANITIZED="${BRANCH_SANITIZED:0:10}"
	        NAMESPACE="gh-id-${{ github.run_id }}-${BRANCH_SANITIZED}-dt"
	        echo "namespace=${NAMESPACE}" >> "$GITHUB_OUTPUT"
	    - name: Setup namespace and operator
	      uses: ./.github/actions/setup-deploy-namespace
	      with:
	        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
	        namespace: ${{ steps.namespace.outputs.namespace }}
	        registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
	        operator_tag: ${{ steps.operator-tag.outputs.tag }}
	        hf_token: ${{ secrets.HF_TOKEN }}

	# ============================================================================
	# End-to-end tests for each framework with various deployment profiles
	# ============================================================================

	deploy-test-vllm:
	  # One deploy test per profile against the amd64 vllm runtime image built
	  # for this commit; at most two profiles run concurrently.
	  name: deploy-test-vllm (${{ matrix.profile }})
	  needs:
	    - deploy-operator
	    - vllm-pipeline
	  runs-on: prod-default-small-v2
	  timeout-minutes: 25
	  permissions:
	    contents: read
	  env:
	    FRAMEWORK: vllm
	  strategy:
	    # A failing profile does not cancel the remaining ones.
	    fail-fast: false
	    max-parallel: 2
	    matrix:
	      profile: [agg, agg_router, disagg, disagg_router]
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
	    - name: Run Dynamo Deploy Test
	      id: deploy-test
	      uses: ./.github/actions/dynamo-deploy-test
	      with:
	        framework: ${{ env.FRAMEWORK }}
	        profile: ${{ matrix.profile }}
	        platform_arch: amd64
	        image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-vllm-runtime-cuda12-amd64
	        # Namespace and operator tag come from the deploy-operator job.
	        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
	        operator_tag: ${{ needs.deploy-operator.outputs.OPERATOR_TAG }}
	        registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
	        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
	        hf_token: ${{ secrets.HF_TOKEN }}

	deploy-test-sglang:
	  # One deploy test per profile against the amd64 sglang runtime image built
	  # for this commit; only the two aggregated profiles are exercised.
	  name: deploy-test-sglang (${{ matrix.profile }})
	  needs:
	    - deploy-operator
	    - sglang-pipeline
	  runs-on: prod-default-small-v2
	  timeout-minutes: 25
	  permissions:
	    contents: read
	  env:
	    FRAMEWORK: sglang
	  strategy:
	    # A failing profile does not cancel the remaining ones.
	    fail-fast: false
	    max-parallel: 2
	    matrix:
	      profile: [agg, agg_router]
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
	    - name: Run Dynamo Deploy Test
	      id: deploy-test
	      uses: ./.github/actions/dynamo-deploy-test
	      with:
	        framework: ${{ env.FRAMEWORK }}
	        profile: ${{ matrix.profile }}
	        platform_arch: amd64
	        image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-sglang-runtime-cuda12-amd64
	        # Namespace and operator tag come from the deploy-operator job.
	        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
	        operator_tag: ${{ needs.deploy-operator.outputs.OPERATOR_TAG }}
	        registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
	        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
	        hf_token: ${{ secrets.HF_TOKEN }}

	deploy-test-trtllm:
	  # One deploy test per profile against the amd64 trtllm runtime image
	  # (cuda13 tag) built for this commit.
	  name: deploy-test-trtllm (${{ matrix.profile }})
	  needs:
	    - deploy-operator
	    - trtllm-pipeline
	  runs-on: prod-default-small-v2
	  timeout-minutes: 25
	  permissions:
	    contents: read
	  env:
	    FRAMEWORK: trtllm
	  strategy:
	    # A failing profile does not cancel the remaining ones.
	    fail-fast: false
	    max-parallel: 2
	    matrix:
	      profile:
	        - agg
	        - agg_router
	        # Disabled: trtllm disagg profiles consistently timeout (~32 min) with 0% success rate.
	        # Re-enable once the underlying disagg deployment issue is resolved.
	        # - disagg
	        # - disagg_router
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
	    - name: Run Dynamo Deploy Test
	      id: deploy-test
	      uses: ./.github/actions/dynamo-deploy-test
	      with:
	        framework: ${{ env.FRAMEWORK }}
	        profile: ${{ matrix.profile }}
	        platform_arch: amd64
	        image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-trtllm-runtime-cuda13-amd64
	        # Namespace and operator tag come from the deploy-operator job.
	        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
	        operator_tag: ${{ needs.deploy-operator.outputs.OPERATOR_TAG }}
	        registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
	        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
	        hf_token: ${{ secrets.HF_TOKEN }}

	deploy-status-check:
	  # Single pass/fail gate over the deploy jobs: always runs, and fails unless
	  # every needed job finished as "success" or "skipped".
	  runs-on: ubuntu-latest
	  needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm]
	  if: always()
	  steps:
	    - name: "Check all deploy test jobs"
	      env:
	        # Passed via env so quotes inside job outputs cannot break out of the
	        # shell string the JSON used to be pasted into.
	        NEEDS_JSON: ${{ toJson(needs) }}
	      run: |
	        # jq -e exits non-zero when the filter yields false, failing the step.
	        printf '%s\n' "${NEEDS_JSON}" | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'

	# ============================================================================
	# CLEANUP JOBS
	# ============================================================================

	clean-k8s-builder:
	  # Removes the per-run buildx builder once every job that uses it has
	  # finished, whatever their outcome.
	  name: Clean K8s builder if exists
	  runs-on: prod-default-small-v2
	  if: always()
	  needs: [vllm-pipeline, sglang-pipeline, trtllm-pipeline, vllm-dev-pipeline, sglang-dev-pipeline, trtllm-dev-pipeline, vllm-efa-pipeline, trtllm-efa-pipeline, operator]
	  steps:
	    - name: Checkout repository
	      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
	    # Best effort: a failure here must not prevent the removal step below.
	    - name: Create K8s builders (skip bootstrap)
	      uses: ./.github/actions/bootstrap-buildkit
	      continue-on-error: true
	      with:
	        builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
	        buildkit_worker_addresses: ''  # k8s builder
	        skip_bootstrap: true
	    - name: Builder Cleanup in case of k8s builder
	      shell: bash
	      run: |
	        # "|| true" keeps the step green when the builder does not exist.
	        docker buildx rm b-${{ github.run_id }}-${{ github.run_attempt }} || true

	cleanup:
	  # Tears down the per-run namespace after the deploy jobs, regardless of
	  # how they ended.
	  name: Cleanup AKS resources
	  if: always()
	  needs:
	    - deploy-operator
	    - deploy-test-vllm
	    - deploy-test-sglang
	    - deploy-test-trtllm
	  runs-on: prod-default-small-v2
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
	    - name: Teardown namespace
	      uses: ./.github/actions/teardown-deploy-namespace
	      # Nothing to tear down when deploy-operator produced no namespace name.
	      if: needs.deploy-operator.outputs.NAMESPACE != ''
	      with:
	        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
	        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}

	############################## SLACK NOTIFICATION ##############################
	notify-slack:
	  # Posts a Slack message listing the failed jobs of this run whenever any
	  # of the needed jobs failed.
	  name: Notify Slack
	  runs-on: prod-builder-amd-v1
	  if: always() && failure()
	  needs: [ vllm-pipeline, sglang-pipeline, trtllm-pipeline, vllm-dev-pipeline, sglang-dev-pipeline, trtllm-dev-pipeline, vllm-efa-pipeline, trtllm-efa-pipeline, operator, deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm ]
	  # NOTE(review): the jobs API call below may need `actions: read` on a
	  # private repository; confirm if this workflow is reused outside a public one.
	  permissions:
	    contents: read
	  steps:
	    - name: Get Failed jobs
	      shell: bash
	      env:
	        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	      run: |
	        JOBS_JSON=$(mktemp)
	        # Only the first page (up to 100 jobs) of the run is inspected.
	        curl -sSL \
	          -H "Authorization: Bearer ${GITHUB_TOKEN}" \
	          -H "Accept: application/vnd.github+json" \
	          "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" \
	          >"$JOBS_JSON"

	        # One ":failed: <name>\n" entry per failed job; for "a / b / c" style
	        # names only the last segment is kept. The literal "\n" is emitted on
	        # purpose: it sits inside a double-quoted YAML string in the payload.
	        FAILED_JOBS=$(jq -r '.jobs[] | select(.conclusion == "failure") | ":failed: " + (.name | split(" / ") | .[-1]) + "\\n"' "$JOBS_JSON")
	        echo $FAILED_JOBS
	        # Multi-line value exported to later steps via the heredoc syntax.
	        {
	          echo "FAILED_JOBS<<EOF"
	          echo "$FAILED_JOBS"
	          echo "EOF"
	        } >> "$GITHUB_ENV"
	    - name: Notify Slack
	      uses: slackapi/slack-github-action@91efab103c0de0a537f72a35f6b8cda0ee76bf0a  # v2.1.1
	      with:
	        webhook: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }}
	        webhook-type: incoming-webhook
	        # The run link is built from the github context instead of a
	        # hard-coded repository, matching the API call in the step above.
	        payload: |
	          blocks:
	            - type: "section"
	              text:
	                type: mrkdwn
	                text: ":alert: Github Post-merge Pipeline Failure"
	            - type: "section"
	              text:
	                type: mrkdwn
	                text: "<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|Workflow Summary>"
	            - type: "section"
	              text:
	                type: mrkdwn
	                text: "${{ env.FAILED_JOBS }}"
	            - type: "section"
	              text:
	                type: mrkdwn
	                text: "@ops-support Please investigate the failures above."

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

docs: use GitHub admonitions instead of Fern-native callouts (#7370) #696

Workflow file

docs: use GitHub admonitions instead of Fern-native callouts (#7370) #696

Uh oh!

Workflow file for this run