Skip to content

Commit d1cf3c2

Browse files
authored
ci: Add Fault Tolerance K8s test (#3801)
Signed-off-by: Indrajit Bhosale <[email protected]>
1 parent 3ff5fa1 commit d1cf3c2

File tree

5 files changed

+163
-5
lines changed

5 files changed

+163
-5
lines changed

.github/workflows/container-validation-backends.yml

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,145 @@ jobs:
304304
test_type: "e2e, gpu_1"
305305
platform_arch: ${{ matrix.platform.arch }}
306306

307+
deploy-test-fault-tolerance:
308+
runs-on: cpu-amd-m5-2xlarge
309+
if: needs.changed-files.outputs.has_code_changes == 'true'
310+
needs: [changed-files, operator, vllm, trtllm, sglang]
311+
permissions:
312+
contents: read
313+
strategy:
314+
fail-fast: false
315+
# Run matrix jobs sequentially to prevent a Helm race condition
316+
# Parallel jobs conflict on ClusterRole ownership when installing the chart.
317+
# Error: ClusterRole "...-operator" exists... cannot be imported... current value is "...-ft-vllm"
318+
max-parallel: 1
319+
matrix:
320+
framework:
321+
- { name: vllm, test_scenario: vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
322+
- { name: trtllm, test_scenario: trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
323+
- { name: sglang, test_scenario: sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
324+
name: deploy-test-fault-tolerance (${{ matrix.framework.name }})
325+
env:
326+
DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
327+
steps:
328+
- name: Checkout code
329+
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
330+
- name: Set namespace
331+
run: |
332+
# Set namespace using test scenario
333+
export FRAMEWORK=${{ matrix.framework.name }}
334+
echo "NAMESPACE=gh-job-id-${{ github.run_id }}-ft-${FRAMEWORK}" >> $GITHUB_ENV
335+
set -x
336+
337+
# Setup kubeconfig
338+
echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
339+
chmod 600 .kubeconfig
340+
export KUBECONFIG=$(pwd)/.kubeconfig
341+
kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
342+
kubectl config current-context
343+
- name: Deploy Operator
344+
run: |
345+
set -x
346+
export KUBECONFIG=$(pwd)/.kubeconfig
347+
348+
# Create a namespace for this job
349+
echo "Creating an ephemeral namespace..."
350+
kubectl delete namespace $NAMESPACE || true
351+
kubectl create namespace $NAMESPACE || true
352+
echo "Attaching the labels for secrets and cleanup"
353+
kubectl label namespaces ${NAMESPACE} nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true
354+
355+
# Set the namespace as default
356+
kubectl config set-context --current --namespace=$NAMESPACE
357+
358+
# Check if Istio is installed
359+
kubectl get pods -n istio-system
360+
# Check if default storage class exists
361+
kubectl get storageclass
362+
363+
# Install Helm chart
364+
export VIRTUAL_ENV=/opt/dynamo/venv
365+
export KUBE_NS=$NAMESPACE
366+
export ISTIO_ENABLED=true
367+
export ISTIO_GATEWAY=istio-system/ingress-alb
368+
export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
369+
export DYNAMO_CLOUD=https://${NAMESPACE}.${DYNAMO_INGRESS_SUFFIX}
370+
371+
# Install dynamo env secrets
372+
kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n $KUBE_NS || true
373+
# Create docker pull secret for operator image
374+
kubectl create secret docker-registry docker-imagepullsecret --docker-server=${{ secrets.AZURE_ACR_HOSTNAME }} --docker-username=${{ secrets.AZURE_ACR_USER }} --docker-password=${{ secrets.AZURE_ACR_PASSWORD }} --namespace=${NAMESPACE}
375+
# Install helm dependencies
376+
helm repo add bitnami https://charts.bitnami.com/bitnami
377+
cd deploy/cloud/helm/platform/
378+
helm dep build .
379+
# Install platform with namespace restriction for single profile testing
380+
helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
381+
--set dynamo-operator.namespaceRestriction.enabled=true \
382+
--set dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE} \
383+
--set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \
384+
--set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
385+
--set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret \
386+
--timeout 10m --wait
387+
# Wait for all deployments to be ready
388+
timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch
389+
cd -
390+
391+
export KUBECONFIG=$(pwd)/.kubeconfig
392+
kubectl config set-context --current --namespace=$NAMESPACE
393+
- name: Run Fault Tolerance Tests
394+
run: |
395+
set -x
396+
export KUBECONFIG=$(pwd)/.kubeconfig
397+
export NAMESPACE=$NAMESPACE
398+
export FRAMEWORK=${{ matrix.framework.name }}
399+
export IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64"
400+
401+
echo "Running fault tolerance test: ${{ matrix.framework.test_scenario }}"
402+
echo "Using namespace: $NAMESPACE"
403+
echo "Using image: $IMAGE"
404+
405+
# Install python3-venv package if not already installed
406+
sudo apt-get update && sudo apt-get install -y python3-venv
407+
408+
# Set up Python virtual environment and install test dependencies
409+
python3 -m venv venv
410+
source venv/bin/activate
411+
pip install --upgrade pip
412+
pip install -r container/deps/requirements.test.txt
413+
pip install kubernetes==32.0.1 kubernetes_asyncio kr8s pyyaml requests tabulate pydantic
414+
415+
# Run the pytest command (tests orchestrate K8s, don't need dynamo package)
416+
pytest tests/fault_tolerance/deploy/test_deployment.py \
417+
-m 'k8s and fault_tolerance' \
418+
-k '${{ matrix.framework.test_scenario }}' \
419+
-s -v \
420+
--namespace ${NAMESPACE} \
421+
--image ${IMAGE} \
422+
--client-type legacy
423+
- name: Cleanup
424+
if: always()
425+
timeout-minutes: 5
426+
run: |
427+
echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
428+
chmod 600 .kubeconfig
429+
export KUBECONFIG=$(pwd)/.kubeconfig
430+
kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
431+
432+
# For debugging purposes, list all the resources before we uninstall
433+
kubectl get all
434+
435+
echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
436+
kubectl delete dynamographdeployments --all -n $NAMESPACE || true
437+
438+
# Uninstall the helm chart
439+
helm ls
440+
helm uninstall dynamo-platform || true
441+
442+
echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
443+
kubectl delete namespace $NAMESPACE || true
444+
echo "Namespace $NAMESPACE completed."
445+
307446
# Upload metrics for this workflow and all its jobs
308447
upload-workflow-metrics:
309448
name: Upload Workflow Metrics

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,9 @@ markers = [
200200
"h100: marks tests to run on H100",
201201
"kvbm: marks tests for KV behavior and model determinism",
202202
"model: model id used by a test or parameter",
203-
"custom_build: marks tests that require custom builds or special setup (e.g., MoE models)"
203+
"custom_build: marks tests that require custom builds or special setup (e.g., MoE models)",
204+
"k8s: marks tests as requiring Kubernetes",
205+
"fault_tolerance: marks tests as fault tolerance tests"
204206
]
205207

206208
# Linting/formatting

tests/fault_tolerance/deploy/legacy_client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,7 @@ def client(
274274
)
275275

276276
# Log result
277-
logger.info(
277+
logger.debug(
278278
f"Request: {i} Pod {pod_name} Local Port {port} "
279279
f"Status: {result['results'][-1]['status']} "
280280
f"Latency: {result['results'][-1]['request_elapsed_time']}"

tests/fault_tolerance/deploy/test_deployment.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,8 @@ def results_summary():
340340
logging.error(f"Failed to parse combined results: {e}")
341341

342342

343+
@pytest.mark.k8s
344+
@pytest.mark.fault_tolerance
343345
@pytest.mark.e2e
344346
@pytest.mark.slow
345347
@pytest.mark.filterwarnings("ignore::DeprecationWarning")

tests/utils/managed_deployment.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,22 @@
1818
from kr8s.objects import Service as kr8s_Service
1919
from kubernetes_asyncio import client, config
2020

21-
from dynamo.common.utils.paths import get_workspace_dir
21+
22+
def _get_workspace_dir() -> str:
23+
"""Get workspace directory without depending on dynamo.common package.
24+
25+
This allows tests to run without requiring dynamo package to be installed.
26+
"""
27+
# Start from this file's location and walk up to find workspace root
28+
current = os.path.dirname(os.path.abspath(__file__))
29+
while current != os.path.dirname(current): # Stop at filesystem root
30+
# Workspace root has pyproject.toml
31+
if os.path.exists(os.path.join(current, "pyproject.toml")):
32+
return current
33+
current = os.path.dirname(current)
34+
35+
# Fallback: assume workspace is 3 levels up from tests/utils/
36+
return os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
2237

2338

2439
class ServiceSpec:
@@ -877,8 +892,8 @@ async def main():
877892
datefmt=DATE_FORMAT, # ISO 8601 UTC format
878893
)
879894

880-
# Get workspace directory using centralized logic
881-
workspace_dir = get_workspace_dir()
895+
# Get workspace directory
896+
workspace_dir = _get_workspace_dir()
882897

883898
deployment_spec = DeploymentSpec(
884899
os.path.join(workspace_dir, "examples/backends/vllm/deploy/agg.yaml")

0 commit comments

Comments (0)