@@ -304,6 +304,145 @@ jobs:
304304 test_type : " e2e, gpu_1"
305305 platform_arch : ${{ matrix.platform.arch }}
306306
# Fault-tolerance deployment tests.
# For each framework in the matrix this job creates an ephemeral Kubernetes
# namespace, installs the dynamo-platform Helm chart into it, runs the matching
# fault-tolerance pytest scenario, and finally tears the namespace down.
deploy-test-fault-tolerance:
  runs-on: cpu-amd-m5-2xlarge
  if: needs.changed-files.outputs.has_code_changes == 'true'
  needs: [changed-files, operator, vllm, trtllm, sglang]
  permissions:
    contents: read
  strategy:
    fail-fast: false
    # Run matrix jobs sequentially to prevent a Helm race condition.
    # Parallel jobs conflict on ClusterRole ownership when installing the chart.
    # Error: ClusterRole "...-operator" exists... cannot be imported... current value is "...-ft-vllm"
    max-parallel: 1
    matrix:
      framework:
        - name: vllm
          test_scenario: vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod
        - name: trtllm
          test_scenario: trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod
        - name: sglang
          test_scenario: sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod
  name: deploy-test-fault-tolerance (${{ matrix.framework.name }})
  env:
    DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
  steps:
    - name: Checkout code
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
    - name: Set namespace
      env:
        FRAMEWORK: ${{ matrix.framework.name }}
        # Passed through the environment so the secret never becomes script text.
        KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
      run: |
        # Namespace is unique per workflow run and per framework.
        # Values appended to $GITHUB_ENV only become visible in LATER steps, so
        # keep a local copy for the kubectl call further down in this step.
        NAMESPACE="gh-job-id-${{ github.run_id }}-ft-${FRAMEWORK}"
        echo "NAMESPACE=${NAMESPACE}" >> "$GITHUB_ENV"

        # Setup kubeconfig (done before enabling xtrace so the decoded
        # credential is not echoed as part of a traced command).
        echo "${KUBECONFIG_B64}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG="$(pwd)/.kubeconfig"

        set -x
        kubectl config set-context --current --namespace="${NAMESPACE}" --kubeconfig "${KUBECONFIG}"
        kubectl config current-context
    - name: Deploy Operator
      env:
        HF_TOKEN_VALUE: ${{ secrets.HF_TOKEN }}
        AZURE_ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }}
        AZURE_ACR_USER: ${{ secrets.AZURE_ACR_USER }}
        AZURE_ACR_PASSWORD: ${{ secrets.AZURE_ACR_PASSWORD }}
      run: |
        set -x
        export KUBECONFIG="$(pwd)/.kubeconfig"

        # Create a namespace for this job
        echo "Creating an ephemeral namespace..."
        kubectl delete namespace "${NAMESPACE}" || true
        kubectl create namespace "${NAMESPACE}" || true
        echo "Attaching the labels for secrets and cleanup"
        kubectl label namespaces "${NAMESPACE}" nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true

        # Set the namespace as default
        kubectl config set-context --current --namespace="${NAMESPACE}"

        # Check if Istio is installed
        kubectl get pods -n istio-system
        # Check if default storage class exists
        kubectl get storageclass

        # Install Helm chart
        export VIRTUAL_ENV=/opt/dynamo/venv
        export KUBE_NS="${NAMESPACE}"
        export ISTIO_ENABLED=true
        export ISTIO_GATEWAY=istio-system/ingress-alb
        export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
        export DYNAMO_CLOUD="https://${NAMESPACE}.${DYNAMO_INGRESS_SUFFIX}"

        # Secret values are quoted and read from the environment so that shell
        # metacharacters inside a token/password cannot alter the command.
        # xtrace is paused while they are on the command line.
        set +x
        # Install dynamo env secrets
        kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN="${HF_TOKEN_VALUE}" -n "${KUBE_NS}" || true
        # Create docker pull secret for operator image
        kubectl create secret docker-registry docker-imagepullsecret --docker-server="${AZURE_ACR_HOSTNAME}" --docker-username="${AZURE_ACR_USER}" --docker-password="${AZURE_ACR_PASSWORD}" --namespace="${NAMESPACE}"
        set -x

        # Install helm dependencies
        helm repo add bitnami https://charts.bitnami.com/bitnami
        cd deploy/cloud/helm/platform/
        helm dep build .
        # Install platform with namespace restriction for single profile testing.
        # The --set arguments are quoted so "[0]" is never treated as a glob.
        helm upgrade --install dynamo-platform . --namespace "${NAMESPACE}" \
          --set "dynamo-operator.namespaceRestriction.enabled=true" \
          --set "dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE}" \
          --set "dynamo-operator.controllerManager.manager.image.repository=${AZURE_ACR_HOSTNAME}/ai-dynamo/dynamo" \
          --set "dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64" \
          --set "dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret" \
          --timeout 10m --wait
        # Wait for all deployments to be ready
        timeout 300s kubectl rollout status deployment -n "${NAMESPACE}" --watch
        cd -
    - name: Run Fault Tolerance Tests
      env:
        FRAMEWORK: ${{ matrix.framework.name }}
        TEST_SCENARIO: ${{ matrix.framework.test_scenario }}
        AZURE_ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }}
      run: |
        set -x
        export KUBECONFIG="$(pwd)/.kubeconfig"
        # NAMESPACE is already in the environment (exported via $GITHUB_ENV
        # by the "Set namespace" step).
        export IMAGE="${AZURE_ACR_HOSTNAME}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64"

        echo "Running fault tolerance test: ${TEST_SCENARIO}"
        echo "Using namespace: $NAMESPACE"
        echo "Using image: $IMAGE"

        # Install python3-venv package if not already installed
        sudo apt-get update && sudo apt-get install -y python3-venv

        # Set up Python virtual environment and install test dependencies
        python3 -m venv venv
        source venv/bin/activate
        pip install --upgrade pip
        pip install -r container/deps/requirements.test.txt
        pip install kubernetes==32.0.1 kubernetes_asyncio kr8s pyyaml requests tabulate pydantic

        # Run the pytest command (tests orchestrate K8s, don't need dynamo package)
        pytest tests/fault_tolerance/deploy/test_deployment.py \
          -m 'k8s and fault_tolerance' \
          -k "${TEST_SCENARIO}" \
          -s -v \
          --namespace "${NAMESPACE}" \
          --image "${IMAGE}" \
          --client-type legacy
    - name: Cleanup
      if: always()
      timeout-minutes: 5
      env:
        KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
      run: |
        # This step runs even when earlier steps failed. If NAMESPACE was never
        # exported there is nothing of ours to remove, and continuing would make
        # helm/kubectl act on whatever namespace the context defaults to.
        if [ -z "${NAMESPACE:-}" ]; then
          echo "NAMESPACE is not set; skipping cleanup."
          exit 0
        fi

        echo "${KUBECONFIG_B64}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG="$(pwd)/.kubeconfig"
        # Do not leave cluster credentials behind in the workspace.
        trap 'rm -f "${KUBECONFIG}"' EXIT
        kubectl config set-context --current --namespace="${NAMESPACE}" --kubeconfig "${KUBECONFIG}"

        # For debugging purposes, list all the resources before we uninstall.
        # Best-effort: a failing listing must not prevent the teardown below.
        kubectl get all || true

        echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
        kubectl delete dynamographdeployments --all -n "${NAMESPACE}" || true

        # Uninstall the helm chart
        helm ls || true
        helm uninstall dynamo-platform --namespace "${NAMESPACE}" || true

        echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
        kubectl delete namespace "${NAMESPACE}" || true
        echo "Namespace $NAMESPACE completed."
445+
307446 # Upload metrics for this workflow and all its jobs
308447 upload-workflow-metrics :
309448 name : Upload Workflow Metrics
0 commit comments