Skip to content

Commit 9d76583

Browse files
ci: OPS-1745: Deploy operator once for deploy tests (#4022)
Signed-off-by: Dillon Cullinan <[email protected]> Signed-off-by: Dillon Cullinan <[email protected]>
1 parent b73e6eb commit 9d76583

File tree

1 file changed

+110
-48
lines changed

1 file changed

+110
-48
lines changed

.github/workflows/container-validation-backends.yml

Lines changed: 110 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -341,51 +341,38 @@ jobs:
341341
# Upload complete workflow metrics including container metrics
342342
python3 .github/workflows/upload_complete_workflow_metrics.py
343343
344-
deploy-test-vllm:
344+
deploy-operator:
345345
runs-on: cpu-amd-m5-2xlarge
346346
if: needs.changed-files.outputs.has_code_changes == 'true'
347-
needs: [changed-files, operator, vllm]
348-
permissions:
349-
contents: read
350-
strategy:
351-
fail-fast: false
352-
matrix:
353-
profile:
354-
- agg
355-
- agg_router
356-
- disagg
357-
- disagg_router
358-
name: deploy-test-vllm (${{ matrix.profile }})
347+
needs: [changed-files, operator]
359348
env:
360-
FRAMEWORK: vllm
361349
DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
362-
DEPLOYMENT_FILE: "deploy/${{ matrix.profile }}.yaml"
363-
MODEL_NAME: "Qwen/Qwen3-0.6B"
364-
steps: &deploy-test-steps
350+
outputs:
351+
NAMESPACE: ${{ steps.deploy-operator-step.outputs.namespace }}
352+
steps:
365353
- uses: actions/checkout@v4
366-
- name: Set namespace
354+
- name: Deploy Operator
355+
id: deploy-operator-step
356+
env:
357+
BRANCH: ${{ github.ref_name }}
367358
run: |
368-
# Set namespace using FRAMEWORK env var
369-
PROFILE_SANITIZED="${{ matrix.profile }}"
370-
PROFILE_SANITIZED="${PROFILE_SANITIZED//_/-}"
371-
echo "NAMESPACE=gh-job-id-${{ github.run_id }}-${FRAMEWORK}-${PROFILE_SANITIZED}" >> $GITHUB_ENV
372-
373359
set -x
360+
361+
# Derive the shared namespace name from the branch and publish it as the
# step output `namespace`.
# Kubernetes namespace names must be RFC 1123 labels: lowercase
# alphanumerics and '-', at most 63 characters. Branch names can contain
# several '/', upper-case letters, '_' or '.', so normalise every character
# instead of replacing only the first '/'.
BRANCH_SANITIZED="$(printf '%s' "${BRANCH}" | tr '[:upper:]' '[:lower:]' | tr -c 'a-z0-9' '-')"
NS_PREFIX="gh-job-id-${{ github.run_id }}-"
NS_SUFFIX="-deploy-tests"
# Trim the branch part so that prefix + branch + suffix never exceeds 63
# characters; the fixed suffix keeps the name ending in an alphanumeric.
MAX_BRANCH_LEN=$((63 - ${#NS_PREFIX} - ${#NS_SUFFIX}))
BRANCH_SANITIZED="${BRANCH_SANITIZED:0:${MAX_BRANCH_LEN}}"
NAMESPACE="${NS_PREFIX}${BRANCH_SANITIZED}${NS_SUFFIX}"
echo "namespace=${NAMESPACE}" >> "$GITHUB_OUTPUT"
365+
374366
# Setup kubeconfig
375367
echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
376368
chmod 600 .kubeconfig
377369
export KUBECONFIG=$(pwd)/.kubeconfig
378370
kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
379371
kubectl config current-context
380-
- name: Deploy Operator
381-
run: |
382-
set -x
383-
export KUBECONFIG=$(pwd)/.kubeconfig
384372
385373
# Create a namespace for this job
386374
echo "Creating an ephemeral namespace..."
387-
kubectl delete namespace $NAMESPACE || true
388-
kubectl create namespace $NAMESPACE || true
375+
kubectl create namespace $NAMESPACE
389376
echo "Attaching the labels for secrets and cleanup"
390377
kubectl label namespaces ${NAMESPACE} nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true
391378
@@ -398,8 +385,6 @@ jobs:
398385
kubectl get storageclass
399386
400387
# Install Helm chart
401-
export IMAGE_TAG=$(cat build.env)
402-
echo $IMAGE_TAG
403388
export VIRTUAL_ENV=/opt/dynamo/venv
404389
export KUBE_NS=$NAMESPACE
405390
export ISTIO_ENABLED=true
@@ -424,8 +409,45 @@ jobs:
424409
--set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret
425410
# Wait for all deployments to be ready
426411
timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch
427-
cd -
428412
413+
deploy-test-vllm:
414+
runs-on: cpu-amd-m5-2xlarge
415+
if: needs.changed-files.outputs.has_code_changes == 'true'
416+
needs: [changed-files, deploy-operator, vllm]
417+
permissions:
418+
contents: read
419+
strategy:
420+
fail-fast: false
421+
matrix:
422+
profile:
423+
- agg
424+
- agg_router
425+
- disagg
426+
- disagg_router
427+
name: deploy-test-vllm (${{ matrix.profile }})
428+
env:
429+
FRAMEWORK: vllm
430+
DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
431+
DEPLOYMENT_FILE: "deploy/${{ matrix.profile }}.yaml"
432+
MODEL_NAME: "Qwen/Qwen3-0.6B"
433+
steps: &deploy-test-steps
434+
- uses: actions/checkout@v4
435+
- name: Setup Kubeconfig
436+
env:
437+
NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
438+
run: |
439+
set -x
440+
# Setup kubeconfig
441+
echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
442+
chmod 600 .kubeconfig
443+
export KUBECONFIG=$(pwd)/.kubeconfig
444+
kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
445+
kubectl config get-contexts
446+
- name: Run Tests
447+
env:
448+
NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
449+
run: |
450+
set -x
429451
export KUBECONFIG=$(pwd)/.kubeconfig
430452
kubectl config set-context --current --namespace=$NAMESPACE
431453
@@ -523,30 +545,25 @@ jobs:
523545
- name: Cleanup
524546
if: always()
525547
timeout-minutes: 5
548+
env:
549+
NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
550+
PROFILE: ${{ matrix.profile }}
526551
run: |
527-
echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
528-
chmod 600 .kubeconfig
552+
set -x
529553
export KUBECONFIG=$(pwd)/.kubeconfig
530-
kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
554+
kubectl config set-context --current --namespace=$NAMESPACE
531555
532-
# For debugging purposes, list all the resources before we uninstall
556+
# For debugging purposes, list all the resources before we delete
533557
kubectl get all
534558
535-
echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
536-
kubectl delete dynamographdeployments --all -n $NAMESPACE || true
537-
538-
# Uninstall the helm chart
539-
helm ls
540-
helm uninstall dynamo-platform || true
541-
542-
echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
543-
kubectl delete namespace $NAMESPACE || true
544-
echo "Namespace $NAMESPACE completed."
559+
echo "Deleting DynamoGraphDeployments for this job in namespace $NAMESPACE..."
# Hyphenate every underscore in the profile (the `//` form), not only the
# first one, so profiles with several underscores are handled as well.
# NOTE(review): assumes the deployment is named <framework>-<profile with
# hyphens>; confirm against the step that creates it.
PROFILE_SANITIZED="${PROFILE//_/-}"
# Best-effort: the deployment may never have been created if the test failed early.
kubectl delete dynamographdeployments "${FRAMEWORK}-${PROFILE_SANITIZED}" -n "${NAMESPACE}" || true
545562
546563
deploy-test-sglang:
547564
runs-on: cpu-amd-m5-2xlarge
548565
if: needs.changed-files.outputs.has_code_changes == 'true'
549-
needs: [changed-files, operator, sglang]
566+
needs: [changed-files, deploy-operator, sglang]
550567
permissions:
551568
contents: read
552569
strategy:
@@ -566,7 +583,7 @@ jobs:
566583
deploy-test-trtllm:
567584
runs-on: cpu-amd-m5-2xlarge
568585
if: needs.changed-files.outputs.has_code_changes == 'true'
569-
needs: [changed-files, operator, trtllm]
586+
needs: [changed-files, deploy-operator, trtllm]
570587
permissions:
571588
contents: read
572589
strategy:
@@ -584,3 +601,48 @@ jobs:
584601
DEPLOYMENT_FILE: "deploy/${{ matrix.profile }}.yaml"
585602
MODEL_NAME: "Qwen/Qwen3-0.6B"
586603
steps: *deploy-test-steps
604+
605+
# Tears down the shared namespace created by deploy-operator after all
# deploy-test jobs have finished, whether they passed, failed or were cancelled.
cleanup:
  runs-on: cpu-amd-m5-2xlarge
  # always() keeps the teardown running after failures and cancellations.
  # Skip it when deploy-operator itself was skipped: no namespace was created
  # and the NAMESPACE output would be empty.
  if: always() && needs.deploy-operator.result != 'skipped'
  needs: [changed-files, deploy-operator, deploy-test-trtllm, deploy-test-sglang, deploy-test-vllm]
  env:
    # Namespace name published by the deploy-operator job.
    NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
  steps:
    - uses: actions/checkout@v4
    - name: Setup Kubeconfig
      env:
        # Passed through the environment instead of being interpolated into
        # the script text.
        KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
      run: |
        # Decode the kubeconfig before enabling tracing so the secret is
        # never part of an xtrace line.
        echo "${KUBECONFIG_B64}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        set -x
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace="${NAMESPACE}" --kubeconfig "${KUBECONFIG}"
        kubectl config current-context
    - name: Cleanup
      timeout-minutes: 5
      run: |
        set -x
        # Reuse the kubeconfig written by the "Setup Kubeconfig" step.
        export KUBECONFIG=$(pwd)/.kubeconfig

        # deploy-operator can fail before it publishes a namespace; in that
        # case there is nothing to tear down.
        if [ -z "${NAMESPACE}" ]; then
          echo "No namespace was published by deploy-operator; nothing to clean up."
          exit 0
        fi

        # For debugging purposes, list all the resources before we uninstall.
        # Best-effort: a failed listing must not abort the teardown below.
        kubectl get all -n "${NAMESPACE}" || true

        echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
        kubectl delete dynamographdeployments --all -n "${NAMESPACE}" || true

        # Uninstall the helm chart
        helm ls --namespace "${NAMESPACE}" || true
        helm uninstall dynamo-platform --namespace "${NAMESPACE}" || true

        echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
        kubectl delete namespace "${NAMESPACE}" || true
        echo "Namespace $NAMESPACE completed."

0 commit comments

Comments
 (0)