@@ -608,6 +608,15 @@ jobs:
           VLLM_MAX_NUM_SEQS: ${{ env.MAX_NUM_SEQS }}
           # Decode replicas for e2e testing (start with 1 replica, let HPA scale)
           DECODE_REPLICAS: "1"
+          # OpenShift uses its built-in user-workload monitoring stack rather than a custom monitoring namespace
+          MONITORING_NAMESPACE: openshift-user-workload-monitoring
+          # Disable bearer token auth on the WVA /metrics endpoint: OpenShift's
+          # user-workload-monitoring cannot authenticate with the controller-manager
+          # SA token. The endpoint is still only accessible within the cluster network.
+          WVA_METRICS_SECURE: "false"
+          # The inference-scheduling guide ships with the routing proxy disabled, so vLLM
+          # serves directly on port 8000 (not on 8200 behind the proxy)
+          VLLM_SVC_PORT: "8000"
         run: |
           echo "Deploying WVA and llm-d infrastructure..."
           echo " MODEL_ID: $MODEL_ID"
@@ -639,9 +648,38 @@ jobs:
       - name: Wait for infrastructure to be ready
         run: |
           echo "Waiting for WVA controller to be ready..."
-          kubectl wait --for=condition=available --timeout=300s deployment -l app.kubernetes.io/name=workload-variant-autoscaler -n "$WVA_NAMESPACE" || true
+          kubectl rollout status deployment -l app.kubernetes.io/name=workload-variant-autoscaler -n "$WVA_NAMESPACE" --timeout=300s || true
           kubectl get pods -n "$WVA_NAMESPACE"
-          echo "Waiting for llm-d deployment (Model A1) to be ready..."
+
+          # Ensure the vLLM deployment has the correct replica count.
+          # A previous failed run's "Scale down GPU workloads" step may have set replicas=0,
+          # and helmfile doesn't override manually changed replicas on re-deploy.
+          # kubectl rollout status returns instantly on 0-replica deployments, so we must
+          # ensure replicas > 0 before waiting.
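+          # Note: DECODE_REPLICAS is set in the deploy step's own env block, and
+          # step-level env vars don't carry across steps, so the :-1 fallback below
+          # is likely what takes effect here unless it is also set at the job level.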
+          DESIRED_REPLICAS="${DECODE_REPLICAS:-1}"
+          CURRENT_REPLICAS=$(kubectl get deployment ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "0")
+          if [ "$CURRENT_REPLICAS" -eq 0 ]; then
+            echo "WARNING: Model A1 deployment has 0 replicas (likely from a previous failed run's cleanup)"
+            echo "Scaling to $DESIRED_REPLICAS replica(s)..."
+            kubectl scale deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" --replicas="$DESIRED_REPLICAS" || {
+              echo "ERROR: Failed to scale Model A1 deployment"
+              exit 1
+            }
+          fi
+
+          echo "Waiting for Model A1 vLLM deployment to be ready (up to 25 minutes for model loading)..."
+          # kubectl rollout status waits for all replicas to be Ready, unlike
+          # --for=condition=available, which is satisfied even at 0 ready replicas.
+          # vLLM model loading takes 15-20 minutes, so we use a 25-minute timeout.
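+          # rollout status exits non-zero on timeout, so the || { ... } block below
+          # dumps diagnostics; the block's last command succeeds, so the step continues.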
+          kubectl rollout status deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" --timeout=1500s || {
+            echo "WARNING: Model A1 deployment not ready after 25 minutes"
+            echo "=== Pod status ==="
+            kubectl get pods -n "$LLMD_NAMESPACE"
+            echo "=== Deployment conditions ==="
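+            # kubectl's jsonpath prints maps and lists as JSON, so the conditions
+            # array can be piped straight into jq (assumes a reasonably recent kubectl)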
+            kubectl get deployment ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" -o jsonpath='{.status.conditions}' | jq . || true
+            echo "=== Recent events ==="
+            kubectl get events -n "$LLMD_NAMESPACE" --sort-by='.lastTimestamp' | tail -20
+          }
           kubectl get pods -n "$LLMD_NAMESPACE"
 
       - name: Deploy Model B infrastructure in secondary namespace
@@ -666,6 +704,11 @@ jobs:
           VLLM_MAX_NUM_SEQS: ${{ env.MAX_NUM_SEQS }}
           # Decode replicas for e2e testing (start with 1 replica, let HPA scale)
           DECODE_REPLICAS: "1"
+          # OpenShift monitoring settings (same as Model A1 deploy)
+          MONITORING_NAMESPACE: openshift-user-workload-monitoring
+          WVA_METRICS_SECURE: "false"
+          # Same port as Model A1 (inference-scheduling guide, proxy disabled)
+          VLLM_SVC_PORT: "8000"
         run: |
           echo "Deploying Model B infrastructure in $LLMD_NAMESPACE_B..."
           echo " MODEL_ID: $MODEL_ID"
@@ -676,8 +719,9 @@ jobs:
           # Deploy llm-d infrastructure only (no WVA controller, no VA/HPA)
           ./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --environment openshift
 
-          echo "Waiting for Model B deployment to be ready..."
-          kubectl wait --for=condition=available --timeout=300s deployment --all -n "$LLMD_NAMESPACE_B" || true
+          echo "Waiting for Model B deployment to start (initial rollout)..."
+          # Give helm a moment to create the deployment objects; see the note below
+          # on where the full readiness wait happens.
+          sleep 10
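+          # The full readiness wait (rollout status, 25-minute timeout) happens in the
+          # "Wait for Model B to be ready" step below, after the WVA resources deploy.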
           kubectl get pods -n "$LLMD_NAMESPACE_B"
 
       - name: Deploy Model B WVA resources
@@ -707,12 +751,41 @@ jobs:
             --set va.accelerator="$ACCELERATOR_TYPE" \
             --set wva.baseName="inference-scheduling" \
             --set wva.prometheus.monitoringNamespace=openshift-user-workload-monitoring \
+            --set wva.metrics.secure=false \
+            --set vllmService.port=8000 \
+            --set vllmService.targetPort=8000 \
             --set wva.controllerInstance="$CONTROLLER_INSTANCE"
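+          # The three new --set flags above mirror the Model A1 env additions
+          # (secure metrics disabled, vLLM serving on port 8000).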
 
           echo "Model B WVA resources deployed"
           kubectl get hpa -n "$LLMD_NAMESPACE_B" -l app.kubernetes.io/instance="$MODEL_B_RELEASE" || true
           kubectl get variantautoscaling -n "$LLMD_NAMESPACE_B" -l app.kubernetes.io/instance="$MODEL_B_RELEASE" || true
 
+      - name: Wait for Model B to be ready
+        run: |
+          # Same fix as for Model A1: ensure replicas > 0 before waiting for the rollout
+          DESIRED_REPLICAS="${DECODE_REPLICAS:-1}"
+          CURRENT_REPLICAS=$(kubectl get deployment ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE_B" -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "0")
+          if [ "$CURRENT_REPLICAS" -eq 0 ]; then
+            echo "WARNING: Model B deployment has 0 replicas (likely from a previous failed run's cleanup)"
+            echo "Scaling to $DESIRED_REPLICAS replica(s)..."
+            kubectl scale deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE_B" --replicas="$DESIRED_REPLICAS" || {
+              echo "ERROR: Failed to scale Model B deployment"
+              exit 1
+            }
+          fi
+
+          echo "Waiting for Model B vLLM deployment to be ready (up to 25 minutes for model loading)..."
+          # Same as Model A1: use rollout status to wait for actual pod readiness.
+          kubectl rollout status deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE_B" --timeout=1500s || {
+            echo "WARNING: Model B deployment not ready after 25 minutes"
+            echo "=== Pod status ==="
+            kubectl get pods -n "$LLMD_NAMESPACE_B"
+            echo "=== Deployment conditions ==="
+            kubectl get deployment ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE_B" -o jsonpath='{.status.conditions}' | jq . || true
+            echo "=== Recent events ==="
+            kubectl get events -n "$LLMD_NAMESPACE_B" --sort-by='.lastTimestamp' | tail -20
+          }
+
       - name: Verify multi-model deployment
         run: |
           echo "=== Multi-Model Deployment Status ==="
@@ -730,6 +803,117 @@ jobs:
           echo "=== WVA Controller ($WVA_NAMESPACE) ==="
           kubectl get pods -n "$WVA_NAMESPACE"
 
+      - name: Verify metrics pipeline
+        run: |
+          echo "=== Verifying metrics pipeline before running tests ==="
+          echo ""
+
+          # 1. Verify vLLM pods are serving the /metrics endpoint
+          echo "--- Step 1: Checking vLLM /metrics endpoint ---"
+          for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
+            VLLM_POD=$(kubectl get pods -n "$ns" -l llm-d.ai/inference-serving=true -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
+            if [ -n "$VLLM_POD" ]; then
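+              # VLLM_SVC_PORT was defined on the deploy steps' env blocks, so the
+              # :-8000 fallback is likely what applies in this step.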
+              PORT="${VLLM_SVC_PORT:-8000}"
+              echo " Checking vLLM pod $VLLM_POD in $ns (port $PORT)..."
+              METRICS=$(kubectl exec -n "$ns" "$VLLM_POD" -- curl -s "http://localhost:${PORT}/metrics" 2>/dev/null | head -5 || true)
+              if [ -n "$METRICS" ]; then
+                echo " ✅ vLLM metrics endpoint responding in $ns"
+              else
+                echo " ⚠️ vLLM metrics endpoint not responding in $ns (may still be loading)"
+              fi
+              # Show pod labels for debugging
+              echo " Pod labels:"
+              kubectl get pod "$VLLM_POD" -n "$ns" -o jsonpath='{.metadata.labels}' | jq -r 'to_entries[] | " \(.key)=\(.value)"' 2>/dev/null || true
+            else
+              echo " ⚠️ No vLLM pods found with label llm-d.ai/inference-serving=true in $ns"
+              echo " All pods in $ns:"
+              kubectl get pods -n "$ns" --show-labels 2>/dev/null || true
+            fi
+          done
+
+          # 1b. Verify vllm-service has endpoints (critical for ServiceMonitor scraping)
+          echo ""
+          echo "--- Step 1b: Checking vllm-service endpoints ---"
+          for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
+            SVC_NAME=$(kubectl get svc -n "$ns" -l app.kubernetes.io/name=workload-variant-autoscaler -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
+            if [ -n "$SVC_NAME" ]; then
+              ENDPOINTS=$(kubectl get endpoints "$SVC_NAME" -n "$ns" -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)
+              if [ -n "$ENDPOINTS" ]; then
+                echo " ✅ Service $SVC_NAME in $ns has endpoints: $ENDPOINTS"
+              else
+                echo " ❌ Service $SVC_NAME in $ns has NO endpoints (label selector mismatch!)"
+                echo " Service selector:"
+                kubectl get svc "$SVC_NAME" -n "$ns" -o jsonpath='{.spec.selector}' 2>/dev/null | jq . || true
+              fi
+            else
+              echo " ⚠️ No vllm-service found in $ns"
+            fi
+          done
+
+          # 1c. Check PodMonitors (the llm-d guide deploys these for direct pod scraping)
+          echo ""
+          echo "--- Step 1c: PodMonitor configuration ---"
+          for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
+            PM_COUNT=$(kubectl get podmonitor -n "$ns" --no-headers 2>/dev/null | wc -l | tr -d ' ')
+            echo " PodMonitors in $ns: $PM_COUNT"
+            kubectl get podmonitor -n "$ns" 2>/dev/null || true
+          done
+
+          # 2. Check WVA controller health
+          echo ""
+          echo "--- Step 2: WVA controller status ---"
+          kubectl get pods -n "$WVA_NAMESPACE" -l app.kubernetes.io/name=workload-variant-autoscaler
+          WVA_POD=$(kubectl get pods -n "$WVA_NAMESPACE" -l app.kubernetes.io/name=workload-variant-autoscaler -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
+          if [ -n "$WVA_POD" ]; then
+            echo " Recent WVA controller logs:"
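+            # grep exits non-zero when nothing matches, which would fail the step under
+            # the runner's default errexit bash; the || echo keeps this check advisory.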
+            kubectl logs "$WVA_POD" -n "$WVA_NAMESPACE" --tail=20 | grep -E "reconcil|metrics|error|saturation" || echo " (no matching log lines)"
+          fi
+
+          # 3. Check VariantAutoscaling status
+          echo ""
+          echo "--- Step 3: VariantAutoscaling status ---"
+          kubectl get variantautoscaling -A -o wide 2>/dev/null || echo " No VariantAutoscalings found"
+
+          # 4. Check ServiceMonitors exist
+          echo ""
+          echo "--- Step 4: ServiceMonitor configuration ---"
+          for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B" "$WVA_NAMESPACE"; do
+            SM_COUNT=$(kubectl get servicemonitor -n "$ns" --no-headers 2>/dev/null | wc -l | tr -d ' ')
+            echo " ServiceMonitors in $ns: $SM_COUNT"
+            kubectl get servicemonitor -n "$ns" 2>/dev/null || true
+          done
+
+          # 5. Wait for WVA to start processing metrics (up to 3 minutes)
+          echo ""
+          echo "--- Step 5: Waiting for WVA to detect metrics (up to 3 minutes) ---"
+          METRICS_READY=false
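+          # 18 attempts x 10s sleep = the "up to 3 minutes" promised above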
+          for i in $(seq 1 18); do
+            VA_STATUS=$(kubectl get variantautoscaling -n "$LLMD_NAMESPACE" -o jsonpath='{.items[0].status.desiredOptimizedAlloc.accelerator}' 2>/dev/null || true)
+            if [ -n "$VA_STATUS" ]; then
+              echo " ✅ WVA optimization active, accelerator: $VA_STATUS"
+              METRICS_READY=true
+              break
+            fi
+            echo " Attempt $i/18: WVA not yet optimizing, waiting 10s..."
+            sleep 10
+          done
+
+          if [ "$METRICS_READY" = "false" ]; then
+            echo " ⚠️ WVA has not started optimizing after 3 minutes"
+            echo " This may cause test timeouts; dumping diagnostics:"
+            echo ""
+            echo " === WVA controller logs (last 50 lines) ==="
+            kubectl logs "$WVA_POD" -n "$WVA_NAMESPACE" --tail=50 2>/dev/null || true
+            echo ""
+            echo " === HPA status ==="
+            kubectl get hpa -A 2>/dev/null || true
+            echo ""
+            echo " Continuing to tests anyway (they have their own timeouts)..."
+          fi
+
+          echo ""
+          echo "=== Metrics pipeline verification complete ==="
+
       - name: Install Go dependencies
         run: go mod download
 