Fix VA stabilization to wait for exact minReplicas before load test

clubanderson · claude · clubanderson · commit 97bd05a47c19 · 2025-12-22T09:55:22.000-05:00
- Move HPA retrieval before the VA stabilization check so that minReplicas is known
- Change the stabilization check from `>= 1` to `== hpaMinReplicas`
- Increase the stabilization timeout to 5 minutes, polling at 10s intervals
- This ensures the initial state is captured at baseline, not at residual scale
- Disable cleanup for testing (SKIP_CLEANUP now defaults to 'true' in the CI workflow)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/.github/workflows/ci-e2e-openshift.yaml b/.github/workflows/ci-e2e-openshift.yaml
@@ -131,7 +131,7 @@ jobs:
       NUM_PROMPTS: ${{ github.event.inputs.num_prompts || '3000' }}
       MAX_NUM_SEQS: ${{ github.event.inputs.max_num_seqs || '1' }}
       HPA_STABILIZATION_SECONDS: ${{ github.event.inputs.hpa_stabilization_seconds || '30' }}
-      SKIP_CLEANUP: ${{ github.event.inputs.skip_cleanup || 'false' }}
+      SKIP_CLEANUP: ${{ github.event.inputs.skip_cleanup || 'true' }}
       # PR-specific namespaces for isolation between concurrent PR tests
       # Primary llm-d namespace (Model A1 + A2)
       LLMD_NAMESPACE: llm-d-inference-scheduler-pr-${{ github.event.pull_request.number || github.run_id }}
diff --git a/test/e2e-openshift/sharegpt_scaleup_test.go b/test/e2e-openshift/sharegpt_scaleup_test.go
@@ -145,6 +145,38 @@ var _ = Describe("ShareGPT Scale-Up Test", Ordered, func() {
 				initialReplicas = deploy.Status.ReadyReplicas
 				_, _ = fmt.Fprintf(GinkgoWriter, "Initial ready replicas: %d\n", initialReplicas)
 
+				// Get HPA first to know minReplicas for VA stabilization check
+				By("verifying HPA exists and getting minReplicas")
+				hpaList, err := k8sClient.AutoscalingV2().HorizontalPodAutoscalers(model.namespace).List(ctx, metav1.ListOptions{
+					LabelSelector: "app.kubernetes.io/name=workload-variant-autoscaler",
+				})
+				Expect(err).NotTo(HaveOccurred(), "Should be able to list HPAs")
+				Expect(hpaList.Items).NotTo(BeEmpty(), "At least one WVA HPA should exist")
+
+				// Select the HPA that targets the expected deployment
+				var hpa *autoscalingv2.HorizontalPodAutoscaler
+				for i := range hpaList.Items {
+					if hpaList.Items[i].Spec.ScaleTargetRef.Name == model.deployment {
+						hpa = &hpaList.Items[i]
+						break
+					}
+				}
+				Expect(hpa).NotTo(BeNil(), "An HPA targeting deployment %s should exist", model.deployment)
+				hpaName = hpa.Name
+				hpaMinReplicas = *hpa.Spec.MinReplicas
+				_, _ = fmt.Fprintf(GinkgoWriter, "Found HPA: %s (targets %s, minReplicas=%d)\n", hpaName, model.deployment, hpaMinReplicas)
+
+				Expect(hpa.Spec.Metrics).To(HaveLen(1), "HPA should have one metric")
+				Expect(hpa.Spec.Metrics[0].Type).To(Equal(autoscalingv2.ExternalMetricSourceType), "HPA should use external metrics")
+				Expect(hpa.Spec.Metrics[0].External.Metric.Name).To(Equal(constants.InfernoDesiredReplicas), "HPA should use inferno_desired_replicas metric")
+
+				By("verifying gateway service exists for load routing")
+				// Traffic goes through the Istio gateway to be properly routed via InferencePool/EPP
+				// The gateway service is created by the llm-d-infra chart
+				gatewaySvc, err := k8sClient.CoreV1().Services(model.namespace).Get(ctx, model.gatewayService, metav1.GetOptions{})
+				Expect(err).NotTo(HaveOccurred(), "Gateway service %s should exist in namespace %s", model.gatewayService, model.namespace)
+				_, _ = fmt.Fprintf(GinkgoWriter, "Found gateway service: %s (ClusterIP: %s)\n", gatewaySvc.Name, gatewaySvc.Spec.ClusterIP)
+
 				By("recording initial VariantAutoscaling state")
 				vaList := &v1alpha1.VariantAutoscalingList{}
 				err = crClient.List(ctx, vaList, client.InNamespace(model.namespace), client.MatchingLabels{
@@ -166,7 +198,7 @@ var _ = Describe("ShareGPT Scale-Up Test", Ordered, func() {
 				_, _ = fmt.Fprintf(GinkgoWriter, "Found VariantAutoscaling: %s (targets %s)\n", vaName, model.deployment)
 
 				// Wait for VA to stabilize at minReplicas before recording initial state
-				// This ensures we're measuring scale-up from load, not initial startup
+				// This ensures we're measuring scale-up from load, not residual scale from prior activity
 				By("waiting for VA to stabilize at minReplicas")
 				Eventually(func(g Gomega) {
 					currentVA := &v1alpha1.VariantAutoscaling{}
@@ -176,9 +208,9 @@ var _ = Describe("ShareGPT Scale-Up Test", Ordered, func() {
 					}, currentVA)
 					g.Expect(err).NotTo(HaveOccurred())
 					optimized := int32(currentVA.Status.DesiredOptimizedAlloc.NumReplicas)
-					_, _ = fmt.Fprintf(GinkgoWriter, "Waiting for VA to stabilize: optimized=%d, minReplicas=%d\n", optimized, hpaMinReplicas)
-					g.Expect(optimized).To(BeNumerically(">=", 1), "VA should have at least 1 optimized replica")
-				}, 2*time.Minute, 5*time.Second).Should(Succeed())
+					_, _ = fmt.Fprintf(GinkgoWriter, "Waiting for VA to stabilize: optimized=%d, target minReplicas=%d\n", optimized, hpaMinReplicas)
+					g.Expect(optimized).To(Equal(hpaMinReplicas), "VA should stabilize at minReplicas before load test")
+				}, 5*time.Minute, 10*time.Second).Should(Succeed())
 
 				// Re-read VA to get stabilized state
 				err = crClient.Get(ctx, client.ObjectKey{
@@ -188,39 +220,6 @@ var _ = Describe("ShareGPT Scale-Up Test", Ordered, func() {
 				Expect(err).NotTo(HaveOccurred())
 				initialOptimized = int32(va.Status.DesiredOptimizedAlloc.NumReplicas)
 				_, _ = fmt.Fprintf(GinkgoWriter, "Initial optimized replicas (after stabilization): %d\n", initialOptimized)
-
-				By("verifying HPA exists and is configured correctly")
-				hpaList, err := k8sClient.AutoscalingV2().HorizontalPodAutoscalers(model.namespace).List(ctx, metav1.ListOptions{
-					LabelSelector: "app.kubernetes.io/name=workload-variant-autoscaler",
-				})
-				Expect(err).NotTo(HaveOccurred(), "Should be able to list HPAs")
-				Expect(hpaList.Items).NotTo(BeEmpty(), "At least one WVA HPA should exist")
-
-				// Select the HPA that targets the expected deployment
-				var hpa *autoscalingv2.HorizontalPodAutoscaler
-				for i := range hpaList.Items {
-					if hpaList.Items[i].Spec.ScaleTargetRef.Name == model.deployment {
-						hpa = &hpaList.Items[i]
-						break
-					}
-				}
-				Expect(hpa).NotTo(BeNil(), "An HPA targeting deployment %s should exist", model.deployment)
-				hpaName = hpa.Name
-				_, _ = fmt.Fprintf(GinkgoWriter, "Found HPA: %s (targets %s)\n", hpaName, model.deployment)
-
-				By("verifying gateway service exists for load routing")
-				// Traffic goes through the Istio gateway to be properly routed via InferencePool/EPP
-				// The gateway service is created by the llm-d-infra chart
-				gatewaySvc, err := k8sClient.CoreV1().Services(model.namespace).Get(ctx, model.gatewayService, metav1.GetOptions{})
-				Expect(err).NotTo(HaveOccurred(), "Gateway service %s should exist in namespace %s", model.gatewayService, model.namespace)
-				_, _ = fmt.Fprintf(GinkgoWriter, "Found gateway service: %s (ClusterIP: %s)\n", gatewaySvc.Name, gatewaySvc.Spec.ClusterIP)
-
-				Expect(hpa.Spec.Metrics).To(HaveLen(1), "HPA should have one metric")
-				Expect(hpa.Spec.Metrics[0].Type).To(Equal(autoscalingv2.ExternalMetricSourceType), "HPA should use external metrics")
-				Expect(hpa.Spec.Metrics[0].External.Metric.Name).To(Equal(constants.InfernoDesiredReplicas), "HPA should use inferno_desired_replicas metric")
-
-				hpaMinReplicas = *hpa.Spec.MinReplicas
-				_, _ = fmt.Fprintf(GinkgoWriter, "HPA minReplicas: %d\n", hpaMinReplicas)
 			})
 
 			It("should verify external metrics API is accessible", func() {