
Commit e25066c

clubanderson and claude committed
feat(e2e): multi-model testing with 2 models in 2 namespaces, 1 shared WVA
Multi-model E2E Testing:
- Deploy 2 models in 2 namespaces with 1 shared WVA controller
- Model A in llm-d-inference-scheduler-pr-XXX
- Model B in llm-d-inference-scheduler-pr-XXX-b
- Shared WVA in llm-d-autoscaler-pr-XXX

Test Improvements:
- Move HPA retrieval before VA stabilization to know minReplicas
- Wait for VA to stabilize at exact minReplicas before load test
- Increase stabilization timeout to 5 minutes
- Route load through Istio gateway instead of direct vLLM service

CI Cleanup Behavior:
- Before tests: Clean up all PR namespaces for fresh start
- After successful tests: Clean up automatically
- After failed tests: Leave resources for debugging

Documentation:
- Add monitoring commands with PR number placeholder
- Document multi-model testing architecture
- Document CI cleanup behavior

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent e30d6bf commit e25066c

7 files changed: +766, -526 lines changed


.github/workflows/ci-e2e-openshift.yaml

Lines changed: 275 additions & 47 deletions
Large diffs are not rendered by default.

charts/workload-variant-autoscaler/templates/hpa.yaml

Lines changed: 1 addition & 0 deletions

```diff
@@ -36,6 +36,7 @@ spec:
       selector:
         matchLabels:
           variant_name: {{ printf "%s-decode" .Values.llmd.modelName }}
+          exported_namespace: {{ .Values.llmd.namespace }}
       target:
         type: AverageValue
         averageValue: {{ .Values.hpa.targetAverageValue | quote }}
```

internal/engines/saturation/engine.go

Lines changed: 7 additions & 2 deletions

```diff
@@ -177,10 +177,15 @@ func (e *Engine) optimize(ctx context.Context) error {
 		vaMap[va.GetScaleTargetName()] = &va
 	}
 
-	for modelID, modelVAs := range modelGroups {
+	for groupKey, modelVAs := range modelGroups {
+		// The groupKey is "modelID|namespace" - extract actual modelID from VAs
+		// All VAs in the group have the same modelID and namespace
+		modelID := modelVAs[0].Spec.ModelID
 		logger.Info("Processing model",
 			"modelID", modelID,
-			"variantCount", len(modelVAs))
+			"namespace", modelVAs[0].Namespace,
+			"variantCount", len(modelVAs),
+			"groupKey", groupKey)
 
 		// Collect metrics and populate CurrentAlloc for saturation-only mode
 		// This validates metrics availability and populates the VariantAutoscalings with CurrentAlloc
```
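
Since the group key is built as `modelID|namespace` and Kubernetes namespace names cannot contain `|`, the two parts could in principle be recovered by splitting the key; the loop above instead reads them back from the first VariantAutoscaling in the group, which avoids assuming anything about separators inside a model ID. A minimal standalone sketch of the split approach (the key value below is a hypothetical example, not taken from the repository):

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical group key in the "modelID|namespace" format described above.
	groupKey := "unsloth/Meta-Llama-3.1-8B|llm-d-inference-scheduler-pr-123"

	// strings.Cut splits at the first "|"; this assumes the model ID itself contains no "|".
	modelID, namespace, found := strings.Cut(groupKey, "|")
	if !found {
		fmt.Println("group key is not in modelID|namespace form")
		return
	}
	fmt.Println(modelID)   // unsloth/Meta-Llama-3.1-8B
	fmt.Println(namespace) // llm-d-inference-scheduler-pr-123
}
```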

internal/utils/variant.go

Lines changed: 7 additions & 3 deletions

```diff
@@ -52,14 +52,18 @@ func InactiveVariantAutoscalingByModel(ctx context.Context, client client.Client
 	return GroupVariantAutoscalingByModel(vas), nil
 }
 
-// GroupVariantAutoscalingByModel groups VariantAutoscalings by model ID
+// GroupVariantAutoscalingByModel groups VariantAutoscalings by model ID AND namespace.
+// This is necessary because the same model deployed in different namespaces
+// should be treated as separate scaling domains for saturation analysis.
+// The key format is "modelID|namespace" to ensure proper isolation.
 func GroupVariantAutoscalingByModel(
 	vas []wvav1alpha1.VariantAutoscaling,
 ) map[string][]wvav1alpha1.VariantAutoscaling {
 	groups := make(map[string][]wvav1alpha1.VariantAutoscaling)
 	for _, va := range vas {
-		modelID := va.Spec.ModelID
-		groups[modelID] = append(groups[modelID], va)
+		// Use modelID + namespace as key to isolate VAs in different namespaces
+		key := va.Spec.ModelID + "|" + va.Namespace
+		groups[key] = append(groups[key], va)
 	}
 	return groups
 }
```
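
To make the behavioral change concrete, here is a small self-contained sketch, using a simplified stand-in struct rather than the real `wvav1alpha1.VariantAutoscaling` type and made-up namespace names: two VariantAutoscalings for the same model in different namespaces now fall into two separate groups, whereas keying by model ID alone would have merged them.

```go
package main

import "fmt"

// va is a simplified stand-in for wvav1alpha1.VariantAutoscaling (illustration only).
type va struct {
	Namespace string
	ModelID   string
}

// groupByModelAndNamespace mirrors the keying used by GroupVariantAutoscalingByModel above.
func groupByModelAndNamespace(vas []va) map[string][]va {
	groups := make(map[string][]va)
	for _, v := range vas {
		key := v.ModelID + "|" + v.Namespace // "modelID|namespace"
		groups[key] = append(groups[key], v)
	}
	return groups
}

func main() {
	vas := []va{
		{Namespace: "ns-a", ModelID: "unsloth/Meta-Llama-3.1-8B"},
		{Namespace: "ns-b", ModelID: "unsloth/Meta-Llama-3.1-8B"},
	}

	groups := groupByModelAndNamespace(vas)
	fmt.Println(len(groups)) // 2: one scaling domain per namespace
	// Keying by model ID alone would have produced a single group of size 2,
	// mixing variants from both namespaces into one saturation analysis.
}
```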

test/e2e-openshift/README.md

Lines changed: 44 additions & 0 deletions

````diff
@@ -196,6 +196,50 @@ Load generation job completed successfully
 Test completed - scaled from 1 to 2 replicas
 ```
 
+## Monitoring CI Runs
+
+When the CI e2e tests are running on OpenShift, you can monitor the cluster resources using the following commands. Replace `<PR_NUMBER>` with your actual PR number:
+
+```bash
+# Watch WVA controller pods
+oc get pods,deploy -n llm-d-autoscaler-pr-<PR_NUMBER>
+
+# Watch Model A (primary namespace) - pods, deployments, VAs, and HPAs
+oc get pods,deploy,va,hpa -n llm-d-inference-scheduler-pr-<PR_NUMBER>
+
+# Watch Model B (secondary namespace) - pods, deployments, VAs, and HPAs
+oc get pods,deploy,va,hpa -n llm-d-inference-scheduler-pr-<PR_NUMBER>-b
+
+# Watch all VAs across namespaces with detailed status
+oc get va -A -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.metadata.name,OPTIMIZED:.status.desiredOptimizedAlloc.numReplicas,REPLICAS:.status.currentAlloc.numReplicas,RATE:.status.currentAlloc.load.arrivalRate'
+
+# Watch load generation jobs
+oc get jobs -n llm-d-inference-scheduler-pr-<PR_NUMBER>
+oc get jobs -n llm-d-inference-scheduler-pr-<PR_NUMBER>-b
+```
+
+### CI Cleanup Behavior
+
+The CI workflow handles cleanup as follows:
+- **Before tests**: All PR-specific namespaces are cleaned up to ensure a fresh start
+- **After successful tests**: Resources are cleaned up automatically
+- **After failed tests**: Resources are left in place for debugging
+
+To manually trigger a run with cleanup disabled after success, use `SKIP_CLEANUP=true`.
+
+## Multi-Model Testing
+
+The CI tests deploy 2 models in 2 separate namespaces with 1 shared WVA controller:
+
+- **Model A**: Full llm-d stack in `llm-d-inference-scheduler-pr-<PR>`
+- **Model B**: Full llm-d stack in `llm-d-inference-scheduler-pr-<PR>-b`
+- **Shared WVA**: Single controller in `llm-d-autoscaler-pr-<PR>` managing both
+
+This validates:
+1. Multi-namespace WVA controller operation
+2. Independent scaling for each model
+3. Namespace isolation for metrics and HPA
+
 ## Troubleshooting
 
 ### Test Fails: Infrastructure Not Ready
````

test/e2e-openshift/e2e_suite_test.go

Lines changed: 41 additions & 9 deletions

```diff
@@ -43,14 +43,14 @@ var (
 	controllerNamespace = getEnvString("CONTROLLER_NAMESPACE", "workload-variant-autoscaler-system")
 	monitoringNamespace = getEnvString("MONITORING_NAMESPACE", "openshift-user-workload-monitoring")
 	llmDNamespace       = getEnvString("LLMD_NAMESPACE", "llm-d-inference-scheduler")
-	gatewayName         = getEnvString("GATEWAY_NAME", "infra-inference-scheduling-inference-gateway")
-	modelID             = getEnvString("MODEL_ID", "unsloth/Meta-Llama-3.1-8B")
-	deployment          = getEnvString("DEPLOYMENT", "ms-inference-scheduling-llm-d-modelservice-decode")
-	requestRate         = getEnvInt("REQUEST_RATE", 20)
-	numPrompts          = getEnvInt("NUM_PROMPTS", 3000)
-	// WVA_RELEASE_NAME is used to filter resources for the current test run
-	// This prevents conflicts when multiple runs exist simultaneously
-	wvaReleaseName = getEnvString("WVA_RELEASE_NAME", "")
+	// Secondary llm-d namespace for Model B (multi-model testing)
+	llmDNamespaceB = getEnvString("LLMD_NAMESPACE_B", "")
+	gatewayName    = getEnvString("GATEWAY_NAME", "infra-inference-scheduling-inference-gateway")
+	modelID        = getEnvString("MODEL_ID", "unsloth/Meta-Llama-3.1-8B")
+	deployment     = getEnvString("DEPLOYMENT", "ms-inference-scheduling-llm-d-modelservice-decode")
+	requestRate    = getEnvInt("REQUEST_RATE", 20)
+	numPrompts     = getEnvInt("NUM_PROMPTS", 3000)
+	multiModelMode = llmDNamespaceB != ""
 )
 
 var (
@@ -136,6 +136,9 @@ var _ = BeforeSuite(func() {
 	_, _ = fmt.Fprintf(GinkgoWriter, "CONTROLLER_NAMESPACE=%s\n", controllerNamespace)
 	_, _ = fmt.Fprintf(GinkgoWriter, "MONITORING_NAMESPACE=%s\n", monitoringNamespace)
 	_, _ = fmt.Fprintf(GinkgoWriter, "LLMD_NAMESPACE=%s\n", llmDNamespace)
+	if multiModelMode {
+		_, _ = fmt.Fprintf(GinkgoWriter, "LLMD_NAMESPACE_B=%s (multi-model mode enabled)\n", llmDNamespaceB)
+	}
 	_, _ = fmt.Fprintf(GinkgoWriter, "GATEWAY_NAME=%s\n", gatewayName)
 	_, _ = fmt.Fprintf(GinkgoWriter, "MODEL_ID=%s\n", modelID)
 	_, _ = fmt.Fprintf(GinkgoWriter, "DEPLOYMENT=%s\n", deployment)
@@ -156,7 +159,7 @@ var _ = BeforeSuite(func() {
 		}
 	}, 2*time.Minute, 1*time.Second).Should(Succeed())
 
-	By("verifying that llm-d infrastructure is running")
+	By("verifying that llm-d infrastructure (Model A1) is running")
 	Eventually(func(g Gomega) {
 		// Check Gateway
 		deploymentList, err := k8sClient.AppsV1().Deployments(llmDNamespace).List(ctx, metav1.ListOptions{})
@@ -173,6 +176,35 @@ var _ = BeforeSuite(func() {
 		g.Expect(vllmDeployment.Status.ReadyReplicas).To(BeNumerically(">", 0), "At least one vLLM replica should be ready")
 	}, 5*time.Minute, 5*time.Second).Should(Succeed())
 
+	// Verify multi-model infrastructure if enabled
+	if multiModelMode {
+		By("verifying that Model B infrastructure is running")
+		Eventually(func(g Gomega) {
+			// Check that Model B namespace has deployments
+			deploymentList, err := k8sClient.AppsV1().Deployments(llmDNamespaceB).List(ctx, metav1.ListOptions{})
+			g.Expect(err).NotTo(HaveOccurred(), "Should be able to list deployments in Model B namespace")
+			g.Expect(deploymentList.Items).NotTo(BeEmpty(), "Model B deployments should exist")
+
+			// Check that Model B vLLM deployment exists
+			vllmDeployment, err := k8sClient.AppsV1().Deployments(llmDNamespaceB).Get(ctx, deployment, metav1.GetOptions{})
+			g.Expect(err).NotTo(HaveOccurred(), "Model B vLLM deployment should exist")
+			g.Expect(vllmDeployment.Status.ReadyReplicas).To(BeNumerically(">", 0), "At least one Model B replica should be ready")
+		}, 5*time.Minute, 5*time.Second).Should(Succeed())
+
+		By("verifying WVA resources for all models")
+		Eventually(func(g Gomega) {
+			// Check that VariantAutoscaling resources exist for all models
+			vaList := &v1alpha1.VariantAutoscalingList{}
+			err := crClient.List(ctx, vaList, client.MatchingLabels{
+				"app.kubernetes.io/name": "workload-variant-autoscaler",
+			})
+			g.Expect(err).NotTo(HaveOccurred(), "Should be able to list VariantAutoscalings")
+			// Expect at least 2 VAs: Model A1 and Model B
+			_, _ = fmt.Fprintf(GinkgoWriter, "Found %d VariantAutoscaling resources\n", len(vaList.Items))
+			g.Expect(len(vaList.Items)).To(BeNumerically(">=", 2), "Should have at least 2 VariantAutoscaling resources for multi-model mode")
+		}, 2*time.Minute, 5*time.Second).Should(Succeed())
+	}
+
 	By("verifying that Prometheus Adapter is running")
 	Eventually(func(g Gomega) {
 		podList, err := k8sClient.CoreV1().Pods(monitoringNamespace).List(ctx, metav1.ListOptions{
```
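
The `var` block above depends on `getEnvString` and `getEnvInt` helpers that are not part of this diff. Assuming they follow the usual pattern (return the environment value when set, otherwise the default), the sketch below shows why multi-model mode is opt-in: leaving `LLMD_NAMESPACE_B` unset keeps `multiModelMode` false and the suite behaves as before. This is an assumed shape, not the repository's actual implementation.

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

// getEnvString returns the value of key, or def when the variable is unset or empty.
// Assumed helper shape; the real helper lives elsewhere in the test suite.
func getEnvString(key, def string) string {
	if v := os.Getenv(key); v != "" {
		return v
	}
	return def
}

// getEnvInt returns the integer value of key, or def when unset or not a valid integer.
// Assumed helper shape; the real helper lives elsewhere in the test suite.
func getEnvInt(key string, def int) int {
	if v := os.Getenv(key); v != "" {
		if n, err := strconv.Atoi(v); err == nil {
			return n
		}
	}
	return def
}

func main() {
	llmDNamespaceB := getEnvString("LLMD_NAMESPACE_B", "")
	requestRate := getEnvInt("REQUEST_RATE", 20)
	multiModelMode := llmDNamespaceB != ""
	// Without LLMD_NAMESPACE_B exported, this prints: false 20
	fmt.Println(multiModelMode, requestRate)
}
```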
