
Commit e25066c

clubanderson and claude committed
feat(e2e): multi-model testing with 2 models in 2 namespaces, 1 shared WVA
Multi-model E2E Testing:
- Deploy 2 models in 2 namespaces with 1 shared WVA controller
- Model A in llm-d-inference-scheduler-pr-XXX
- Model B in llm-d-inference-scheduler-pr-XXX-b
- Shared WVA in llm-d-autoscaler-pr-XXX

Test Improvements:
- Move HPA retrieval before VA stabilization to know minReplicas
- Wait for VA to stabilize at exact minReplicas before load test
- Increase stabilization timeout to 5 minutes
- Route load through Istio gateway instead of direct vLLM service

CI Cleanup Behavior:
- Before tests: Clean up all PR namespaces for fresh start
- After successful tests: Clean up automatically
- After failed tests: Leave resources for debugging

Documentation:
- Add monitoring commands with PR number placeholder
- Document multi-model testing architecture
- Document CI cleanup behavior

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent e30d6bf commit e25066c

7 files changed: +766, -526 lines changed


.github/workflows/ci-e2e-openshift.yaml

Lines changed: 275 additions & 47 deletions
Large diffs are not rendered by default.

charts/workload-variant-autoscaler/templates/hpa.yaml

Lines changed: 1 addition & 0 deletions

```diff
@@ -36,6 +36,7 @@ spec:
       selector:
         matchLabels:
           variant_name: {{ printf "%s-decode" .Values.llmd.modelName }}
+          exported_namespace: {{ .Values.llmd.namespace }}
       target:
         type: AverageValue
         averageValue: {{ .Values.hpa.targetAverageValue | quote }}
```

internal/engines/saturation/engine.go

Lines changed: 7 additions & 2 deletions

```diff
@@ -177,10 +177,15 @@ func (e *Engine) optimize(ctx context.Context) error {
 		vaMap[va.GetScaleTargetName()] = &va
 	}
 
-	for modelID, modelVAs := range modelGroups {
+	for groupKey, modelVAs := range modelGroups {
+		// The groupKey is "modelID|namespace" - extract actual modelID from VAs
+		// All VAs in the group have the same modelID and namespace
+		modelID := modelVAs[0].Spec.ModelID
 		logger.Info("Processing model",
 			"modelID", modelID,
-			"variantCount", len(modelVAs))
+			"namespace", modelVAs[0].Namespace,
+			"variantCount", len(modelVAs),
+			"groupKey", groupKey)
 
 		// Collect metrics and populate CurrentAlloc for saturation-only mode
 		// This validates metrics availability and populates the VariantAutoscalings with CurrentAlloc
```
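
Since the group key is built as `modelID|namespace` and Kubernetes namespace names cannot contain `|`, the two parts could in principle be recovered by splitting the key; the loop above instead reads them back from the first VariantAutoscaling in the group, which avoids assuming anything about separators inside a model ID. A minimal standalone sketch of the split approach (the key value below is a hypothetical example, not taken from the repository):

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical group key in the "modelID|namespace" format described above.
	groupKey := "unsloth/Meta-Llama-3.1-8B|llm-d-inference-scheduler-pr-123"

	// strings.Cut splits at the first "|"; this assumes the model ID itself contains no "|".
	modelID, namespace, found := strings.Cut(groupKey, "|")
	if !found {
		fmt.Println("group key is not in modelID|namespace form")
		return
	}
	fmt.Println(modelID)   // unsloth/Meta-Llama-3.1-8B
	fmt.Println(namespace) // llm-d-inference-scheduler-pr-123
}
```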

internal/utils/variant.go

Lines changed: 7 additions & 3 deletions

```diff
@@ -52,14 +52,18 @@ func InactiveVariantAutoscalingByModel(ctx context.Context, client client.Client
 	return GroupVariantAutoscalingByModel(vas), nil
 }
 
-// GroupVariantAutoscalingByModel groups VariantAutoscalings by model ID
+// GroupVariantAutoscalingByModel groups VariantAutoscalings by model ID AND namespace.
+// This is necessary because the same model deployed in different namespaces
+// should be treated as separate scaling domains for saturation analysis.
+// The key format is "modelID|namespace" to ensure proper isolation.
 func GroupVariantAutoscalingByModel(
 	vas []wvav1alpha1.VariantAutoscaling,
 ) map[string][]wvav1alpha1.VariantAutoscaling {
 	groups := make(map[string][]wvav1alpha1.VariantAutoscaling)
 	for _, va := range vas {
-		modelID := va.Spec.ModelID
-		groups[modelID] = append(groups[modelID], va)
+		// Use modelID + namespace as key to isolate VAs in different namespaces
+		key := va.Spec.ModelID + "|" + va.Namespace
+		groups[key] = append(groups[key], va)
 	}
 	return groups
 }
```
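
To make the behavioral change concrete, here is a small self-contained sketch, using a simplified stand-in struct rather than the real `wvav1alpha1.VariantAutoscaling` type and made-up namespace names: two VariantAutoscalings for the same model in different namespaces now fall into two separate groups, whereas keying by model ID alone would have merged them.

```go
package main

import "fmt"

// va is a simplified stand-in for wvav1alpha1.VariantAutoscaling (illustration only).
type va struct {
	Namespace string
	ModelID   string
}

// groupByModelAndNamespace mirrors the keying used by GroupVariantAutoscalingByModel above.
func groupByModelAndNamespace(vas []va) map[string][]va {
	groups := make(map[string][]va)
	for _, v := range vas {
		key := v.ModelID + "|" + v.Namespace // "modelID|namespace"
		groups[key] = append(groups[key], v)
	}
	return groups
}

func main() {
	vas := []va{
		{Namespace: "ns-a", ModelID: "unsloth/Meta-Llama-3.1-8B"},
		{Namespace: "ns-b", ModelID: "unsloth/Meta-Llama-3.1-8B"},
	}

	groups := groupByModelAndNamespace(vas)
	fmt.Println(len(groups)) // 2: one scaling domain per namespace
	// Keying by model ID alone would have produced a single group of size 2,
	// mixing variants from both namespaces into one saturation analysis.
}
```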

test/e2e-openshift/README.md

Lines changed: 44 additions & 0 deletions

````diff
@@ -196,6 +196,50 @@ Load generation job completed successfully
 Test completed - scaled from 1 to 2 replicas
 ```
 
+## Monitoring CI Runs
+
+When the CI e2e tests are running on OpenShift, you can monitor the cluster resources using the following commands. Replace `<PR_NUMBER>` with your actual PR number:
+
+```bash
+# Watch WVA controller pods
+oc get pods,deploy -n llm-d-autoscaler-pr-<PR_NUMBER>
+
+# Watch Model A (primary namespace) - pods, deployments, VAs, and HPAs
+oc get pods,deploy,va,hpa -n llm-d-inference-scheduler-pr-<PR_NUMBER>
+
+# Watch Model B (secondary namespace) - pods, deployments, VAs, and HPAs
+oc get pods,deploy,va,hpa -n llm-d-inference-scheduler-pr-<PR_NUMBER>-b
+
+# Watch all VAs across namespaces with detailed status
+oc get va -A -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.metadata.name,OPTIMIZED:.status.desiredOptimizedAlloc.numReplicas,REPLICAS:.status.currentAlloc.numReplicas,RATE:.status.currentAlloc.load.arrivalRate'
+
+# Watch load generation jobs
+oc get jobs -n llm-d-inference-scheduler-pr-<PR_NUMBER>
+oc get jobs -n llm-d-inference-scheduler-pr-<PR_NUMBER>-b
+```
+
+### CI Cleanup Behavior
+
+The CI workflow handles cleanup as follows:
+- **Before tests**: All PR-specific namespaces are cleaned up to ensure a fresh start
+- **After successful tests**: Resources are cleaned up automatically
+- **After failed tests**: Resources are left in place for debugging
+
+To manually trigger a run with cleanup disabled after success, use `SKIP_CLEANUP=true`.
+
+## Multi-Model Testing
+
+The CI tests deploy 2 models in 2 separate namespaces with 1 shared WVA controller:
+
+- **Model A**: Full llm-d stack in `llm-d-inference-scheduler-pr-<PR>`
+- **Model B**: Full llm-d stack in `llm-d-inference-scheduler-pr-<PR>-b`
+- **Shared WVA**: Single controller in `llm-d-autoscaler-pr-<PR>` managing both
+
+This validates:
+1. Multi-namespace WVA controller operation
+2. Independent scaling for each model
+3. Namespace isolation for metrics and HPA
+
 ## Troubleshooting
 
 ### Test Fails: Infrastructure Not Ready
````

test/e2e-openshift/e2e_suite_test.go

Lines changed: 41 additions & 9 deletions

```diff
@@ -43,14 +43,14 @@ var (
 	controllerNamespace = getEnvString("CONTROLLER_NAMESPACE", "workload-variant-autoscaler-system")
 	monitoringNamespace = getEnvString("MONITORING_NAMESPACE", "openshift-user-workload-monitoring")
 	llmDNamespace       = getEnvString("LLMD_NAMESPACE", "llm-d-inference-scheduler")
-	gatewayName         = getEnvString("GATEWAY_NAME", "infra-inference-scheduling-inference-gateway")
-	modelID             = getEnvString("MODEL_ID", "unsloth/Meta-Llama-3.1-8B")
-	deployment          = getEnvString("DEPLOYMENT", "ms-inference-scheduling-llm-d-modelservice-decode")
-	requestRate         = getEnvInt("REQUEST_RATE", 20)
-	numPrompts          = getEnvInt("NUM_PROMPTS", 3000)
-	// WVA_RELEASE_NAME is used to filter resources for the current test run
-	// This prevents conflicts when multiple runs exist simultaneously
-	wvaReleaseName = getEnvString("WVA_RELEASE_NAME", "")
+	// Secondary llm-d namespace for Model B (multi-model testing)
+	llmDNamespaceB = getEnvString("LLMD_NAMESPACE_B", "")
+	gatewayName    = getEnvString("GATEWAY_NAME", "infra-inference-scheduling-inference-gateway")
+	modelID        = getEnvString("MODEL_ID", "unsloth/Meta-Llama-3.1-8B")
+	deployment     = getEnvString("DEPLOYMENT", "ms-inference-scheduling-llm-d-modelservice-decode")
+	requestRate    = getEnvInt("REQUEST_RATE", 20)
+	numPrompts     = getEnvInt("NUM_PROMPTS", 3000)
+	multiModelMode = llmDNamespaceB != ""
 )
 
 var (
@@ -136,6 +136,9 @@ var _ = BeforeSuite(func() {
 	_, _ = fmt.Fprintf(GinkgoWriter, "CONTROLLER_NAMESPACE=%s\n", controllerNamespace)
 	_, _ = fmt.Fprintf(GinkgoWriter, "MONITORING_NAMESPACE=%s\n", monitoringNamespace)
 	_, _ = fmt.Fprintf(GinkgoWriter, "LLMD_NAMESPACE=%s\n", llmDNamespace)
+	if multiModelMode {
+		_, _ = fmt.Fprintf(GinkgoWriter, "LLMD_NAMESPACE_B=%s (multi-model mode enabled)\n", llmDNamespaceB)
+	}
 	_, _ = fmt.Fprintf(GinkgoWriter, "GATEWAY_NAME=%s\n", gatewayName)
 	_, _ = fmt.Fprintf(GinkgoWriter, "MODEL_ID=%s\n", modelID)
 	_, _ = fmt.Fprintf(GinkgoWriter, "DEPLOYMENT=%s\n", deployment)
@@ -156,7 +159,7 @@ var _ = BeforeSuite(func() {
 		}
 	}, 2*time.Minute, 1*time.Second).Should(Succeed())
 
-	By("verifying that llm-d infrastructure is running")
+	By("verifying that llm-d infrastructure (Model A1) is running")
 	Eventually(func(g Gomega) {
 		// Check Gateway
 		deploymentList, err := k8sClient.AppsV1().Deployments(llmDNamespace).List(ctx, metav1.ListOptions{})
@@ -173,6 +176,35 @@ var _ = BeforeSuite(func() {
 		g.Expect(vllmDeployment.Status.ReadyReplicas).To(BeNumerically(">", 0), "At least one vLLM replica should be ready")
 	}, 5*time.Minute, 5*time.Second).Should(Succeed())
 
+	// Verify multi-model infrastructure if enabled
+	if multiModelMode {
+		By("verifying that Model B infrastructure is running")
+		Eventually(func(g Gomega) {
+			// Check that Model B namespace has deployments
+			deploymentList, err := k8sClient.AppsV1().Deployments(llmDNamespaceB).List(ctx, metav1.ListOptions{})
+			g.Expect(err).NotTo(HaveOccurred(), "Should be able to list deployments in Model B namespace")
+			g.Expect(deploymentList.Items).NotTo(BeEmpty(), "Model B deployments should exist")
+
+			// Check that Model B vLLM deployment exists
+			vllmDeployment, err := k8sClient.AppsV1().Deployments(llmDNamespaceB).Get(ctx, deployment, metav1.GetOptions{})
+			g.Expect(err).NotTo(HaveOccurred(), "Model B vLLM deployment should exist")
+			g.Expect(vllmDeployment.Status.ReadyReplicas).To(BeNumerically(">", 0), "At least one Model B replica should be ready")
+		}, 5*time.Minute, 5*time.Second).Should(Succeed())
+
+		By("verifying WVA resources for all models")
+		Eventually(func(g Gomega) {
+			// Check that VariantAutoscaling resources exist for all models
+			vaList := &v1alpha1.VariantAutoscalingList{}
+			err := crClient.List(ctx, vaList, client.MatchingLabels{
+				"app.kubernetes.io/name": "workload-variant-autoscaler",
+			})
+			g.Expect(err).NotTo(HaveOccurred(), "Should be able to list VariantAutoscalings")
+			// Expect at least 2 VAs: Model A1 and Model B
+			_, _ = fmt.Fprintf(GinkgoWriter, "Found %d VariantAutoscaling resources\n", len(vaList.Items))
+			g.Expect(len(vaList.Items)).To(BeNumerically(">=", 2), "Should have at least 2 VariantAutoscaling resources for multi-model mode")
+		}, 2*time.Minute, 5*time.Second).Should(Succeed())
+	}
+
 	By("verifying that Prometheus Adapter is running")
 	Eventually(func(g Gomega) {
 		podList, err := k8sClient.CoreV1().Pods(monitoringNamespace).List(ctx, metav1.ListOptions{
```
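
The `var` block above depends on `getEnvString` and `getEnvInt` helpers that are not part of this diff. Assuming they follow the usual pattern (return the environment value when set, otherwise the default), the sketch below shows why multi-model mode is opt-in: leaving `LLMD_NAMESPACE_B` unset keeps `multiModelMode` false and the suite behaves as before. This is an assumed shape, not the repository's actual implementation.

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

// getEnvString returns the value of key, or def when the variable is unset or empty.
// Assumed helper shape; the real helper lives elsewhere in the test suite.
func getEnvString(key, def string) string {
	if v := os.Getenv(key); v != "" {
		return v
	}
	return def
}

// getEnvInt returns the integer value of key, or def when unset or not a valid integer.
// Assumed helper shape; the real helper lives elsewhere in the test suite.
func getEnvInt(key string, def int) int {
	if v := os.Getenv(key); v != "" {
		if n, err := strconv.Atoi(v); err == nil {
			return n
		}
	}
	return def
}

func main() {
	llmDNamespaceB := getEnvString("LLMD_NAMESPACE_B", "")
	requestRate := getEnvInt("REQUEST_RATE", 20)
	multiModelMode := llmDNamespaceB != ""
	// Without LLMD_NAMESPACE_B exported, this prints: false 20
	fmt.Println(multiModelMode, requestRate)
}
```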
