Skip to content

Commit e19ac57

Browse files
clubandersonclaude
andcommitted
πŸ› Fix flaky Model B e2e-openshift HPA scale-up test
The HPA monitoring timeout (5 min) was too tight — Model A already takes ~3.7 min for HPA to reflect VA recommendations through the external metrics pipeline (Prometheus → Adapter → HPA). Model B, running second, had insufficient margin. Changes: - Increase HPA scale-up monitoring timeout from 5 to 8 minutes - Enhance external metrics API check to use exact HPA label selectors (including controller_instance), catching label propagation issues that a bare query would miss - Add diagnostic external metrics queries and HPA condition logging during the HPA monitoring loop to aid debugging future failures - Add truncateString helper for log output Signed-off-by: Andy Anderson <andy@clubanderson.com> Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: Andrew Anderson <andy@clubanderson.com>
1 parent 2254ec9 commit e19ac57

File tree

1 file changed

+57
-8
lines changed

1 file changed

+57
-8
lines changed

test/e2e-openshift/sharegpt_scaleup_test.go

Lines changed: 57 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,14 @@ func sanitizeK8sName(name string) string {
5656

5757
var lowLoad = numPrompts <= 2000 && requestRate <= 8
5858

59+
// truncateString shortens s for log output. If s is at most n bytes long it is
// returned unchanged; otherwise the result is a prefix of s no longer than n
// bytes followed by "...".
//
// The limit is measured in bytes, not runes. When byte n falls inside a
// multi-byte UTF-8 sequence the cut point is moved back to the start of that
// sequence so the result never ends in a partial character. A negative n is
// treated as 0 instead of panicking on an invalid slice bound.
func truncateString(s string, n int) string {
	if n < 0 {
		n = 0
	}
	if len(s) <= n {
		return s
	}

	// len(s) > n here, so s[cut] is always in range. A UTF-8 continuation byte
	// has the bit pattern 10xxxxxx; step back over at most three of them (the
	// longest encoding is four bytes) to land on a rune boundary.
	cut := n
	for back := 0; back < 3 && cut > 0 && s[cut]&0xC0 == 0x80; back++ {
		cut--
	}
	return s[:cut] + "..."
}
66+
5967
// Load generation configuration constants
6068
// These values were tuned empirically to achieve ~2-3 replica scale-up without excessive scaling.
6169
// Original values (baseLoadWorkers=10, batchSize=50, batchSleepDuration=0.1) caused cascade
@@ -156,7 +164,8 @@ var _ = Describe("ShareGPT Scale-Up Test", Ordered, func() {
156164
vaName string
157165
scaledReplicas int32
158166
scaledOptimized int32
159-
scaledLoadWorkers int // Load workers scaled to initial replicas
167+
scaledLoadWorkers int // Load workers scaled to initial replicas
168+
hpaMetricSelector string // Label selector matching the HPA's external metric query
160169
jobCompletionTimeout = 10 * time.Minute
161170
)
162171

@@ -232,6 +241,17 @@ var _ = Describe("ShareGPT Scale-Up Test", Ordered, func() {
232241
Expect(hpa.Spec.Metrics[0].Type).To(Equal(autoscalingv2.ExternalMetricSourceType), "HPA should use external metrics")
233242
Expect(hpa.Spec.Metrics[0].External.Metric.Name).To(Equal(constants.WVADesiredReplicas), "HPA should use wva_desired_replicas metric")
234243

244+
// Extract the HPA's metric label selector for diagnostic external metrics queries
245+
// This allows us to query the external metrics API with the exact same labels the HPA uses
246+
if hpa.Spec.Metrics[0].External.Metric.Selector != nil {
247+
var selectorParts []string
248+
for k, v := range hpa.Spec.Metrics[0].External.Metric.Selector.MatchLabels {
249+
selectorParts = append(selectorParts, fmt.Sprintf("%s=%s", k, v))
250+
}
251+
hpaMetricSelector = strings.Join(selectorParts, ",")
252+
_, _ = fmt.Fprintf(GinkgoWriter, "HPA metric selector: %s\n", hpaMetricSelector)
253+
}
254+
235255
By("verifying gateway service exists for load routing")
236256
// Traffic goes through the Istio gateway to be properly routed via InferencePool/EPP
237257
// The gateway service is created by the llm-d-infra chart
@@ -309,15 +329,23 @@ var _ = Describe("ShareGPT Scale-Up Test", Ordered, func() {
309329
})
310330

311331
It("should verify external metrics API is accessible", func() {
312-
By("querying external metrics API for wva_desired_replicas")
332+
By("querying external metrics API for wva_desired_replicas with exact HPA label selectors")
313333
Eventually(func(g Gomega) {
314-
result, err := k8sClient.RESTClient().
334+
// Query with the exact label selectors the HPA uses (including controller_instance
335+
// if set). This catches label propagation issues that a bare query would miss.
336+
req := k8sClient.RESTClient().
315337
Get().
316-
AbsPath("/apis/external.metrics.k8s.io/v1beta1/namespaces/" + model.namespace + "/" + constants.WVADesiredReplicas).
317-
DoRaw(ctx)
338+
AbsPath("/apis/external.metrics.k8s.io/v1beta1/namespaces/" + model.namespace + "/" + constants.WVADesiredReplicas)
339+
if hpaMetricSelector != "" {
340+
req = req.Param("labelSelector", hpaMetricSelector)
341+
}
342+
result, err := req.DoRaw(ctx)
318343
g.Expect(err).NotTo(HaveOccurred(), "Should be able to query external metrics API")
319-
g.Expect(string(result)).To(ContainSubstring(constants.WVADesiredReplicas), "Metric should be available")
320-
g.Expect(string(result)).To(ContainSubstring(vaName), "Metric should be for the correct variant")
344+
resultStr := string(result)
345+
g.Expect(resultStr).To(ContainSubstring(constants.WVADesiredReplicas), "Metric should be available")
346+
g.Expect(resultStr).To(ContainSubstring(vaName), "Metric should be for the correct variant")
347+
_, _ = fmt.Fprintf(GinkgoWriter, "External metrics API response (selector: %s): %s\n",
348+
hpaMetricSelector, truncateString(resultStr, 500))
321349
}, 5*time.Minute, 5*time.Second).Should(Succeed())
322350
})
323351

@@ -488,12 +516,33 @@ exit 1`,
488516
_, _ = fmt.Fprintf(GinkgoWriter, "HPA desiredReplicas: %d, currentReplicas: %d\n",
489517
hpa.Status.DesiredReplicas, hpa.Status.CurrentReplicas)
490518

519+
// Log HPA conditions for diagnostic insight (e.g., ScalingActive, AbleToScale)
520+
for _, cond := range hpa.Status.Conditions {
521+
if cond.Status != "True" || cond.Type == autoscalingv2.ScalingActive {
522+
_, _ = fmt.Fprintf(GinkgoWriter, " HPA condition %s=%s: %s\n",
523+
cond.Type, cond.Status, cond.Message)
524+
}
525+
}
526+
527+
// Diagnostic: query external metrics API with exact HPA labels to see what the adapter returns
528+
if hpaMetricSelector != "" {
529+
if result, qErr := k8sClient.RESTClient().
530+
Get().
531+
AbsPath("/apis/external.metrics.k8s.io/v1beta1/namespaces/" + model.namespace + "/" + constants.WVADesiredReplicas).
532+
Param("labelSelector", hpaMetricSelector).
533+
DoRaw(ctx); qErr == nil {
534+
_, _ = fmt.Fprintf(GinkgoWriter, " External metric (HPA labels): %s\n", truncateString(string(result), 300))
535+
} else {
536+
_, _ = fmt.Fprintf(GinkgoWriter, " External metric query error: %v\n", qErr)
537+
}
538+
}
539+
491540
if !lowLoad {
492541
// HPA should also desire more replicas than initial
493542
g.Expect(hpa.Status.DesiredReplicas).To(BeNumerically(">", initialOptimized),
494543
fmt.Sprintf("HPA should desire more replicas than initial (desired: %d, initial: %d)", hpa.Status.DesiredReplicas, initialOptimized))
495544
}
496-
}, 5*time.Minute, 10*time.Second).Should(Succeed())
545+
}, 8*time.Minute, 10*time.Second).Should(Succeed())
497546

498547
_, _ = fmt.Fprintf(GinkgoWriter, "WVA detected load and recommended %d replicas (up from %d)\n", scaledOptimized, initialOptimized)
499548
})

0 commit comments

Comments
(0)