Skip to content

Commit f5ec004

Browse files
clubanderson and claude committed
fix: resolve KEDA APIService conflict for external metrics
Three fixes for the nightly E2E test failures:

1. DEFAULT_MODEL_ID: Update from Qwen/Qwen3-32B to Qwen/Qwen3-0.6B to match the llm-d repo's current default model. The stale value caused the yq replacement to fail silently, so WVA queried metrics for the wrong model.

2. WVA_METRICS_SECURE: Add an env var to control bearer token auth on the WVA /metrics endpoint. OpenShift's user-workload-monitoring cannot authenticate with the controller-manager SA token.

3. KEDA APIService conflict: On clusters with KEDA, the v1beta1.external.metrics.k8s.io APIService points to KEDA's metrics server, which only serves ScaledObject metrics. After deploying Prometheus Adapter, detect and patch the APIService to point to Prometheus Adapter instead.

Supersedes #720.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Signed-off-by: Andrew Anderson <andy@clubanderson.com>
1 parent 652fa2f commit f5ec004

File tree

1 file changed

+29
-2
lines changed

1 file changed

+29
-2
lines changed

deploy/install.sh

Lines changed: 29 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,7 @@ INSTALL_GATEWAY_CTRLPLANE_ORIGINAL="${INSTALL_GATEWAY_CTRLPLANE:-}"
6868
INSTALL_GATEWAY_CTRLPLANE="${INSTALL_GATEWAY_CTRLPLANE:-false}"
6969

7070
# Model and SLO Configuration
71-
DEFAULT_MODEL_ID=${DEFAULT_MODEL_ID:-"Qwen/Qwen3-32B"}
71+
DEFAULT_MODEL_ID=${DEFAULT_MODEL_ID:-"Qwen/Qwen3-0.6B"}
7272
MODEL_ID=${MODEL_ID:-"unsloth/Meta-Llama-3.1-8B"}
7373
ACCELERATOR_TYPE=${ACCELERATOR_TYPE:-"H100"}
7474
SLO_TPOT=${SLO_TPOT:-10} # Target time-per-output-token SLO (in ms)
@@ -96,6 +96,8 @@ HPA_STABILIZATION_SECONDS=${HPA_STABILIZATION_SECONDS:-240}
9696
HPA_MIN_REPLICAS=${HPA_MIN_REPLICAS:-1}
9797
SKIP_CHECKS=${SKIP_CHECKS:-false}
9898
E2E_TESTS_ENABLED=${E2E_TESTS_ENABLED:-false}
99+
# WVA metrics endpoint security (set false to disable bearer token auth on /metrics)
100+
WVA_METRICS_SECURE=${WVA_METRICS_SECURE:-true}
99101
# vLLM max-num-seqs (max concurrent sequences per replica, lower = easier to saturate for testing)
100102
VLLM_MAX_NUM_SEQS=${VLLM_MAX_NUM_SEQS:-""}
101103
# Decode replicas override (useful for e2e testing with limited GPUs)
@@ -454,8 +456,9 @@ deploy_wva_controller() {
454456
--set wva.logging.level=$WVA_LOG_LEVEL \
455457
--set wva.prometheus.tls.insecureSkipVerify=$SKIP_TLS_VERIFY \
456458
--set wva.namespaceScoped=$NAMESPACE_SCOPED \
459+
--set wva.metrics.secure=$WVA_METRICS_SECURE \
457460
${CONTROLLER_INSTANCE:+--set wva.controllerInstance=$CONTROLLER_INSTANCE}
458-
461+
459462
# Wait for WVA to be ready
460463
log_info "Waiting for WVA controller to be ready..."
461464
kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=workload-variant-autoscaler -n $WVA_NS --timeout=60s || \
@@ -877,6 +880,30 @@ deploy_prometheus_adapter() {
877880
}
878881

879882
log_success "Prometheus Adapter deployment initiated (may still be starting)"
883+
884+
# On clusters with KEDA, the v1beta1.external.metrics.k8s.io APIService may
885+
# point to KEDA's metrics server instead of Prometheus Adapter. KEDA's server
886+
# only serves metrics for ScaledObjects, not arbitrary external metrics like
887+
# wva_desired_replicas. Detect and fix this conflict.
888+
local current_svc
889+
current_svc=$(kubectl get apiservice v1beta1.external.metrics.k8s.io -o jsonpath='{.spec.service.name}' 2>/dev/null || echo "")
890+
891+
if [ -n "$current_svc" ] && [ "$current_svc" != "prometheus-adapter" ]; then
892+
log_warning "external.metrics.k8s.io APIService points to '$current_svc' (likely KEDA)"
893+
log_info "Patching APIService to point to Prometheus Adapter in $MONITORING_NAMESPACE"
894+
kubectl patch apiservice v1beta1.external.metrics.k8s.io --type=merge -p "{
895+
\"spec\": {
896+
\"insecureSkipTLSVerify\": true,
897+
\"service\": {
898+
\"name\": \"prometheus-adapter\",
899+
\"namespace\": \"$MONITORING_NAMESPACE\"
900+
}
901+
}
902+
}" && log_success "APIService patched to use Prometheus Adapter" \
903+
|| log_warning "Failed to patch external.metrics.k8s.io APIService — HPA may not work"
904+
else
905+
log_info "external.metrics.k8s.io APIService already points to prometheus-adapter"
906+
fi
880907
}
881908

882909
verify_deployment() {

0 commit comments

Comments
 (0)