Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions deploy/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,59 @@ log_error() {
exit 1
}

# APIService guard: background loop that continuously ensures the
# v1beta1.external.metrics.k8s.io APIService points to prometheus-adapter.
# On clusters with KEDA, the operator continuously reconciles the APIService
# back to keda-metrics-apiserver, breaking HPA scaling for WVA.
# This guard checks it every 10 seconds and re-patches on drift, without modifying KEDA itself.
APISERVICE_GUARD_PID=""

start_apiservice_guard() {
local monitoring_ns="$1"
log_info "Starting APIService guard (background re-patch loop every 10s)"
(
while true; do
sleep 10
current_svc=$(kubectl get apiservice v1beta1.external.metrics.k8s.io \
-o jsonpath='{.spec.service.name}' 2>/dev/null || echo "")
current_ns=$(kubectl get apiservice v1beta1.external.metrics.k8s.io \
-o jsonpath='{.spec.service.namespace}' 2>/dev/null || echo "")
if [ "$current_svc" != "prometheus-adapter" ] || [ "$current_ns" != "$monitoring_ns" ]; then
echo "[apiservice-guard] KEDA reclaimed APIService (now: $current_svc/$current_ns), re-patching to prometheus-adapter/$monitoring_ns"
Copy link

Copilot AI Feb 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This log line hard-codes “KEDA reclaimed APIService”, but the condition triggers for any mismatch (e.g., a different external-metrics provider or an empty service/namespace when the APIService is temporarily unavailable). Consider making the message provider-agnostic (e.g., “APIService drift detected”) and include the observed service/namespace without attributing it to KEDA unless KEDA is explicitly detected.

Suggested change
echo "[apiservice-guard] KEDA reclaimed APIService (now: $current_svc/$current_ns), re-patching to prometheus-adapter/$monitoring_ns"
echo "[apiservice-guard] APIService drift detected: v1beta1.external.metrics.k8s.io now points to $current_svc/$current_ns; re-patching to prometheus-adapter/$monitoring_ns"

Copilot uses AI. Check for mistakes.
kubectl patch apiservice v1beta1.external.metrics.k8s.io --type=merge -p "{
\"spec\": {
\"insecureSkipTLSVerify\": true,
\"service\": {
\"name\": \"prometheus-adapter\",
\"namespace\": \"$monitoring_ns\"
}
}
}" 2>/dev/null || true
fi
done
) &
APISERVICE_GUARD_PID=$!
echo "$APISERVICE_GUARD_PID" > /tmp/apiservice-guard.pid
log_success "APIService guard started (PID: $APISERVICE_GUARD_PID)"
Comment on lines +170 to +172
Copy link

Copilot AI Feb 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The guard PID is written to a fixed path (/tmp/apiservice-guard.pid). This can collide across concurrent runs on the same runner and can become stale; since PIDs are reused, later stop_apiservice_guard invocations risk killing an unrelated process. Consider making the pidfile unique per run (e.g., include $$ / a unique suffix) and validating the process identity before killing.

Copilot uses AI. Check for mistakes.
}

# Stop the background APIService guard loop, if one is running.
#
# Looks for the guard in two places, in order:
#   1. $APISERVICE_GUARD_PID, set when the guard was started from this shell.
#   2. The pidfile /tmp/apiservice-guard.pid, used when the guard was started
#      by an earlier invocation of the script.
#
# Globals:
#   APISERVICE_GUARD_PID  (read; always reset to "" on return)
# Files:
#   /tmp/apiservice-guard.pid  (read as fallback; always removed)
# Returns:
#   0 — every step is best-effort so this is safe to call from cleanup paths.
stop_apiservice_guard() {
    local pidfile="/tmp/apiservice-guard.pid"
    local pid=""

    # ${...:-} keeps this safe under `set -u` when the global was never set.
    if [ -n "${APISERVICE_GUARD_PID:-}" ] && kill -0 "$APISERVICE_GUARD_PID" 2>/dev/null; then
        log_info "Stopping APIService guard (PID: $APISERVICE_GUARD_PID)"
        kill "$APISERVICE_GUARD_PID" 2>/dev/null || true
        # The guard is a child of this shell, so reap it to avoid a zombie.
        wait "$APISERVICE_GUARD_PID" 2>/dev/null || true
    elif [ -f "$pidfile" ]; then
        # The file may vanish between the -f test and the read; treat as empty.
        pid=$(cat "$pidfile" 2>/dev/null) || pid=""

        # The pidfile lives at a fixed, world-writable location, so its content
        # cannot be trusted. Accept only a plain decimal PID greater than 1:
        # "0" would signal this script's whole process group, "1" is init, and
        # anything non-numeric is garbage.
        case "$pid" in
            '' | *[!0-9]*) pid="" ;;
        esac
        if [ -n "$pid" ] && [ "$pid" -gt 1 ] 2>/dev/null && kill -0 "$pid" 2>/dev/null; then
            # NOTE(review): a stale pidfile can still name an unrelated process
            # that reused the PID; process identity is not verified here.
            log_info "Stopping APIService guard (PID: $pid from pidfile)"
            kill "$pid" 2>/dev/null || true
        fi
    fi

    rm -f "$pidfile"
    APISERVICE_GUARD_PID=""
}

print_help() {
cat <<EOF
Usage: $(basename "$0") [OPTIONS]
Expand Down Expand Up @@ -1019,6 +1072,12 @@ deploy_prometheus_adapter() {
}" && log_success "APIService patched to use Prometheus Adapter" \
|| log_warning "Failed to patch external.metrics.k8s.io APIService — HPA may not work"
fi

# Start background guard to prevent KEDA from reclaiming the APIService.
# KEDA's operator continuously reconciles the APIService back to its own
# metrics server within ~2 minutes of any patch. The guard checks it every
# 10 seconds and re-patches on drift, without modifying KEDA itself.
start_apiservice_guard "$MONITORING_NAMESPACE"
Comment on lines 1074 to +1080
Copy link

Copilot AI Feb 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

start_apiservice_guard is started whenever the APIService exists, even if it already points at prometheus-adapter/$MONITORING_NAMESPACE. If the guard is only intended to mitigate KEDA’s reconciliation, consider starting it only when the APIService is observed pointing somewhere else (or when it’s specifically pointing to keda-metrics-apiserver) to avoid running an always-on background loop on clusters that don’t need it.

Suggested change
fi
# Start background guard to prevent KEDA from reclaiming the APIService.
# KEDA's operator continuously reconciles the APIService back to its own
# metrics server within ~2 minutes of any patch. The guard re-patches it
# every 10 seconds without modifying KEDA itself.
start_apiservice_guard "$MONITORING_NAMESPACE"
# Start background guard to prevent KEDA from reclaiming the APIService.
# KEDA's operator continuously reconciles the APIService back to its own
# metrics server within ~2 minutes of any patch. The guard re-patches it
# every 10 seconds without modifying KEDA itself.
start_apiservice_guard "$MONITORING_NAMESPACE"
fi

Copilot uses AI. Check for mistakes.
Comment on lines +1076 to +1080
Copy link

Copilot AI Feb 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

start_apiservice_guard is started here, but the normal deployment path never calls stop_apiservice_guard before main exits. In a non-interactive script, this background loop can outlive the script and continue patching the cluster unexpectedly (and may interfere with later CI cleanup that deletes the adapter/namespace). Consider giving the guard an explicit lifecycle (e.g., start only for E2E runs and add a corresponding stop action invoked by CI cleanup, or stop it automatically on script exit when persistence isn’t required).

Copilot uses AI. Check for mistakes.
else
log_warning "external.metrics.k8s.io APIService not found — skipping patch"
fi
Expand Down Expand Up @@ -1182,6 +1241,10 @@ print_summary() {
# Undeployment functions
undeploy_prometheus_adapter() {
log_info "Uninstalling Prometheus Adapter..."

# Stop the APIService guard if running
stop_apiservice_guard

helm uninstall prometheus-adapter -n $MONITORING_NAMESPACE 2>/dev/null || \
log_warning "Prometheus Adapter not found or already uninstalled"

Expand Down Expand Up @@ -1257,6 +1320,9 @@ cleanup() {
log_info "======================================"
echo ""

# Stop the APIService guard if running (safety net)
stop_apiservice_guard

# Undeploy environment-specific components (Prometheus, etc.)
if [ "$DEPLOY_PROMETHEUS" = "true" ]; then
undeploy_prometheus_stack
Expand Down
4 changes: 2 additions & 2 deletions test/e2e-openshift/scale_to_zero_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,8 @@ retention_period: %s`, retentionPeriod),
// and gracefully skip if scale-to-zero cannot be validated.

scaledToZero := false
// Wait for retention period (3m) + buffer (2m) = 5m total
deadline := time.Now().Add(5 * time.Minute)
// Wait for retention period (3m) + buffer (7m) = 10m total
deadline := time.Now().Add(10 * time.Minute)

for time.Now().Before(deadline) {
va := &v1alpha1.VariantAutoscaling{}
Expand Down
2 changes: 1 addition & 1 deletion test/e2e-openshift/sharegpt_scaleup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ var _ = Describe("ShareGPT Scale-Up Test", Ordered, func() {
_, _ = fmt.Fprintf(GinkgoWriter, "External metrics API response (selector: %s): %s\n",
hpaMetricSelector, truncateString(resultStr, 500))
}
}, 5*time.Minute, 5*time.Second).Should(Succeed())
}, 10*time.Minute, 5*time.Second).Should(Succeed())
})

It("should create and run parallel load generation jobs", func() {
Expand Down
Loading