From 85398c9cdcecf7d94f0c38ede04285a263392e42 Mon Sep 17 00:00:00 2001
From: Your Name <conliu@google.com>
Date: Fri, 19 Sep 2025 10:42:27 -0700
Subject: [PATCH] Consolidate ha config into a single enableLeaderElection,
 also fix rolling update stuck bug

---
 config/charts/inferencepool/README.md         | 34 ++++++++++++-------
 .../templates/epp-deployment.yaml             | 30 ++++++++++++----
 .../charts/inferencepool/templates/gke.yaml   |  4 +++
 3 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md
index b6629d2b8..9a8be09f9 100644
--- a/config/charts/inferencepool/README.md
+++ b/config/charts/inferencepool/README.md
@@ -103,19 +103,30 @@ $ helm install triton-llama3-8b-instruct \
 
 To deploy the EndpointPicker in a high-availability (HA) active-passive configuration, you can enable leader election. When enabled, the EPP deployment will have multiple replicas, but only one "leader" replica will be active and ready to process traffic at any given time. If the leader pod fails, another pod will be elected as the new leader, ensuring service continuity.
 
-To enable HA, set `inferenceExtension.flags.has-enable-leader-election` to `true` and increase the number of replicas in your `values.yaml` file:
+To enable HA, set `inferenceExtension.enableLeaderElection` to `true`. When enabled, the chart deploys the EPP with 3 replicas instead of 1.
 
-```yaml
-inferenceExtension:
-  replicas: 3
-  has-enable-leader-election: true
-```
+* Via `--set` flag:
 
-Then apply it with:
+  ```txt
+  helm install vllm-llama3-8b-instruct \
+  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
+  --set inferenceExtension.enableLeaderElection=true \
+  --set provider=[none|gke] \
+  oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
+  ```
 
-```txt
-helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml
-```
+* Via `values.yaml`:
+
+  ```yaml
+  inferenceExtension:
+    enableLeaderElection: true
+  ```
+
+  Then apply it with:
+
+  ```txt
+  helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml
+  ```
 
 ### Install with Monitoring
 
@@ -171,8 +182,7 @@ The following table list the configurable parameters of the chart.
 | `inferenceExtension.extraServicePorts`      | List of additional service ports to expose. Defaults to `[]`.                                                          |
 | `inferenceExtension.flags`                  | List of flags which are passed through to endpoint picker. Example flags, enable-pprof, grpc-port etc. Refer [runner.go](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/cmd/epp/runner/runner.go) for complete list.                                                            |
 | `inferenceExtension.affinity`               | Affinity for the endpoint picker. Defaults to `{}`.                                                                    |
-| `inferenceExtension.tolerations`            | Tolerations for the endpoint picker. Defaults to `[]`.                                                                 |
-| `inferenceExtension.flags.has-enable-leader-election` | Enable leader election for high availability. When enabled, only one EPP pod (the leader) will be ready to serve traffic.       |
+| `inferenceExtension.tolerations`            | Tolerations for the endpoint picker. Defaults to `[]`.                                                                 |
 | `inferenceExtension.monitoring.interval`   | Metrics scraping interval for monitoring. Defaults to `10s`.                                                           |
 | `inferenceExtension.monitoring.secret.name` | Name of the service account token secret for metrics authentication. Defaults to `inference-gateway-sa-metrics-reader-secret`. |
 | `inferenceExtension.monitoring.prometheus.enabled` | Enable Prometheus ServiceMonitor creation for EPP metrics collection. Defaults to `false`.                      |
diff --git a/config/charts/inferencepool/templates/epp-deployment.yaml b/config/charts/inferencepool/templates/epp-deployment.yaml
index abdc05a2c..d37ba3e94 100644
--- a/config/charts/inferencepool/templates/epp-deployment.yaml
+++ b/config/charts/inferencepool/templates/epp-deployment.yaml
@@ -6,7 +6,19 @@ metadata:
   labels:
     {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
 spec:
-  replicas: {{ .Values.inferenceExtension.replicas | default 1 }}
+  {{- if .Values.inferenceExtension.enableLeaderElection }}
+  replicas: 3
+  {{- else }}
+  replicas: 1
+  {{- end }}
+  strategy:
+    # The current recommended EPP deployment pattern is to have a single active replica. This ensures
+    # optimal performance of stateful operations such as the prefix-cache-aware scorer.
+    # With the Recreate strategy, the old replica is killed immediately, allowing the new replica(s) to
+    # quickly take over. This is particularly important in the high-availability setup with leader
+    # election, as the rolling update strategy would prevent the old leader from being killed because
+    # otherwise the maxUnavailable would be 100%.
+    type: Recreate
   selector:
     matchLabels:
       {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 6 }}
@@ -33,10 +45,6 @@ spec:
         - "json"
         - --config-file
         - "/config/{{ .Values.inferenceExtension.pluginsConfigFile }}"
-        {{- range .Values.inferenceExtension.flags }}
-        - "--{{ .name }}"
-        - "{{ .value }}"
-        {{- end }}
         {{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }}
         - --total-queued-requests-metric
         - "nv_trt_llm_request_metrics{request_type=waiting}"
@@ -45,6 +53,14 @@ spec:
         - --lora-info-metric
         - "" # Set an empty metric to disable LoRA metric scraping as they are not supported by Triton yet.
         {{- end }}
+        {{- if .Values.inferenceExtension.enableLeaderElection }}
+        - --ha-enable-leader-election
+        {{- end }}
+        # Pass additional flags via the inferenceExtension.flags field in values.yaml.
+        {{- range .Values.inferenceExtension.flags }}
+        - "--{{ .name }}"
+        - "{{ .value }}"
+        {{- end }}
         ports:
         - name: grpc
           containerPort: 9002
@@ -77,8 +93,8 @@ spec:
             port: 9003
             service: inference-extension
           {{- end }}
-          initialDelaySeconds: 5
-          periodSeconds: 10
+          periodSeconds: 2
+
         env:
         - name: NAMESPACE
           valueFrom:
diff --git a/config/charts/inferencepool/templates/gke.yaml b/config/charts/inferencepool/templates/gke.yaml
index 59e186a94..a0118f5f4 100644
--- a/config/charts/inferencepool/templates/gke.yaml
+++ b/config/charts/inferencepool/templates/gke.yaml
@@ -13,6 +13,10 @@ spec:
     kind: InferencePool
     name: {{ .Release.Name }}
   default:
+    # Use a more aggressive health check than the default 5s for faster
+    # switchover during EPP rollout.
+    timeoutSec: 2
+    checkIntervalSec: 2
     config:
       type: HTTP
       httpHealthCheck: