docs: Add leader election section in helm chart README

yangligt2 · yangligt2 · commit a0bb0f9138b5 · 2025-08-11T19:11:48.000Z
diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md
@@ -79,6 +79,24 @@ $ helm install triton-llama3-8b-instruct \
   oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
 ```
 
+### Install with High Availability (HA)
+
+To deploy the EndpointPicker (EPP) in a high-availability (HA) active-passive configuration, you can enable leader election. With leader election enabled, the EPP deployment can run multiple replicas, but only one "leader" replica will be active and ready to process traffic at any given time. If the leader pod fails, another pod will be elected as the new leader, ensuring service continuity.
+
+To enable HA, set `inferenceExtension.enableLeaderElection` to `true` and increase the number of replicas in your `values.yaml` file:
+
+```yaml
+inferenceExtension:
+  replicas: 3
+  enableLeaderElection: true
+```
+
+Then apply it with:
+
+```txt
+helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml
+```
+
 ## Uninstall
 
 Run the following command to uninstall the chart:
@@ -107,6 +125,8 @@ The following table list the configurable parameters of the chart.
 | `inferenceExtension.extraServicePorts`      | List of additional service ports to expose. Defaults to `[]`.                                                         |
 | `inferenceExtension.logVerbosity`           | Logging verbosity level for the endpoint picker. Defaults to `"3"`.                                                   |
 | `provider.name`                             | Name of the Inference Gateway implementation being used. Possible values: `gke`. Defaults to `none`.                   |
+| `inferenceExtension.enableLeaderElection`   | Enable leader election for high availability. When enabled, only one EPP pod (the leader) will be ready to serve traffic. It is recommended to set `inferenceExtension.replicas` to a value greater than 1 when this is set to `true`. Defaults to `false`. |
+
 
 ## Notes
 
diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml
@@ -34,6 +34,8 @@ inferenceExtension:
   extraContainerPorts: []
   # Define additional service ports
   extraServicePorts: []
+  # Enable leader election for high availability. When enabled, it is recommended to set replicas > 1.
+  # Only the leader pod will be ready to serve traffic.
   enableLeaderElection: false
 
 inferencePool:
diff --git a/test/e2e/epp/e2e_test.go b/test/e2e/epp/e2e_test.go
@@ -72,7 +72,7 @@ var _ = ginkgo.Describe("InferencePool", func() {
 
 	ginkgo.When("The Inference Extension is running", func() {
 		ginkgo.It("Should route traffic to target model servers", func() {
-			verifyTrafficRouting(infObjective)
+			verifyTrafficRouting()
 		})
 
 		ginkgo.It("Should expose EPP metrics after generating traffic", func() {
@@ -113,7 +113,7 @@ var _ = ginkgo.Describe("InferencePool", func() {
 			}
 
 			ginkgo.By("STEP 1: Verifying initial leader is working correctly before failover")
-			verifyTrafficRouting(infObjective)
+			verifyTrafficRouting()
 			verifyMetrics()
 
 			ginkgo.By("STEP 2: Finding and deleting the current leader pod")
@@ -156,7 +156,7 @@ var _ = ginkgo.Describe("InferencePool", func() {
 			ginkgo.By("Found new leader pod: " + newLeaderPod.Name)
 
 			ginkgo.By("STEP 5: Verifying the new leader is working correctly after failover")
-			verifyTrafficRouting(infObjective)
+			verifyTrafficRouting()
 			verifyMetrics()
 		})
 	})
@@ -171,7 +171,7 @@ func newInferenceObjective(ns string) *v1alpha2.InferenceObjective {
 }
 
 // verifyTrafficRouting contains the logic for the "Should route traffic to target model servers" test.
-func verifyTrafficRouting(infObjective *v1alpha2.InferenceObjective) {
+func verifyTrafficRouting() {
 	ginkgo.By("Verifying traffic routing")
 	for _, t := range []struct {
 		api              string