Ensure EPP flags are configurable via Helm chart (#1302)

rahulgurnani · web-flow · commit 137a0b4660b9 · 2025-08-21T08:11:12.000-07:00
* Ensure EPP flags are configurable via Helm chart

* Cleanup and remove customvalues

* Address review comments

* Rename eppFlags to flags

* Fix spacing
diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md
@@ -119,11 +119,10 @@ The following table list the configurable parameters of the chart.
 | `inferenceExtension.image.hub`              | Registry URL where the endpoint picker image is hosted.                                                                |
 | `inferenceExtension.image.tag`              | Image tag of the endpoint picker.                                                                                      |
 | `inferenceExtension.image.pullPolicy`       | Image pull policy for the container. Possible values: `Always`, `IfNotPresent`, or `Never`. Defaults to `Always`.      |
-| `inferenceExtension.extProcPort`            | Port where the endpoint picker service is served for external processing. Defaults to `9002`.                          |
 | `inferenceExtension.env`                    | List of environment variables to set in the endpoint picker container as free-form YAML. Defaults to `[]`.             |
-| `inferenceExtension.extraContainerPorts`    | List of additional container ports to expose. Defaults to `[]`.                                                       |
-| `inferenceExtension.extraServicePorts`      | List of additional service ports to expose. Defaults to `[]`.                                                         |
-| `inferenceExtension.logVerbosity`           | Logging verbosity level for the endpoint picker. Defaults to `"3"`.                                                   |
+| `inferenceExtension.extraContainerPorts`    | List of additional container ports to expose. Defaults to `[]`.                                                        |
+| `inferenceExtension.extraServicePorts`      | List of additional service ports to expose. Defaults to `[]`.                                                          |
+| `inferenceExtension.flags`                  | List of `name`/`value` pairs passed to the endpoint picker as `--<name>=<value>` command-line flags. See `values.yaml` for the defaults. |
 | `provider.name`                             | Name of the Inference Gateway implementation being used. Possible values: `gke`. Defaults to `none`.                   |
 | `inferenceExtension.enableLeaderElection`   | Enable leader election for high availability. When enabled, only one EPP pod (the leader) will be ready to serve traffic. It is recommended to set `inferenceExtension.replicas` to a value greater than 1 when this is set to `true`. Defaults to `false`. |
 
diff --git a/config/charts/inferencepool/templates/epp-deployment.yaml b/config/charts/inferencepool/templates/epp-deployment.yaml
@@ -27,25 +27,10 @@ spec:
         - {{ .Release.Name }}
         - --pool-namespace
         - {{ .Release.Namespace }}
-        - --v
-        - "{{ .Values.inferenceExtension.logVerbosity | default "3" }}"
-        - --grpc-port
-        - "9002"
-        - --grpc-health-port
-        - "9003"
         - --zap-encoder
         - "json"
-        - --metrics-port
-        - "9090"
-        - --config-file
-        - "config/{{ .Values.inferenceExtension.pluginsConfigFile }}"
-        # https://pkg.go.dev/flag#hdr-Command_line_flag_syntax; space is only for non-bool flags
-        - "--enable-pprof={{ .Values.inferenceExtension.enablePprof }}"
-        - "--model-server-metrics-path={{ .Values.inferenceExtension.modelServerMetricsPath }}"
-        - "--model-server-metrics-scheme={{ .Values.inferenceExtension.modelServerMetricsScheme }}"
-        - "--model-server-metrics-https-insecure-skip-verify={{ .Values.inferenceExtension.modelServerMetricsHttpsInsecureSkipVerify }}"
-        {{- if .Values.inferenceExtension.enableLeaderElection }}
-        - "--ha-enable-leader-election"
+        {{- range .Values.inferenceExtension.flags }}
+        - "--{{ .name }}={{ .value }}"
         {{- end }}
         {{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }}
         - --total-queued-requests-metric
diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml
@@ -7,12 +7,13 @@ inferenceExtension:
     pullPolicy: Always
   extProcPort: 9002
   env: []
-  enablePprof: true # Enable pprof handlers for profiling and debugging
-  modelServerMetricsPath: "/metrics"
-  modelServerMetricsScheme: "http"
-  modelServerMetricsHttpsInsecureSkipVerify: true
-  # This is the plugins configuration file. 
   pluginsConfigFile: "default-plugins.yaml"
+  # Define additional container ports
+  extraContainerPorts: []
+  # Define additional service ports
+  extraServicePorts: []
+
+  # Optional custom plugins configuration (the example below is commented out).
   # pluginsCustomConfig:
   #   custom-plugins.yaml: |
   #     apiVersion: inference.networking.x-k8s.io/v1alpha1
@@ -30,19 +31,58 @@ inferenceExtension:
   # env:
   #   ENABLE_EXPERIMENTAL_FEATURE: "true"
 
-  # Define additional container ports
-  extraContainerPorts: []
-  # Define additional service ports
-  extraServicePorts: []
-  # Enable leader election for high availability. When enabled, it is recommended to set replicas > 1.
-  # Only the leader pod will be ready to serve traffic.
-  enableLeaderElection: false
+  flags:  # Each entry is rendered as "--<name>=<value>" in the endpoint picker args
+    - name: grpc-port
+      value: 9002
+    - name: grpc-health-port
+      value: 9003
+    - name: metrics-port
+      value: 9090
+    - name: enable-pprof
+      value: "true"  # Enable pprof handlers for profiling and debugging
+    - name: pool-group
+      value: "inference.networking.k8s.io"
+    # Log verbosity
+    - name: v
+      value: 1
+    - name: secure-serving
+      value: "true"
+    - name: health-checking
+      value: "false"
+    - name: cert-path
+      value: ""
+    - name: total-queued-requests-metric
+      value: "vllm:num_requests_waiting"
+    - name: kv-cache-usage-percentage-metric
+      value: "vllm:gpu_cache_usage_perc"
+    - name: lora-info-metric
+      value: "vllm:lora_requests_info"
+    - name: refresh-metrics-interval
+      value: "50ms"
+    - name: refresh-prometheus-metrics-interval
+      value: "5s"
+    - name: metrics-staleness-threshold
+      value: "2s"
+    - name: config-file
+      value: ""
+    - name: config-text
+      value: ""
+    - name: model-server-metrics-port
+      value: 0
+    - name: model-server-metrics-path
+      value: "/metrics"
+    - name: model-server-metrics-scheme
+      value: "http"
+    - name: model-server-metrics-https-insecure-skip-verify
+      value: "true"
+    - name: ha-enable-leader-election  # flag name must match the EPP's "--ha-enable-leader-election"
+      value: "false"  # When "true", only the leader pod is ready to serve traffic; set replicas > 1
 
 inferencePool:
   targetPorts:
     - number: 8000
   modelServerType: vllm # vllm, triton-tensorrt-llm
-  modelServers: # REQUIRED
+  modelServers:
     matchLabels:
       app: vllm-llama3-8b-instruct