Ensure EPP flags are configurable via Helm chart #1302

Open · wants to merge 3 commits into `main`
18 changes: 15 additions & 3 deletions config/charts/inferencepool/README.md
@@ -103,9 +103,21 @@ The following table lists the configurable parameters of the chart.
| `inferenceExtension.image.pullPolicy` | Image pull policy for the container. Possible values: `Always`, `IfNotPresent`, or `Never`. Defaults to `Always`. |
| `inferenceExtension.extProcPort` | Port where the endpoint picker service is served for external processing. Defaults to `9002`. |
| `inferenceExtension.env` | List of environment variables to set in the endpoint picker container as free-form YAML. Defaults to `[]`. |
| `inferenceExtension.extraContainerPorts` | List of additional container ports to expose. Defaults to `[]`. |
| `inferenceExtension.extraServicePorts` | List of additional service ports to expose. Defaults to `[]`. |
| `inferenceExtension.logVerbosity` | Logging verbosity level for the endpoint picker. Defaults to `"3"`. |
| `inferenceExtension.enablePprof` | Enables pprof handlers for profiling and debugging. Defaults to `true`. |
| `inferenceExtension.modelServerMetricsPath` | Path on the model server from which metrics are scraped. Defaults to `/metrics`. |
> **Collaborator:** These descriptions (modelServerMetricsScheme/Path/Port) are a little vague, could we add more detail?

| `inferenceExtension.modelServerMetricsScheme` | Scheme (`http` or `https`) used to scrape metrics from the model server. Defaults to `http`. |
| `inferenceExtension.modelServerMetricsPort` | Port used to scrape metrics from the model server. If unset, it defaults to the target port specified on the InferencePool. |
> **Collaborator:** It's worth specifying that if this port is unset, it defaults to the target port specified on the InferencePool.

| `inferenceExtension.modelServerMetricsHttpsInsecureSkipVerify` | When the metrics scheme is `https`, whether to skip TLS certificate verification (`InsecureSkipVerify`). Defaults to `true`. |
| `inferenceExtension.secureServing` | Enables secure serving. Defaults to `true`. |
| `inferenceExtension.healthChecking` | Enables health checking for the endpoint picker. Defaults to `false`. |
> **Collaborator:** Specify what the default is.

| `inferenceExtension.certPath` | The path to the certificate for secure serving. The certificate and private key files are assumed to be named tls.crt and tls.key, respectively. If not set, and secureServing is enabled, then a self-signed certificate is used. |
| `inferenceExtension.refreshMetricsInterval` | Interval at which model server metrics are refreshed. Defaults to `50ms`. |
| `inferenceExtension.refreshPrometheusMetricsInterval` | Interval at which Prometheus metrics are flushed. Defaults to `5s`. |
| `inferenceExtension.metricsStalenessThreshold` | Duration after which metrics are considered stale; a pod's metrics older than this threshold are treated as invalid. Defaults to `2s`. |
> **Collaborator:** "This is used to determine if a pod's metrics are fresh enough." Consider rewording to something like: "metrics staleness above the configured threshold will be considered invalid".

| `inferenceExtension.totalQueuedRequestsMetric` | Prometheus metric exported by the model server for the number of queued requests. Defaults to `vllm:num_requests_waiting`. |
| `inferenceExtension.extraContainerPorts` | List of additional container ports to expose on the EPP container. Defaults to `[]`. |
> **Collaborator:** Let's clarify that these extra ports are for the EPP itself.

| `inferenceExtension.extraServicePorts` | List of additional ports to expose on the EPP service. Defaults to `[]`. |
| `inferenceExtension.logVerbosity` | Logging verbosity level for the endpoint picker. Defaults to `"3"`. |
> **Collaborator:** Suggested change (the chart default is `"1"`, not `"3"`):
> | `inferenceExtension.logVerbosity` | Logging verbosity level for the endpoint picker. Defaults to `"1"`. |

| `provider.name` | Name of the Inference Gateway implementation being used. Possible values: `gke`. Defaults to `none`. |

## Notes
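For illustration, a minimal override file exercising a few of the parameters above might look like the following sketch. The file name and the chosen values are hypothetical; the keys and their defaults come from the chart's `values.yaml` shown further down.

```yaml
# epp-values.yaml -- hypothetical override file for the inferencepool chart
inferenceExtension:
  logVerbosity: 4                               # raise EPP log verbosity above the chart default
  healthChecking: true                          # chart default is false
  modelServerMetricsScheme: "https"             # scrape model server metrics over TLS
  modelServerMetricsHttpsInsecureSkipVerify: false
  metricsStalenessThreshold: "5s"               # chart default is 2s
```

From a checkout of the repo, such a file could be applied with `helm install <release-name> ./config/charts/inferencepool -f epp-values.yaml` (the release name is a placeholder).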
10 changes: 10 additions & 0 deletions config/charts/inferencepool/templates/epp-deployment.yaml
@@ -44,6 +44,16 @@ spec:
- "--model-server-metrics-path={{ .Values.inferenceExtension.modelServerMetricsPath }}"
- "--model-server-metrics-scheme={{ .Values.inferenceExtension.modelServerMetricsScheme }}"
- "--model-server-metrics-https-insecure-skip-verify={{ .Values.inferenceExtension.modelServerMetricsHttpsInsecureSkipVerify }}"
- "--model-server-metrics-port={{ .Values.inferenceExtension.modelServerMetricsPort }}"
- "--secure-serving={{ .Values.inferenceExtension.secureServing }}"
- "--health-checking={{ .Values.inferenceExtension.healthChecking }}"
- "--cert-path={{ .Values.inferenceExtension.certPath }}"
- "--total-queued-requests-metric={{ .Values.inferenceExtension.totalQueuedRequestsMetric }}"
- "--kv-cache-usage-percentage-metric={{ .Values.inferenceExtension.kvCacheUsagePercentageMetric }}"
- "--lora-info-metric={{ .Values.inferenceExtension.loraInfoMetric }}"
- "--refresh-metrics-interval={{ .Values.inferenceExtension.refreshMetricsInterval }}"
- "--refresh-prometheus-metrics-interval={{ .Values.inferenceExtension.refreshPrometheusMetricsInterval }}"
- "--metrics-staleness-threshold={{ .Values.inferenceExtension.metricsStalenessThreshold }}"
{{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }}
- --total-queued-requests-metric
- "nv_trt_llm_request_metrics{request_type=waiting}"
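For reference, with the chart defaults shown in `values.yaml` below, the new template lines would render to roughly the following container args. This is a sketch of the expected `helm template` output, not captured from an actual run.

```yaml
# Approximate rendered args for the EPP container with the chart's default values
args:
  - "--model-server-metrics-path=/metrics"
  - "--model-server-metrics-scheme=http"
  - "--model-server-metrics-https-insecure-skip-verify=true"
  - "--model-server-metrics-port=0"
  - "--secure-serving=true"
  - "--health-checking=false"
  - "--cert-path="
  - "--total-queued-requests-metric=vllm:num_requests_waiting"
  - "--kv-cache-usage-percentage-metric=vllm:gpu_cache_usage_perc"
  - "--lora-info-metric=vllm:lora_requests_info"
  - "--refresh-metrics-interval=50ms"
  - "--refresh-prometheus-metrics-interval=5s"
  - "--metrics-staleness-threshold=2s"
```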
30 changes: 24 additions & 6 deletions config/charts/inferencepool/values.yaml
@@ -1,4 +1,5 @@
inferenceExtension:
# Number of replicas
replicas: 1
image:
name: epp
@@ -7,12 +8,29 @@ inferenceExtension:
pullPolicy: Always
extProcPort: 9002
env: []
enablePprof: true # Enable pprof handlers for profiling and debugging
enablePprof: true # Enable pprof handlers for profiling and debugging
modelServerMetricsPath: "/metrics"
modelServerMetricsScheme: "http"
modelServerMetricsHttpsInsecureSkipVerify: true
# This is the plugins configuration file.
grpcPort: 9002
grpcHealthPort: 9003
metricsPort: 9090
poolName: ""
poolNamespace: "default"
refreshMetricsInterval: "50ms"
refreshPrometheusMetricsInterval: "5s"
secureServing: true
healthChecking: false
totalQueuedRequestsMetric: "vllm:num_requests_waiting"
kvCacheUsagePercentageMetric: "vllm:gpu_cache_usage_perc"
loraInfoMetric: "vllm:lora_requests_info"
certPath: ""
metricsStalenessThreshold: "2s"

pluginsConfigFile: "default-plugins.yaml"
logVerbosity: 1

# This is the plugins configuration file.
# pluginsCustomConfig:
# custom-plugins.yaml: |
# apiVersion: inference.networking.x-k8s.io/v1alpha1
@@ -34,18 +52,18 @@ inferenceExtension:
# Example environment variables:
# env:
# KV_CACHE_SCORE_WEIGHT: "1"

# Define additional container ports
modelServerMetricsPort: 0
extraContainerPorts: []
# Define additional service ports
extraServicePorts: []

inferencePool:
targetPortNumber: 8000
modelServerType: vllm # vllm, triton-tensorrt-llm
# modelServers: # REQUIRED
> **Contributor:** Revert this change please, we should not default this; it should be explicitly set.
>
> **Collaborator:** Was this comment addressed?

# matchLabels:
# app: vllm-llama3-8b-instruct
# modelServers:
# matchLabels:
# app: vllm-llama3-8b-instruct

provider:
name: none
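To tie the pieces together, the `inferencePool.modelServerType` value drives the conditional at the end of the deployment template above: selecting `triton-tensorrt-llm` makes the chart append `--total-queued-requests-metric "nv_trt_llm_request_metrics{request_type=waiting}"`. A minimal override sketch follows; the `matchLabels` value is hypothetical.

```yaml
# Hypothetical override selecting the Triton TensorRT-LLM backend
inferencePool:
  targetPortNumber: 8000
  modelServerType: triton-tensorrt-llm  # template then appends the Triton queued-requests metric flag
  modelServers:
    matchLabels:
      app: triton-llama3-8b-instruct    # illustrative label selector for the model server pods
```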