Add helm values and polish README and SLO routing guide

BenjaminBraunDev · BenjaminBraunDev · commit f63bc019dd8b · 2025-11-20T21:40:05.000Z
diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md
@@ -132,8 +132,10 @@ Here is an example of how to install the chart with SLO-aware routing enabled:
 ```txt
 $ helm install vllm-llama3-8b-instruct . \
   --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
+  --set inferenceExtension.monitoring.gke.enabled=true \
   --set inferenceExtension.latencyPredictor.enabled=true \
-  --set provider.name=gke
+  --set provider.name=gke \
+  -f values.yaml
 ```
 
 #### SLO-Aware Router Environment Variables
@@ -150,14 +152,6 @@ The behavior of the SLO-aware router can be fine-tuned using the following envir
 | `HEADROOM_TTFT_WEIGHT`           | The weight to give to the TTFT when a pod has positive headroom.                                        | `0.8`       |
 | `HEADROOM_TPOT_WEIGHT`           | The weight to give to the TPOT when a pod has positive headroom.                                        | `0.2`       |
 | `HEADROOM_SELECTION_STRATEGY`    | The strategy to use for selecting a pod based on headroom. Options: `least`, `most`, `composite-least`, `composite-most`, `composite-only`. | `least`     |
-| `COMPOSITE_KV_WEIGHT`            | The weight to give to the KV cache utilization in the composite score.                                  | `1`         |
-| `COMPOSITE_QUEUE_WEIGHT`         | The weight to give to the queue size in the composite score.                                            | `1`         |
-| `COMPOSITE_PREFIX_WEIGHT`        | The weight to give to the prefix cache score in the composite score.                                    | `1`         |
-| `STICKY_EPSILON`                 | The probability of exploring a non-sticky pod.                                                          | `0.01`      |
-| `NEG_HEADROOM_EPSILON`           | The probability of exploring a pod with negative headroom.                                              | `0.01`      |
-| `AFFINITY_GATE_TAU`              | The stickiness threshold for the affinity gate.                                                         | `0.80`      |
-| `AFFINITY_GATE_TAU_GLOBAL`       | The global stickiness threshold for the affinity gate.                                                  | `0.99`      |
-| `POD_SELECTION_MODE`             | The mode for selecting a pod from the weighted list. Options: `linear` (weighted random), `max` (argmax). | `linear`    |
 
 **Note:** Enabling SLO-aware routing also exposes a number of Prometheus metrics for monitoring the feature, including actual vs. predicted latency, SLO violations, and more.
 
diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml
@@ -1,9 +1,9 @@
 inferenceExtension:
   replicas: 1
   image:
-    name: epp
-    hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension
-    tag: main
+    name: epp-wlp-latencypredictor-helm-v2
+    hub:  us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo
+    tag: latest
     pullPolicy: Always
   extProcPort: 9002
   env: []
@@ -12,11 +12,6 @@ inferenceExtension:
   extraContainerPorts: []
   # Define additional service ports
   extraServicePorts: []
-#  extraServicePorts:
-#    - name: http
-#      port: 8081
-#      protocol: TCP
-#      targetPort: 8081
 
   # This is the plugins configuration file.
   # pluginsCustomConfig:
@@ -43,10 +38,6 @@ inferenceExtension:
   affinity: {}
 
   tolerations: []
-  
-  # Sidecar configuration for EPP
-  sidecar:
-    enabled: false
 
   # Monitoring configuration for EPP
   monitoring:
@@ -71,6 +62,89 @@ inferenceExtension:
       sampler: "parentbased_traceidratio"
       samplerArg: "0.1"
 
+  # Latency Predictor Configuration
+  latencyPredictor:
+    enabled: false
+    
+    # Training Server Configuration
+    trainingServer:
+      image:
+        hub: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo
+        name: latencypredictor-v3-training-server
+        tag: latest
+        pullPolicy: Always
+      port: 8000
+      resources:
+        requests:
+          cpu: "2000m"
+          memory: "4Gi"
+        limits:
+          cpu: "4000m"
+          memory: "8Gi"
+      livenessProbe:
+        httpGet:
+          path: /healthz
+          port: 8000
+        initialDelaySeconds: 30
+        periodSeconds: 20
+      readinessProbe:
+        httpGet:
+          path: /readyz
+          port: 8000
+        initialDelaySeconds: 45
+        periodSeconds: 10
+      volumeSize: "20Gi"
+      config:
+        LATENCY_RETRAINING_INTERVAL_SEC: "1"
+        LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100"
+        LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib"
+        LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib"
+        LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib"
+        LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib"
+        LATENCY_MODEL_TYPE: "xgboost"
+        LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000"
+        LATENCY_QUANTILE_ALPHA: "0.9"
+
+    # Prediction Server Configuration
+    predictionServers:
+      count: 10
+      startPort: 8001
+      image:
+        hub: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo
+        name: latencypredictor-v3-prediction-server
+        tag: latest
+        pullPolicy: Always
+      resources:
+        requests:
+          cpu: "500m"
+          memory: "1Gi"
+        limits:
+          cpu: "1000m"
+          memory: "2Gi"
+      livenessProbe:
+        httpGet:
+          path: /healthz
+        initialDelaySeconds: 15
+        periodSeconds: 15
+      readinessProbe:
+        httpGet:
+          path: /readyz
+        initialDelaySeconds: 10
+        periodSeconds: 5
+        failureThreshold: 10
+      volumeSize: "10Gi"
+      config:
+        LATENCY_MODEL_TYPE: "xgboost"
+        PREDICT_HOST: "0.0.0.0"
+        LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib"
+        LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib"
+        LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib"
+        LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib"
+
+    # EPP Environment Variables for Latency Predictor
+    eppEnv:
+      LATENCY_MAX_SAMPLE_SIZE: "10000"
+
 inferencePool:
   targetPorts:
     - number: 8000
@@ -94,25 +168,12 @@ provider:
     # Set to true if the cluster is an Autopilot cluster.
     autopilot: false
 
-  # Istio-specific configuration.
-  # This block is only used if name is "istio".
-  istio:
-    destinationRule:
-      # Provide a way to override the default calculated host
-      host: ""
-      # Optional: Enables customization of the traffic policy
-      trafficPolicy: {}
-        # connectionPool:
-        #   http:
-        #     maxRequestsPerConnection: 256000
-
-# DEPRECATED and will be removed in v1.3. Instead, use `provider.istio.*`.
 istio:
   destinationRule:
     # Provide a way to override the default calculated host
-    host: ""
+    host: "" 
     # Optional: Enables customization of the traffic policy
     trafficPolicy: {}
       # connectionPool:
       #   http:
-      #     maxRequestsPerConnection: 256000
+      #     maxRequestsPerConnection: 256000
diff --git a/site-src/guides/slo-aware-routing.md b/site-src/guides/slo-aware-routing.md
@@ -61,6 +61,8 @@ Key categories of metrics include:
 -   **SLO Violations**: Counters and gauges are available to track when SLOs are violated. This can be used to alert on SLO breaches.
 -   **SLO Thresholds**: The current SLO thresholds for TTFT and TPOT are also exposed as metrics.
 
+NOTE: TPOT as reported here is equivalent to vLLM's **ITL** (Inter-Token Latency), because vLLM defines its own TPOT as the average time per output token *including the TTFT*. That TTFT-inclusive metric is commonly known as NTPOT in other contexts, and we don't capture it here.
+
 The following is a comprehensive list of the Prometheus metrics exposed:
 
 | Metric Name                                                | Description                                                                                                      |
@@ -81,5 +83,3 @@ The following is a comprehensive list of the Prometheus metrics exposed:
 | `inference_objective_request_ttft_slo_violation_total`     | Counter of TTFT SLO violations for each model and target model.                                                  |
 | `inference_objective_request_tpot_slo_violation`           | Boolean indicator (0 or 1) of whether the last TPOT measurement violated the SLO threshold for each model and target model. |
 | `inference_objective_request_tpot_slo_violation_total`     | Counter of TPOT SLO violations for each model and target model.                                                  |
-| `inference_objective_request_ttft_slo_threshold_seconds`   | Current TTFT SLO threshold in seconds for each model and target model.                                           |
-| `inference_objective_request_tpot_slo_threshold_seconds`   | Current TPOT SLO threshold in seconds for each model and target model.                                           |