Skip to content

Commit ce55fe6

Browse files
authored
Promote plugin v2 config to be the default (#1290)
* Promote plugin v2 config to be the default
* Remove max scorer picker from the config as it is the default.
* More places to clean up references to the legacy filters
* Remove lora-affinity-scorer from the list as it is not used by default
* Use minimal config in more examples
1 parent a630637 commit ce55fe6

File tree

6 files changed

+24
-283
lines changed

6 files changed

+24
-283
lines changed

config/charts/inferencepool/templates/epp-config.yaml

Lines changed: 0 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -5,80 +5,20 @@ metadata:
55
namespace: {{ .Release.Namespace }}
66
data:
77
default-plugins.yaml: |
8-
apiVersion: inference.networking.x-k8s.io/v1alpha1
9-
kind: EndpointPickerConfig
10-
plugins:
11-
- type: low-queue-filter
12-
parameters:
13-
threshold: 128
14-
- type: lora-affinity-filter
15-
parameters:
16-
threshold: 0.999
17-
- type: least-queue-filter
18-
- type: least-kv-cache-filter
19-
- type: decision-tree-filter
20-
name: low-latency-filter
21-
parameters:
22-
current:
23-
pluginRef: low-queue-filter
24-
nextOnSuccess:
25-
decisionTree:
26-
current:
27-
pluginRef: lora-affinity-filter
28-
nextOnSuccessOrFailure:
29-
decisionTree:
30-
current:
31-
pluginRef: least-queue-filter
32-
nextOnSuccessOrFailure:
33-
decisionTree:
34-
current:
35-
pluginRef: least-kv-cache-filter
36-
nextOnFailure:
37-
decisionTree:
38-
current:
39-
pluginRef: least-queue-filter
40-
nextOnSuccessOrFailure:
41-
decisionTree:
42-
current:
43-
pluginRef: lora-affinity-filter
44-
nextOnSuccessOrFailure:
45-
decisionTree:
46-
current:
47-
pluginRef: least-kv-cache-filter
48-
- type: random-picker
49-
parameters:
50-
maxNumOfEndpoints: 1
51-
- type: single-profile-handler
52-
schedulingProfiles:
53-
- name: default
54-
plugins:
55-
- pluginRef: low-latency-filter
56-
- pluginRef: random-picker
57-
plugins-v2.yaml: |
588
apiVersion: inference.networking.x-k8s.io/v1alpha1
599
kind: EndpointPickerConfig
6010
plugins:
6111
- type: queue-scorer
6212
- type: kv-cache-utilization-scorer
6313
- type: prefix-cache-scorer
64-
parameters:
65-
hashBlockSize: 64
66-
maxPrefixBlocksToMatch: 256
67-
lruCapacityPerServer: 31250
6814
- type: max-score-picker
69-
parameters:
70-
maxNumOfEndpoints: 1
7115
- type: single-profile-handler
7216
schedulingProfiles:
7317
- name: default
7418
plugins:
7519
- pluginRef: queue-scorer
76-
weight: 1
7720
- pluginRef: kv-cache-utilization-scorer
78-
weight: 1
7921
- pluginRef: prefix-cache-scorer
80-
weight: 1
81-
- pluginRef: max-score-picker
8222
{{- if (hasKey .Values.inferenceExtension "pluginsCustomConfig") }}
8323
{{- .Values.inferenceExtension.pluginsCustomConfig | toYaml | nindent 2 }}
8424
{{- end }}

config/manifests/inferencepool-resources.yaml

Lines changed: 0 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -106,80 +106,20 @@ metadata:
106106
namespace: default
107107
data:
108108
default-plugins.yaml: |
109-
apiVersion: inference.networking.x-k8s.io/v1alpha1
110-
kind: EndpointPickerConfig
111-
plugins:
112-
- type: low-queue-filter
113-
parameters:
114-
threshold: 128
115-
- type: lora-affinity-filter
116-
parameters:
117-
threshold: 0.999
118-
- type: least-queue-filter
119-
- type: least-kv-cache-filter
120-
- type: decision-tree-filter
121-
name: low-latency-filter
122-
parameters:
123-
current:
124-
pluginRef: low-queue-filter
125-
nextOnSuccess:
126-
decisionTree:
127-
current:
128-
pluginRef: lora-affinity-filter
129-
nextOnSuccessOrFailure:
130-
decisionTree:
131-
current:
132-
pluginRef: least-queue-filter
133-
nextOnSuccessOrFailure:
134-
decisionTree:
135-
current:
136-
pluginRef: least-kv-cache-filter
137-
nextOnFailure:
138-
decisionTree:
139-
current:
140-
pluginRef: least-queue-filter
141-
nextOnSuccessOrFailure:
142-
decisionTree:
143-
current:
144-
pluginRef: lora-affinity-filter
145-
nextOnSuccessOrFailure:
146-
decisionTree:
147-
current:
148-
pluginRef: least-kv-cache-filter
149-
- type: random-picker
150-
parameters:
151-
maxNumOfEndpoints: 1
152-
- type: single-profile-handler
153-
schedulingProfiles:
154-
- name: default
155-
plugins:
156-
- pluginRef: low-latency-filter
157-
- pluginRef: random-picker
158-
plugins-v2.yaml: |
159109
apiVersion: inference.networking.x-k8s.io/v1alpha1
160110
kind: EndpointPickerConfig
161111
plugins:
162112
- type: queue-scorer
163113
- type: kv-cache-utilization-scorer
164114
- type: prefix-cache-scorer
165-
parameters:
166-
hashBlockSize: 64
167-
maxPrefixBlocksToMatch: 256
168-
lruCapacityPerServer: 31250
169115
- type: max-score-picker
170-
parameters:
171-
maxNumOfEndpoints: 1
172116
- type: single-profile-handler
173117
schedulingProfiles:
174118
- name: default
175119
plugins:
176120
- pluginRef: queue-scorer
177-
weight: 1
178121
- pluginRef: kv-cache-utilization-scorer
179-
weight: 1
180122
- pluginRef: prefix-cache-scorer
181-
weight: 1
182-
- pluginRef: max-score-picker
183123
---
184124
kind: Role
185125
apiVersion: rbac.authorization.k8s.io/v1

site-src/guides/epp-configuration/config-text.md

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ Filters out pods who's waiting queue size is greater than the specified theshold
219219
- *Parameters*:
220220
- `threshold` the waiting queue threshold. If not specified defaults to `128`
221221

222-
#### **PrefixCachePlugin**
222+
#### **PrefixCacheScorer**
223223

224224
Scores pods based on the amount of the prompt that is believed to be in the pod's KvCache.
225225

@@ -232,19 +232,32 @@ Scores pods based on the amount of the prompt is believed to be in the pod's KvC
232232
- `lruCapacityPerServer` specifies the capacity of the LRU indexer in number of entries
233233
per server (pod). If not specified defaults to `31250`
234234

235+
#### **LoRAAffinityScorer**
236+
237+
Scores pods based on whether the requested LoRA adapter is already loaded in the pod's HBM, or if
238+
the pod is ready to load the LoRA on demand.
239+
240+
- *Type*: lora-affinity-scorer
241+
- *Parameters*: none
242+
235243
#### **MaxScorePicker**
236244

237-
Picks the pod with the maximum score from the list of candidates.
245+
Picks the pod with the maximum score from the list of candidates. This is the default picker plugin
246+
if not specified.
238247

239248
- *Type*: max-score-picker
240-
- *Parameters*: none
249+
- *Parameters*:
250+
- `maxNumOfEndpoints`: Maximum number of endpoints to pick from the list of candidates, based on
251+
the scores of those endpoints. If not specified defaults to `1`.
241252

242253
#### **RandomPicker**
243254

244255
Picks a random pod from the list of candidates.
245256

246257
- *Type*: random-picker
247-
- *Parameters*: none
258+
- *Parameters*:
259+
- `maxNumOfEndpoints`: Maximum number of endpoints to pick from the list of candidates. If not
260+
specified defaults to `1`.
248261

249262
#### **KvCacheScorer**
250263

site-src/guides/epp-configuration/prefix-aware.md

Lines changed: 7 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -8,35 +8,24 @@ and queue depth.
88

99
## Enable the prefix cache plugin
1010

11-
Currently prefix cache aware plugin is implemented in the V2 scheduler as an experimental feature.
12-
To enable it, set the following environment variables when starting the EndpointPicker(EPP).
13-
14-
```
15-
EXPERIMENTAL_USE_SCHEDULER_V2: true
16-
ENABLE_PREFIX_CACHE_SCHEDULING: true
17-
```
18-
19-
See the [Use Helm section](#helm) to install an inferencepool with the environment variables.
20-
11+
Like any other plugins, the prefix cache aware plugin can be enabled/disabled via the [plugin config file](config-text.md), and is enabled in the [default configuration](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/config/charts/inferencepool/templates/epp-config.yaml).
2112

2213
## Customize the prefix cache plugin
2314

24-
The prefix cache plugin exposes the following advanced configuration options via environment variables:
15+
The prefix cache plugin exposes the following advanced configuration parameters:
2516

26-
* `PREFIX_CACHE_HASH_BLOCK_SIZE`: The plugin matches prefixes in the unit of blocks. This is the size
17+
* `hashBlockSize`: The plugin matches prefixes in the unit of blocks. This is the size
2718
of each block in number of bytes. vLLM default block size is 16 tokens. Assume 4 characters per token, the default
2819
is set to 64 in EPP. The default is recommended unless performance is critical for use cases with
2920
extremely long inputs.
3021

31-
* `PREFIX_CACHE_MAX_PREFIX_BLOCKS`: The maximum number of blocks to find prefix match. The default is
32-
128 (or 128*64=8192 characters, or roughly 2048 tokens). This is useful to tradeoff prefix match accuracy
22+
* `maxPrefixBlocksToMatch`: The maximum number of blocks to find prefix match. The default is
23+
256 (or 256*64=16384 characters, or roughly 4096 tokens). This is useful to tradeoff prefix match accuracy
3324
for performance.
3425

35-
* `PREFIX_CACHE_LRU_CAPACITY_PER_SERVER`: Maximum capacity the prefix LRU cache in number of block hashes per server (pod). Below
26+
* `lruCapacityPerServer`: Maximum capacity the prefix LRU cache in number of block hashes per server (pod). Below
3627
shows a detailed analysis on how to estimate this.
3728

38-
39-
4029
The prefix cache plugin estimates the prefix cache indexes in model server HBMs. In the perfect
4130
scenario, EPP has the exact same prefix cache entries per model server as their HBM cache entries. If
4231
the EPP cache is smaller than HBM cache, a positive EPP cache match is more accurate, but there are more
@@ -65,25 +54,4 @@ shows a detailed analysis on how to estimate this.
6554
# assume avg_chars_per_token = 4, prefix_indexer_hash_block_size = 64 (default)
6655
# each entry is about 358KB, so the memory footprint is about 11 MB per server
6756
lru_indexer_capacity_per_server = 500,000*4/64 = 31250
68-
```
69-
70-
See the [Use Helm section](#helm) to install an inferencepool with the environment variables.
71-
72-
73-
<a id="helm"></a>
74-
## Use Helm
75-
76-
Use the following reference command to install an inferencepool with the prefix
77-
cache plugin environment variable configurations:
78-
79-
```txt
80-
$ helm install vllm-llama3-8b-instruct \
81-
--set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
82-
--set inferencePool.modelServerType=vllm \
83-
--set provider.name=[none|gke] \
84-
--set inferenceExtension.env.EXPERIMENTAL_USE_SCHEDULER_V2=true \
85-
--set inferenceExtension.env.ENABLE_PREFIX_CACHE_SCHEDULING=true \
86-
--set inferenceExtension.env.PREFIX_CACHE_LRU_CAPACITY_PER_SERVER=31250 \
87-
--set inferenceExtension.env.PREFIX_CACHE_MAX_PREFIX_BLOCKS=1024 \
88-
oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
89-
```
57+
```

site-src/guides/inferencepool-rollout.md

Lines changed: 0 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -301,80 +301,20 @@ metadata:
301301
namespace: default
302302
data:
303303
default-plugins.yaml: |
304-
apiVersion: inference.networking.x-k8s.io/v1alpha1
305-
kind: EndpointPickerConfig
306-
plugins:
307-
- type: low-queue-filter
308-
parameters:
309-
threshold: 128
310-
- type: lora-affinity-filter
311-
parameters:
312-
threshold: 0.999
313-
- type: least-queue-filter
314-
- type: least-kv-cache-filter
315-
- type: decision-tree-filter
316-
name: low-latency-filter
317-
parameters:
318-
current:
319-
pluginRef: low-queue-filter
320-
nextOnSuccess:
321-
decisionTree:
322-
current:
323-
pluginRef: lora-affinity-filter
324-
nextOnSuccessOrFailure:
325-
decisionTree:
326-
current:
327-
pluginRef: least-queue-filter
328-
nextOnSuccessOrFailure:
329-
decisionTree:
330-
current:
331-
pluginRef: least-kv-cache-filter
332-
nextOnFailure:
333-
decisionTree:
334-
current:
335-
pluginRef: least-queue-filter
336-
nextOnSuccessOrFailure:
337-
decisionTree:
338-
current:
339-
pluginRef: lora-affinity-filter
340-
nextOnSuccessOrFailure:
341-
decisionTree:
342-
current:
343-
pluginRef: least-kv-cache-filter
344-
- type: random-picker
345-
parameters:
346-
maxNumOfEndpoints: 1
347-
- type: single-profile-handler
348-
schedulingProfiles:
349-
- name: default
350-
plugins:
351-
- pluginRef: low-latency-filter
352-
- pluginRef: random-picker
353-
plugins-v2.yaml: |
354304
apiVersion: inference.networking.x-k8s.io/v1alpha1
355305
kind: EndpointPickerConfig
356306
plugins:
357307
- type: queue-scorer
358308
- type: kv-cache-utilization-scorer
359309
- type: prefix-cache-scorer
360-
parameters:
361-
hashBlockSize: 64
362-
maxPrefixBlocksToMatch: 256
363-
lruCapacityPerServer: 31250
364310
- type: max-score-picker
365-
parameters:
366-
maxNumOfEndpoints: 1
367311
- type: single-profile-handler
368312
schedulingProfiles:
369313
- name: default
370314
plugins:
371315
- pluginRef: queue-scorer
372-
weight: 1
373316
- pluginRef: kv-cache-utilization-scorer
374-
weight: 1
375317
- pluginRef: prefix-cache-scorer
376-
weight: 1
377-
- pluginRef: max-score-picker
378318
EOF
379319
```
380320

0 commit comments

Comments
 (0)