Skip to content

Commit ce55fe6

Browse files
authored
Promote plugin v2 config to be the default (#1290)
* Promote plugin v2 config to be the default
* Remove max scorer picker from the config as it is the default.
* More places to clean up references to the legacy filters
* Remove lora-affinity-scorer from the list as it is not used by default
* Use minimal config in more examples
1 parent a630637 commit ce55fe6

File tree

6 files changed

+24
-283
lines changed

6 files changed

+24
-283
lines changed

config/charts/inferencepool/templates/epp-config.yaml

Lines changed: 0 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -5,80 +5,20 @@ metadata:
55
namespace: {{ .Release.Namespace }}
66
data:
77
default-plugins.yaml: |
8-
apiVersion: inference.networking.x-k8s.io/v1alpha1
9-
kind: EndpointPickerConfig
10-
plugins:
11-
- type: low-queue-filter
12-
parameters:
13-
threshold: 128
14-
- type: lora-affinity-filter
15-
parameters:
16-
threshold: 0.999
17-
- type: least-queue-filter
18-
- type: least-kv-cache-filter
19-
- type: decision-tree-filter
20-
name: low-latency-filter
21-
parameters:
22-
current:
23-
pluginRef: low-queue-filter
24-
nextOnSuccess:
25-
decisionTree:
26-
current:
27-
pluginRef: lora-affinity-filter
28-
nextOnSuccessOrFailure:
29-
decisionTree:
30-
current:
31-
pluginRef: least-queue-filter
32-
nextOnSuccessOrFailure:
33-
decisionTree:
34-
current:
35-
pluginRef: least-kv-cache-filter
36-
nextOnFailure:
37-
decisionTree:
38-
current:
39-
pluginRef: least-queue-filter
40-
nextOnSuccessOrFailure:
41-
decisionTree:
42-
current:
43-
pluginRef: lora-affinity-filter
44-
nextOnSuccessOrFailure:
45-
decisionTree:
46-
current:
47-
pluginRef: least-kv-cache-filter
48-
- type: random-picker
49-
parameters:
50-
maxNumOfEndpoints: 1
51-
- type: single-profile-handler
52-
schedulingProfiles:
53-
- name: default
54-
plugins:
55-
- pluginRef: low-latency-filter
56-
- pluginRef: random-picker
57-
plugins-v2.yaml: |
588
apiVersion: inference.networking.x-k8s.io/v1alpha1
599
kind: EndpointPickerConfig
6010
plugins:
6111
- type: queue-scorer
6212
- type: kv-cache-utilization-scorer
6313
- type: prefix-cache-scorer
64-
parameters:
65-
hashBlockSize: 64
66-
maxPrefixBlocksToMatch: 256
67-
lruCapacityPerServer: 31250
6814
- type: max-score-picker
69-
parameters:
70-
maxNumOfEndpoints: 1
7115
- type: single-profile-handler
7216
schedulingProfiles:
7317
- name: default
7418
plugins:
7519
- pluginRef: queue-scorer
76-
weight: 1
7720
- pluginRef: kv-cache-utilization-scorer
78-
weight: 1
7921
- pluginRef: prefix-cache-scorer
80-
weight: 1
81-
- pluginRef: max-score-picker
8222
{{- if (hasKey .Values.inferenceExtension "pluginsCustomConfig") }}
8323
{{- .Values.inferenceExtension.pluginsCustomConfig | toYaml | nindent 2 }}
8424
{{- end }}

config/manifests/inferencepool-resources.yaml

Lines changed: 0 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -106,80 +106,20 @@ metadata:
106106
namespace: default
107107
data:
108108
default-plugins.yaml: |
109-
apiVersion: inference.networking.x-k8s.io/v1alpha1
110-
kind: EndpointPickerConfig
111-
plugins:
112-
- type: low-queue-filter
113-
parameters:
114-
threshold: 128
115-
- type: lora-affinity-filter
116-
parameters:
117-
threshold: 0.999
118-
- type: least-queue-filter
119-
- type: least-kv-cache-filter
120-
- type: decision-tree-filter
121-
name: low-latency-filter
122-
parameters:
123-
current:
124-
pluginRef: low-queue-filter
125-
nextOnSuccess:
126-
decisionTree:
127-
current:
128-
pluginRef: lora-affinity-filter
129-
nextOnSuccessOrFailure:
130-
decisionTree:
131-
current:
132-
pluginRef: least-queue-filter
133-
nextOnSuccessOrFailure:
134-
decisionTree:
135-
current:
136-
pluginRef: least-kv-cache-filter
137-
nextOnFailure:
138-
decisionTree:
139-
current:
140-
pluginRef: least-queue-filter
141-
nextOnSuccessOrFailure:
142-
decisionTree:
143-
current:
144-
pluginRef: lora-affinity-filter
145-
nextOnSuccessOrFailure:
146-
decisionTree:
147-
current:
148-
pluginRef: least-kv-cache-filter
149-
- type: random-picker
150-
parameters:
151-
maxNumOfEndpoints: 1
152-
- type: single-profile-handler
153-
schedulingProfiles:
154-
- name: default
155-
plugins:
156-
- pluginRef: low-latency-filter
157-
- pluginRef: random-picker
158-
plugins-v2.yaml: |
159109
apiVersion: inference.networking.x-k8s.io/v1alpha1
160110
kind: EndpointPickerConfig
161111
plugins:
162112
- type: queue-scorer
163113
- type: kv-cache-utilization-scorer
164114
- type: prefix-cache-scorer
165-
parameters:
166-
hashBlockSize: 64
167-
maxPrefixBlocksToMatch: 256
168-
lruCapacityPerServer: 31250
169115
- type: max-score-picker
170-
parameters:
171-
maxNumOfEndpoints: 1
172116
- type: single-profile-handler
173117
schedulingProfiles:
174118
- name: default
175119
plugins:
176120
- pluginRef: queue-scorer
177-
weight: 1
178121
- pluginRef: kv-cache-utilization-scorer
179-
weight: 1
180122
- pluginRef: prefix-cache-scorer
181-
weight: 1
182-
- pluginRef: max-score-picker
183123
---
184124
kind: Role
185125
apiVersion: rbac.authorization.k8s.io/v1

site-src/guides/epp-configuration/config-text.md

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ Filters out pods who's waiting queue size is greater than the specified theshold
219219
- *Parameters*:
220220
- `threshold` the waiting queue threshold. If not specified defaults to `128`
221221

222-
#### **PrefixCachePlugin**
222+
#### **PrefixCacheScorer**
223223

224224
Scores pods based on the amount of the prompt that is believed to be in the pod's KvCache.
225225

@@ -232,19 +232,32 @@ Scores pods based on the amount of the prompt is believed to be in the pod's KvC
232232
- `lruCapacityPerServer` specifies the capacity of the LRU indexer in number of entries
233233
per server (pod). If not specified defaults to `31250`
234234

235+
#### **LoRAAffinityScorer**
236+
237+
Scores pods based on whether the requested LoRA adapter is already loaded in the pod's HBM, or if
238+
the pod is ready to load the LoRA on demand.
239+
240+
- *Type*: lora-affinity-scorer
241+
- *Parameters*: none
242+
235243
#### **MaxScorePicker**
236244

237-
Picks the pod with the maximum score from the list of candidates.
245+
Picks the pod with the maximum score from the list of candidates. This is the default picker plugin
246+
if not specified.
238247

239248
- *Type*: max-score-picker
240-
- *Parameters*: none
249+
- *Parameters*:
250+
- `maxNumOfEndpoints`: Maximum number of endpoints to pick from the list of candidates, based on
251+
the scores of those endpoints. If not specified defaults to `1`.
241252

242253
#### **RandomPicker**
243254

244255
Picks a random pod from the list of candidates.
245256

246257
- *Type*: random-picker
247-
- *Parameters*: none
258+
- *Parameters*:
259+
- `maxNumOfEndpoints`: Maximum number of endpoints to pick from the list of candidates. If not
260+
specified defaults to `1`.
248261

249262
#### **KvCacheScorer**
250263

site-src/guides/epp-configuration/prefix-aware.md

Lines changed: 7 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -8,35 +8,24 @@ and queue depth.
88

99
## Enable the prefix cache plugin
1010

11-
Currently prefix cache aware plugin is implemented in the V2 scheduler as an experimental feature.
12-
To enable it, set the following environment variables when starting the EndpointPicker(EPP).
13-
14-
```
15-
EXPERIMENTAL_USE_SCHEDULER_V2: true
16-
ENABLE_PREFIX_CACHE_SCHEDULING: true
17-
```
18-
19-
See the [Use Helm section](#helm) to install an inferencepool with the environment variables.
20-
11+
Like any other plugins, the prefix cache aware plugin can be enabled/disabled via the [plugin config file](config-text.md), and is enabled in the [default configuration](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/config/charts/inferencepool/templates/epp-config.yaml).
2112

2213
## Customize the prefix cache plugin
2314

24-
The prefix cache plugin exposes the following advanced configuration options via environment variables:
15+
The prefix cache plugin exposes the following advanced configuration parameters:
2516

26-
* `PREFIX_CACHE_HASH_BLOCK_SIZE`: The plugin matches prefixes in the unit of blocks. This is the size
17+
* `hashBlockSize`: The plugin matches prefixes in the unit of blocks. This is the size
2718
of each block in number of bytes. vLLM default block size is 16 tokens. Assume 4 characters per token, the default
2819
is set to 64 in EPP. The default is recommended unless performance is critical for use cases with
2920
extremely long inputs.
3021

31-
* `PREFIX_CACHE_MAX_PREFIX_BLOCKS`: The maximum number of blocks to find prefix match. The default is
32-
128 (or 128*64=8192 characters, or roughly 2048 tokens). This is useful to tradeoff prefix match accuracy
22+
* `maxPrefixBlocksToMatch`: The maximum number of blocks to find prefix match. The default is
23+
256 (or 256*64=16384 characters, or roughly 4096 tokens). This is useful to tradeoff prefix match accuracy
3324
for performance.
3425

35-
* `PREFIX_CACHE_LRU_CAPACITY_PER_SERVER`: Maximum capacity the prefix LRU cache in number of block hashes per server (pod). Below
26+
* `lruCapacityPerServer`: Maximum capacity the prefix LRU cache in number of block hashes per server (pod). Below
3627
shows a detailed analysis on how to estimate this.
3728

38-
39-
4029
The prefix cache plugin estimates the prefix cache indexes in model server HBMs. In the perfect
4130
scenario, EPP has the exact same prefix cache entries per model server as their HBM cache entries. If
4231
the EPP cache is smaller than HBM cache, a positive EPP cache match is more accurate, but there are more
@@ -65,25 +54,4 @@ shows a detailed analysis on how to estimate this.
6554
# assume avg_chars_per_token = 4, prefix_indexer_hash_block_size = 64 (default)
6655
# each entry is about 358KB, so the memory footprint is about 11 MB per server
6756
lru_indexer_capacity_per_server = 500,000*4/64 = 31250
68-
```
69-
70-
See the [Use Helm section](#helm) to install an inferencepool with the environment variables.
71-
72-
73-
<a id="helm"></a>
74-
## Use Helm
75-
76-
Use the following reference command to install an inferencepool with the prefix
77-
cache plugin environment variable configurations:
78-
79-
```txt
80-
$ helm install vllm-llama3-8b-instruct \
81-
--set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
82-
--set inferencePool.modelServerType=vllm \
83-
--set provider.name=[none|gke] \
84-
--set inferenceExtension.env.EXPERIMENTAL_USE_SCHEDULER_V2=true \
85-
--set inferenceExtension.env.ENABLE_PREFIX_CACHE_SCHEDULING=true \
86-
--set inferenceExtension.env.PREFIX_CACHE_LRU_CAPACITY_PER_SERVER=31250 \
87-
--set inferenceExtension.env.PREFIX_CACHE_MAX_PREFIX_BLOCKS=1024 \
88-
oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
89-
```
57+
```

site-src/guides/inferencepool-rollout.md

Lines changed: 0 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -301,80 +301,20 @@ metadata:
301301
namespace: default
302302
data:
303303
default-plugins.yaml: |
304-
apiVersion: inference.networking.x-k8s.io/v1alpha1
305-
kind: EndpointPickerConfig
306-
plugins:
307-
- type: low-queue-filter
308-
parameters:
309-
threshold: 128
310-
- type: lora-affinity-filter
311-
parameters:
312-
threshold: 0.999
313-
- type: least-queue-filter
314-
- type: least-kv-cache-filter
315-
- type: decision-tree-filter
316-
name: low-latency-filter
317-
parameters:
318-
current:
319-
pluginRef: low-queue-filter
320-
nextOnSuccess:
321-
decisionTree:
322-
current:
323-
pluginRef: lora-affinity-filter
324-
nextOnSuccessOrFailure:
325-
decisionTree:
326-
current:
327-
pluginRef: least-queue-filter
328-
nextOnSuccessOrFailure:
329-
decisionTree:
330-
current:
331-
pluginRef: least-kv-cache-filter
332-
nextOnFailure:
333-
decisionTree:
334-
current:
335-
pluginRef: least-queue-filter
336-
nextOnSuccessOrFailure:
337-
decisionTree:
338-
current:
339-
pluginRef: lora-affinity-filter
340-
nextOnSuccessOrFailure:
341-
decisionTree:
342-
current:
343-
pluginRef: least-kv-cache-filter
344-
- type: random-picker
345-
parameters:
346-
maxNumOfEndpoints: 1
347-
- type: single-profile-handler
348-
schedulingProfiles:
349-
- name: default
350-
plugins:
351-
- pluginRef: low-latency-filter
352-
- pluginRef: random-picker
353-
plugins-v2.yaml: |
354304
apiVersion: inference.networking.x-k8s.io/v1alpha1
355305
kind: EndpointPickerConfig
356306
plugins:
357307
- type: queue-scorer
358308
- type: kv-cache-utilization-scorer
359309
- type: prefix-cache-scorer
360-
parameters:
361-
hashBlockSize: 64
362-
maxPrefixBlocksToMatch: 256
363-
lruCapacityPerServer: 31250
364310
- type: max-score-picker
365-
parameters:
366-
maxNumOfEndpoints: 1
367311
- type: single-profile-handler
368312
schedulingProfiles:
369313
- name: default
370314
plugins:
371315
- pluginRef: queue-scorer
372-
weight: 1
373316
- pluginRef: kv-cache-utilization-scorer
374-
weight: 1
375317
- pluginRef: prefix-cache-scorer
376-
weight: 1
377-
- pluginRef: max-score-picker
378318
EOF
379319
```
380320

0 commit comments

Comments
 (0)