Skip to content

Commit 9a5491f

Browse files
authored
Conformance: Fixes the EPP ConfigMap Namespace (#1166)
* Conformance: Fixes the EPP ConfigMap namespace

  Signed-off-by: Daneyon Hansen <[email protected]>

* Renames config file in rollout.md

  Signed-off-by: Daneyon Hansen <[email protected]>

---------

Signed-off-by: Daneyon Hansen <[email protected]>
1 parent 9e50fa2 commit 9a5491f

File tree

3 files changed

+133
-39
lines changed

3 files changed

+133
-39
lines changed

config/manifests/inferencepool-resources.yaml

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
1-
# Note: If you change this file, please also change the file used for e2e tests!
2-
#
3-
# https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/test/testdata/inferencepool-e2e.yaml
1+
# Note: If you change this file, please also change:
2+
# - ./test/testdata/inferencepool-e2e.yaml
3+
# - ./conformance/resources/manifests/manifests.yaml
4+
# - ./site-src/guides/inferencepool-rollout.md
5+
---
46
apiVersion: inference.networking.x-k8s.io/v1alpha2
57
kind: InferencePool
68
metadata:

conformance/resources/manifests/manifests.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -336,7 +336,7 @@ apiVersion: v1
336336
kind: ConfigMap
337337
metadata:
338338
name: plugins-config
339-
namespace: default
339+
namespace: gateway-conformance-app-backend
340340
data:
341341
conformance-plugins.yaml: |
342342
apiVersion: inference.networking.x-k8s.io/v1alpha1

site-src/guides/inferencepool-rollout.md

Lines changed: 127 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -177,7 +177,6 @@ spec:
177177
terminationGracePeriodSeconds: 130
178178
nodeSelector:
179179
cloud.google.com/gke-accelerator: "nvidia-h100-80gb"
180-
181180
volumes:
182181
- name: data
183182
emptyDir: {}
@@ -250,40 +249,133 @@ spec:
250249
spec:
251250
terminationGracePeriodSeconds: 130
252251
containers:
253-
- name: epp
254-
image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
255-
imagePullPolicy: Always
256-
args:
257-
- -poolName
258-
- "vllm-llama3-8b-instruct-new"
259-
- "-poolNamespace"
260-
- "default"
261-
- -v
262-
- "4"
263-
- --zap-encoder
264-
- "json"
265-
- -grpcPort
266-
- "9002"
267-
- -grpcHealthPort
268-
- "9003"
269-
ports:
270-
- containerPort: 9002
271-
- containerPort: 9003
272-
- name: metrics
273-
containerPort: 9090
274-
livenessProbe:
275-
grpc:
276-
port: 9003
277-
service: inference-extension
278-
initialDelaySeconds: 5
279-
periodSeconds: 10
280-
readinessProbe:
281-
grpc:
282-
port: 9003
283-
service: inference-extension
284-
initialDelaySeconds: 5
285-
periodSeconds: 10
286-
EOF
252+
- name: epp
253+
image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
254+
imagePullPolicy: Always
255+
args:
256+
- -poolName
257+
- "vllm-llama3-8b-instruct-new"
258+
- -poolNamespace
259+
- "default"
260+
- -v
261+
- "4"
262+
- --zap-encoder
263+
- "json"
264+
- -grpcPort
265+
- "9002"
266+
- -grpcHealthPort
267+
- "9003"
268+
- -configFile
269+
- "/config/default-plugins.yaml"
270+
ports:
271+
- containerPort: 9002
272+
name: grpc
273+
- containerPort: 9003
274+
name: grpc-health
275+
- containerPort: 9090
276+
name: metrics
277+
livenessProbe:
278+
grpc:
279+
port: 9003
280+
service: inference-extension
281+
initialDelaySeconds: 5
282+
periodSeconds: 10
283+
readinessProbe:
284+
grpc:
285+
port: 9003
286+
service: inference-extension
287+
initialDelaySeconds: 5
288+
periodSeconds: 10
289+
volumeMounts:
290+
- name: plugins-config-volume
291+
mountPath: /config
292+
volumes:
293+
- name: plugins-config-volume
294+
configMap:
295+
name: plugins-config
296+
---
297+
apiVersion: v1
298+
kind: ConfigMap
299+
metadata:
300+
name: plugins-config
301+
namespace: default
302+
data:
303+
default-plugins.yaml: |
304+
apiVersion: inference.networking.x-k8s.io/v1alpha1
305+
kind: EndpointPickerConfig
306+
plugins:
307+
- type: low-queue-filter
308+
parameters:
309+
threshold: 128
310+
- type: lora-affinity-filter
311+
parameters:
312+
threshold: 0.999
313+
- type: least-queue-filter
314+
- type: least-kv-cache-filter
315+
- type: decision-tree-filter
316+
name: low-latency-filter
317+
parameters:
318+
current:
319+
pluginRef: low-queue-filter
320+
nextOnSuccess:
321+
decisionTree:
322+
current:
323+
pluginRef: lora-affinity-filter
324+
nextOnSuccessOrFailure:
325+
decisionTree:
326+
current:
327+
pluginRef: least-queue-filter
328+
nextOnSuccessOrFailure:
329+
decisionTree:
330+
current:
331+
pluginRef: least-kv-cache-filter
332+
nextOnFailure:
333+
decisionTree:
334+
current:
335+
pluginRef: least-queue-filter
336+
nextOnSuccessOrFailure:
337+
decisionTree:
338+
current:
339+
pluginRef: lora-affinity-filter
340+
nextOnSuccessOrFailure:
341+
decisionTree:
342+
current:
343+
pluginRef: least-kv-cache-filter
344+
- type: random-picker
345+
parameters:
346+
maxNumOfEndpoints: 1
347+
- type: single-profile-handler
348+
schedulingProfiles:
349+
- name: default
350+
plugins:
351+
- pluginRef: low-latency-filter
352+
- pluginRef: random-picker
353+
plugins-v2.yaml: |
354+
apiVersion: inference.networking.x-k8s.io/v1alpha1
355+
kind: EndpointPickerConfig
356+
plugins:
357+
- type: queue-scorer
358+
- type: kv-cache-scorer
359+
- type: prefix-cache-scorer
360+
parameters:
361+
hashBlockSize: 64
362+
maxPrefixBlocksToMatch: 256
363+
lruCapacityPerServer: 31250
364+
- type: max-score-picker
365+
parameters:
366+
maxNumOfEndpoints: 1
367+
- type: single-profile-handler
368+
schedulingProfiles:
369+
- name: default
370+
plugins:
371+
- pluginRef: queue-scorer
372+
weight: 1
373+
- pluginRef: kv-cache-scorer
374+
weight: 1
375+
- pluginRef: prefix-cache-scorer
376+
weight: 1
377+
- pluginRef: max-score-picker
378+
EOF
287379
```
288380

289381
### Direct traffic to the new inference pool

0 commit comments

Comments
 (0)