Skip to content

Commit 40544e2

Browse files
authored
feat: support configurable inferencepool (envoyproxy#1239)
1 parent cc9267b commit 40544e2

File tree

6 files changed

+765
-11
lines changed

6 files changed

+765
-11
lines changed
Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
# Copyright Envoy AI Gateway Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
# The full text of the Apache license is available in the LICENSE file at
4+
# the root of the repo.
5+
6+
# This example demonstrates how to use InferencePool annotations to configure
7+
# the external processor's processing mode and allow mode override settings.
8+
9+
# ClusterIP Service that exposes the mistral-upstream pods on TCP port 8080.
apiVersion: v1
kind: Service
metadata:
  name: mistral-upstream
  namespace: default
spec:
  type: ClusterIP
  # Traffic is routed to pods carrying the label app=mistral-upstream.
  selector:
    app: mistral-upstream
  ports:
    - port: 8080
      targetPort: 8080
      protocol: TCP
22+
---
23+
# Single-replica Deployment of the test upstream. Its pods carry the
# app=mistral-upstream label that the Service and InferencePool select on.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mistral-upstream
  namespace: default
  labels:
    app: mistral-upstream
spec:
  replicas: 1
  selector:
    matchLabels:
      app: mistral-upstream
  template:
    metadata:
      labels:
        app: mistral-upstream
    spec:
      containers:
        - name: upstream
          image: registry.k8s.io/ai-gateway/testupstream:v0.0.0-latest
          imagePullPolicy: IfNotPresent
          env:
            # Env values must be strings, so the port number stays quoted.
            - name: UPSTREAM_PORT
              value: "8080"
          ports:
            - containerPort: 8080
          # Probe /health every second, starting one second after launch.
          readinessProbe:
            initialDelaySeconds: 1
            periodSeconds: 1
            httpGet:
              path: /health
              port: 8080
55+
---
56+
# InferencePool whose annotations configure the external processor's body
# processing mode and its allow-mode-override setting.
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
  name: mistral-with-annotations
  namespace: default
  annotations:
    # Processing body mode: "duplex" (default) or "buffered".
    #   "duplex"   corresponds to ProcessingMode_FULL_DUPLEX_STREAMED
    #   "buffered" corresponds to ProcessingMode_BUFFERED
    aigateway.envoyproxy.io/processing-body-mode: "buffered"

    # Allow mode override: "false" (default) or "true".
    # Corresponds to the AllowModeOverride field in Envoy's ExternalProcessor.
    # Annotation values are strings, so the boolean-looking value is quoted.
    aigateway.envoyproxy.io/allow-mode-override: "true"
spec:
  targetPortNumber: 8080
  # Pods that make up the pool.
  selector:
    app: mistral-upstream
  # Endpoint picker Service that serves this pool.
  extensionRef:
    name: mistral-epp-with-annotations
76+
---
77+
# InferenceModel registering the "mistral:latest" model against the pool.
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: mistral-with-annotations
  namespace: default
spec:
  # Quoted so the embedded colon can never be misread as a mapping.
  modelName: "mistral:latest"
  criticality: Critical
  poolRef:
    # Bind the InferenceModel to the InferencePool.
    name: mistral-with-annotations
88+
---
89+
# ClusterIP Service in front of the endpoint picker (EPP) pods. Port 9002 is
# the port the EPP container is told to serve gRPC on (-grpcPort).
apiVersion: v1
kind: Service
metadata:
  name: mistral-epp-with-annotations
  namespace: default
spec:
  type: ClusterIP
  selector:
    app: mistral-epp-with-annotations
  ports:
    - port: 9002
      targetPort: 9002
      protocol: TCP
      appProtocol: http2
103+
---
104+
# Single-replica Deployment of the endpoint picker (EPP) for the
# mistral-with-annotations InferencePool.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mistral-epp-with-annotations
  namespace: default
  labels:
    app: mistral-epp-with-annotations
spec:
  replicas: 1
  selector:
    matchLabels:
      app: mistral-epp-with-annotations
  template:
    metadata:
      labels:
        app: mistral-epp-with-annotations
    spec:
      # Conservatively, this timeout should mirror the longest grace period
      # of the pods within the pool.
      terminationGracePeriodSeconds: 130
      containers:
        - name: epp
          image: registry.k8s.io/gateway-api-inference-extension/epp:v0.5.1
          imagePullPolicy: IfNotPresent
          # Every flag and value is quoted uniformly; each one is a plain
          # string argument handed to the EPP binary.
          args:
            - "-poolName"
            - "mistral-with-annotations"
            - "-poolNamespace"
            - "default"
            - "-v"
            - "4"
            - "--zap-encoder"
            - "json"
            - "-grpcPort"
            - "9002"
            - "-grpcHealthPort"
            - "9003"
            - "-configFile"
            - "/config/default-plugins.yaml"
          ports:
            - containerPort: 9002
            - containerPort: 9003
            - name: metrics
              containerPort: 9090
          # Both probes use the gRPC health port given by -grpcHealthPort.
          livenessProbe:
            initialDelaySeconds: 5
            periodSeconds: 10
            grpc:
              port: 9003
              service: inference-extension
          readinessProbe:
            initialDelaySeconds: 5
            periodSeconds: 10
            grpc:
              port: 9003
              service: inference-extension
          # Mounted at /config so the -configFile path above resolves.
          volumeMounts:
            - name: plugins-config-volume
              mountPath: "/config"
      volumes:
        - name: plugins-config-volume
          configMap:
            name: plugins-config
166+
---
167+
# ConfigMap holding the EndpointPickerConfig that the EPP Deployment mounts
# under /config and loads through its -configFile argument.
apiVersion: v1
kind: ConfigMap
metadata:
  name: plugins-config
  namespace: default
data:
  # Literal block scalar: the text below is handed to the EPP verbatim.
  default-plugins.yaml: |
    apiVersion: inference.networking.x-k8s.io/v1alpha1
    kind: EndpointPickerConfig
    plugins:
    - type: low-queue-filter
      parameters:
        threshold: 128
    - type: lora-affinity-filter
      parameters:
        threshold: 0.999
    - type: least-queue-filter
    - type: least-kv-cache-filter
    - type: decision-tree-filter
      name: low-latency-filter
      parameters:
        current:
          pluginRef: low-queue-filter
        nextOnSuccess:
          decisionTree:
            current:
              pluginRef: lora-affinity-filter
            nextOnSuccess:
              pluginRef: least-queue-filter
            nextOnFailure:
              pluginRef: least-kv-cache-filter
        nextOnFailure:
          pluginRef: least-kv-cache-filter
    - type: queue-scorer
      parameters:
        maxQueueSize: 128
    - type: kv-cache-scorer
      parameters:
        maxKVCacheUsage: 0.95
    - type: prefix-cache-scorer
      parameters:
        hashBlockSize: 64
        maxPrefixBlocksToMatch: 256
        lruCapacityPerServer: 31250
    - type: max-score-picker
      parameters:
        maxNumOfEndpoints: 1
    - type: single-profile-handler
    schedulingProfiles:
    - name: default
      plugins:
      - pluginRef: queue-scorer
        weight: 1
      - pluginRef: kv-cache-scorer
        weight: 1
      - pluginRef: prefix-cache-scorer
        weight: 1
      - pluginRef: max-score-picker
225+
---
226+
# Read-only (get/watch/list) access to InferencePools, InferenceModels and
# core Pods.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: pod-read
rules:
  - apiGroups:
      - "inference.networking.x-k8s.io"
    resources:
      - "inferencepools"
    verbs:
      - "get"
      - "watch"
      - "list"
  - apiGroups:
      - "inference.networking.x-k8s.io"
    resources:
      - "inferencemodels"
    verbs:
      - "get"
      - "watch"
      - "list"
  # The empty string selects the core API group, where Pods live.
  - apiGroups:
      - ""
    resources:
      - "pods"
    verbs:
      - "get"
      - "watch"
      - "list"
240+
---
241+
# Grants the pod-read ClusterRole to the "default" ServiceAccount in the
# "default" namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: pod-read
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: pod-read
subjects:
  - kind: ServiceAccount
    name: default
    namespace: default

0 commit comments

Comments
 (0)