Skip to content

Commit 4420525

Browse files
committed
add helm chart
1 parent b7a66bd commit 4420525

File tree

4 files changed

+390
-0
lines changed

4 files changed

+390
-0
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
# Chart metadata for the EPP latency-predictor Helm chart.
apiVersion: v2
name: epp-latency-predictor
description: A Helm chart for EPP deployment with configurable prediction servers
type: application
# Chart version (SemVer); bump on any chart change.
version: 1.0.0
# Version of the application being deployed.
appVersion: "1.0"
keywords:
  - kubernetes
  - gateway-api
  - inference
  - latency-prediction
maintainers:
  - name: kaushikmitra
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
## 🔧 Quick Deploy Commands

### Using Helm Chart:

The Helm chart updates the EPP infrastructure with configurable prediction servers deployed as sidecars.

**Prerequisites:** These Helm charts assume you already have the EPP deployed with a working inference gateway. These charts just update the EPP deployment to include prediction sidecars and SLO-aware routing that incorporates predicted latencies for routing signals.

```bash
cd epp-with-predictor-helm-chart
helm install epp ./ --set predictionServers.count=10
```

### Cleanup:

```bash
helm uninstall epp
```
Lines changed: 281 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,281 @@
{{- /*
Helper template to generate the comma-separated prediction server URL list for the
EPP PREDICTION_SERVER_URL environment variable. Produces
"http://localhost:<basePort>,http://localhost:<basePort+1>,..." for
predictionServers.count sidecars.
*/ -}}
{{- define "epp.predictionServerUrls" -}}
{{- $urls := list -}}
{{- range $i := until (int .Values.predictionServers.count) -}}
{{- $port := add $.Values.predictionServers.basePort $i -}}
{{- $urls = append $urls (printf "http://localhost:%d" $port) -}}
{{- end -}}
{{- join "," $urls -}}
{{- end -}}

# --- ServiceAccount (needed by the Deployment) ---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ .Values.inferencePool.name }}-epp
  namespace: {{ .Values.inferencePool.namespace }}
---
# --- ConfigMaps (all first) ---
# Training-server configuration: model/scaler artifact paths plus retraining knobs.
apiVersion: v1
kind: ConfigMap
metadata:
  name: latency-predictor-config
  namespace: {{ .Values.inferencePool.namespace }}
data:
  LATENCY_RETRAINING_INTERVAL_SEC: {{ .Values.latencyPredictor.config.retrainingIntervalSec | quote }}
  LATENCY_MIN_SAMPLES_FOR_RETRAIN: {{ .Values.latencyPredictor.config.minSamplesForRetrain | quote }}
  LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib"
  LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib"
  LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib"
  LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib"
  LATENCY_MODEL_TYPE: {{ .Values.latencyPredictor.config.modelType | quote }}
  LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: {{ .Values.latencyPredictor.config.maxTrainingDataSizePerBucket | quote }}
  LATENCY_QUANTILE_ALPHA: {{ .Values.latencyPredictor.config.quantileAlpha | quote }}
---
# Prediction-server configuration: local model paths on the per-sidecar emptyDir.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prediction-server-config
  namespace: {{ .Values.inferencePool.namespace }}
data:
  LATENCY_MODEL_TYPE: {{ .Values.latencyPredictor.config.modelType | quote }}
  PREDICT_HOST: "0.0.0.0"
  LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib"
  LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib"
  LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib"
  LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib"
---
# EPP plugin configuration, mounted at /config and passed via --config-file.
apiVersion: v1
kind: ConfigMap
metadata:
  name: plugins-config
  namespace: {{ .Values.inferencePool.namespace }}
data:
  default-plugins.yaml: |
    apiVersion: inference.networking.x-k8s.io/v1alpha1
    kind: EndpointPickerConfig
    plugins:
      - type: queue-scorer
      - type: kv-cache-utilization-scorer
      - type: slo-aware-routing
      - type: slo-aware-profile-handler
      - type: max-score-picker
      - type: prefix-cache-scorer
    schedulingProfiles:
      - name: default
        plugins:
          # weight 0: slo-aware-routing participates but does not influence the
          # default profile's score.
          - pluginRef: slo-aware-routing
            weight: 0
          - pluginRef: queue-scorer
          - pluginRef: kv-cache-utilization-scorer
          - pluginRef: prefix-cache-scorer
          - pluginRef: max-score-picker
      - name: slo
        plugins:
          - pluginRef: prefix-cache-scorer
            weight: 0
          - pluginRef: slo-aware-routing
          - pluginRef: max-score-picker
---
# --- EPP Deployment ---
# One EPP container plus a training-server sidecar and
# predictionServers.count prediction-server sidecars on consecutive ports.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ .Values.inferencePool.name }}-epp
  namespace: {{ .Values.inferencePool.namespace }}
  labels:
    app: {{ .Values.inferencePool.name }}-epp
spec:
  replicas: {{ .Values.deployment.replicas }}
  selector:
    matchLabels:
      app: {{ .Values.inferencePool.name }}-epp
  template:
    metadata:
      labels:
        app: {{ .Values.inferencePool.name }}-epp
    spec:
      serviceAccountName: {{ .Values.inferencePool.name }}-epp
      containers:
        - name: epp
          image: {{ .Values.epp.image }}
          imagePullPolicy: {{ .Values.epp.imagePullPolicy }}
          args:
            - "-pool-name"
            - {{ .Values.inferencePool.name | quote }}
            - "-pool-namespace"
            - {{ .Values.inferencePool.namespace | quote }}
            - "--pool-group"
            - "inference.networking.x-k8s.io"
            - "-v"
            - {{ .Values.epp.verbosity | quote }}
            - "--zap-encoder"
            - "json"
            - "-grpc-port"
            - {{ .Values.epp.grpcPort | quote }}
            - "-grpc-health-port"
            - {{ .Values.epp.grpcHealthPort | quote }}
            - "--config-file"
            - "/config/default-plugins.yaml"
            - "-enable-latency-predictor"
          env:
            # Comma-separated sidecar URLs rendered by the helper above.
            - name: PREDICTION_SERVER_URL
              value: {{ include "epp.predictionServerUrls" . | quote }}
            - name: TRAINING_SERVER_URL
              value: "http://localhost:{{ .Values.trainingServer.port }}"
            - name: LATENCY_MAX_SAMPLE_SIZE
              value: {{ .Values.latencyPredictor.maxSampleSize | quote }}
          ports:
            - containerPort: {{ .Values.epp.grpcPort }}
            - containerPort: {{ .Values.epp.grpcHealthPort }}
            - name: metrics
              containerPort: {{ .Values.epp.metricsPort }}
          livenessProbe:
            grpc:
              port: {{ .Values.epp.grpcHealthPort }}
              service: inference-extension
            initialDelaySeconds: 5
            periodSeconds: 10
          readinessProbe:
            grpc:
              port: {{ .Values.epp.grpcHealthPort }}
              service: inference-extension
            initialDelaySeconds: 5
            periodSeconds: 10
          volumeMounts:
            - name: plugins-config-volume
              mountPath: "/config"
        - name: training-server
          image: {{ .Values.trainingServer.image }}
          imagePullPolicy: {{ .Values.trainingServer.imagePullPolicy }}
          ports:
            - containerPort: {{ .Values.trainingServer.port }}
              name: training-port
          livenessProbe:
            httpGet:
              path: /healthz
              port: {{ .Values.trainingServer.port }}
            initialDelaySeconds: {{ .Values.trainingServer.livenessProbe.initialDelaySeconds }}
            periodSeconds: {{ .Values.trainingServer.livenessProbe.periodSeconds }}
          readinessProbe:
            httpGet:
              path: /readyz
              port: {{ .Values.trainingServer.port }}
            initialDelaySeconds: {{ .Values.trainingServer.readinessProbe.initialDelaySeconds }}
            periodSeconds: {{ .Values.trainingServer.readinessProbe.periodSeconds }}
          resources:
            requests:
              cpu: {{ .Values.trainingServer.resources.requests.cpu | quote }}
              memory: {{ .Values.trainingServer.resources.requests.memory | quote }}
            limits:
              cpu: {{ .Values.trainingServer.resources.limits.cpu | quote }}
              memory: {{ .Values.trainingServer.resources.limits.memory | quote }}
          envFrom:
            - configMapRef:
                name: latency-predictor-config
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: SERVER_TYPE
              value: "training"
          volumeMounts:
            - name: training-server-storage
              mountPath: /models
        {{- range $i := until (int .Values.predictionServers.count) }}
        {{- $port := add $.Values.predictionServers.basePort $i }}
        {{- $serverNum := add $i 1 }}
        - name: prediction-server-{{ $serverNum }}
          image: {{ $.Values.predictionServers.image }}
          imagePullPolicy: {{ $.Values.predictionServers.imagePullPolicy }}
          command: ["uvicorn"]
          args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "{{ $port }}"]
          ports:
            # NOTE: container port names are limited to 15 characters, so
            # predict-port-NN supports at most two-digit server numbers.
            - containerPort: {{ $port }}
              name: predict-port-{{ $serverNum }}
          livenessProbe:
            httpGet:
              path: /healthz
              port: {{ $port }}
            initialDelaySeconds: {{ $.Values.predictionServers.livenessProbe.initialDelaySeconds }}
            periodSeconds: {{ $.Values.predictionServers.livenessProbe.periodSeconds }}
          readinessProbe:
            httpGet:
              path: /readyz
              port: {{ $port }}
            initialDelaySeconds: {{ $.Values.predictionServers.readinessProbe.initialDelaySeconds }}
            periodSeconds: {{ $.Values.predictionServers.readinessProbe.periodSeconds }}
            failureThreshold: {{ $.Values.predictionServers.readinessProbe.failureThreshold }}
          resources:
            requests:
              cpu: {{ $.Values.predictionServers.resources.requests.cpu | quote }}
              memory: {{ $.Values.predictionServers.resources.requests.memory | quote }}
            limits:
              cpu: {{ $.Values.predictionServers.resources.limits.cpu | quote }}
              memory: {{ $.Values.predictionServers.resources.limits.memory | quote }}
          envFrom:
            - configMapRef:
                name: prediction-server-config
          env:
            - name: PREDICT_PORT
              value: "{{ $port }}"
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: SERVER_TYPE
              value: "prediction-{{ $serverNum }}"
            - name: TRAINING_SERVER_URL
              value: "http://localhost:{{ $.Values.trainingServer.port }}"
          volumeMounts:
            - name: prediction-server-{{ $serverNum }}-storage
              mountPath: /server_models
        {{- end }}
      volumes:
        - name: training-server-storage
          emptyDir:
            sizeLimit: {{ .Values.trainingServer.storage.sizeLimit | quote }}
        {{- range $i := until (int .Values.predictionServers.count) }}
        {{- $serverNum := add $i 1 }}
        - name: prediction-server-{{ $serverNum }}-storage
          emptyDir:
            sizeLimit: {{ $.Values.predictionServers.storage.sizeLimit | quote }}
        {{- end }}
        - name: plugins-config-volume
          configMap:
            name: plugins-config
---
# --- EPP Service ---
# Exposes the EPP gRPC endpoint, the training server, every prediction sidecar,
# and the Prometheus metrics port.
apiVersion: v1
kind: Service
metadata:
  name: {{ .Values.inferencePool.name }}-epp
  namespace: {{ .Values.inferencePool.namespace }}
spec:
  selector:
    app: {{ .Values.inferencePool.name }}-epp
  ports:
    - name: epp-grpc
      protocol: TCP
      port: {{ .Values.epp.grpcPort }}
      targetPort: {{ .Values.epp.grpcPort }}
      appProtocol: http2
    - name: latency-predictor-training
      protocol: TCP
      port: {{ .Values.trainingServer.port }}
      targetPort: {{ .Values.trainingServer.port }}
    {{- range $i := until (int .Values.predictionServers.count) }}
    {{- $port := add $.Values.predictionServers.basePort $i }}
    - name: latency-predictor-{{ add $i 1 }}
      protocol: TCP
      port: {{ $port }}
      targetPort: {{ $port }}
    {{- end }}
    - name: prometheus
      protocol: TCP
      port: {{ .Values.epp.metricsPort }}
      targetPort: {{ .Values.epp.metricsPort }}
  type: LoadBalancer
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
# Helm values for the configurable EPP deployment.

# Prediction server sidecars (1-20 recommended).
predictionServers:
  count: 10
  image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/latencypredictor-v3-prediction-server:latest
  imagePullPolicy: Always
  basePort: 8001  # First prediction server uses this port; subsequent ones increment.
  resources:
    requests:
      cpu: "500m"
      memory: "1Gi"
    limits:
      cpu: "1000m"
      memory: "2Gi"
  storage:
    sizeLimit: "10Gi"
  livenessProbe:
    initialDelaySeconds: 15
    periodSeconds: 15
  readinessProbe:
    initialDelaySeconds: 10
    periodSeconds: 5
    failureThreshold: 10

# Training server configuration.
trainingServer:
  image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/latencypredictor-v3-training-server:latest
  imagePullPolicy: Always
  port: 8000
  resources:
    requests:
      cpu: "2000m"
      memory: "4Gi"
    limits:
      cpu: "4000m"
      memory: "8Gi"
  storage:
    sizeLimit: "20Gi"
  livenessProbe:
    initialDelaySeconds: 30
    periodSeconds: 20
  readinessProbe:
    initialDelaySeconds: 45
    periodSeconds: 10

# EPP container configuration.
epp:
  image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/epp-wlp-latencypredictor-v2
  imagePullPolicy: Always
  grpcPort: 9002
  grpcHealthPort: 9003
  metricsPort: 9090
  verbosity: 4

# InferencePool configuration.
inferencePool:
  name: vllm-llama3-8b-instruct
  namespace: default
  targetPortNumber: 8000

# Latency predictor configuration. The config sub-map feeds the
# latency-predictor-config ConfigMap; values are strings because they are
# consumed as environment variables.
latencyPredictor:
  config:
    retrainingIntervalSec: "1"
    minSamplesForRetrain: "100"
    modelType: "xgboost"
    maxTrainingDataSizePerBucket: "5000"
    quantileAlpha: "0.9"
  # NOTE(review): the deployment template reads .Values.latencyPredictor.maxSampleSize
  # (not .config.maxSampleSize), so this key must sit directly under latencyPredictor.
  maxSampleSize: "10000"

# Deployment configuration.
deployment:
  replicas: 1

0 commit comments

Comments
 (0)