
Commit f1a34ab

committed
Add BBR user guide, yaml for model-aware routing
1 parent 1457f63 commit f1a34ab

File tree

5 files changed
+332 -93 lines changed
Lines changed: 51 additions & 0 deletions

```yaml
---
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: llm-llama-route
spec:
  parentRefs:
  - group: gateway.networking.k8s.io
    kind: Gateway
    name: inference-gateway
  rules:
  - backendRefs:
    - group: inference.networking.k8s.io
      kind: InferencePool
      name: vllm-llama3-8b-instruct
    matches:
    - path:
        type: PathPrefix
        value: /
      headers:
      - type: Exact
        name: X-Gateway-Model-Name
        value: 'meta-llama/Llama-3.1-8B-Instruct'
    timeouts:
      request: 300s
---
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: llm-phi4-route
spec:
  parentRefs:
  - group: gateway.networking.k8s.io
    kind: Gateway
    name: inference-gateway
  rules:
  - backendRefs:
    - group: inference.networking.k8s.io
      kind: InferencePool
      name: vllm-phi4-mini-instruct
    matches:
    - path:
        type: PathPrefix
        value: /
      headers:
      - type: Exact
        name: X-Gateway-Model-Name
        value: 'microsoft/Phi-4-mini-instruct'
    timeouts:
      request: 300s
---
```
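
Together these two routes implement model-aware routing: both match the same `/` path prefix, and dispatch is decided by the `X-Gateway-Model-Name` header, which the BBR extension populates from the `model` field of the request body. A minimal sketch of how a client would exercise this, assuming BBR is deployed and the gateway address is exported in a hypothetical `GATEWAY_IP` variable:

```shell
# BBR copies the "model" field from the JSON body into the
# X-Gateway-Model-Name header, so this request should be routed
# to the vllm-phi4-mini-instruct InferencePool.
curl -i http://${GATEWAY_IP}/v1/completions \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "microsoft/Phi-4-mini-instruct",
    "prompt": "What is the capital of France?",
    "max_tokens": 100
  }'
```

Sending the same request with `"model": "meta-llama/Llama-3.1-8B-Instruct"` should instead match `llm-llama-route` and land on the `vllm-llama3-8b-instruct` pool.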
Lines changed: 99 additions & 0 deletions

```yaml
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: phi4-mini
  namespace: default
spec:
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 20Gi
  volumeMode: Filesystem
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: phi4-mini
  namespace: default
  labels:
    app: phi4-mini
spec:
  replicas: 1
  selector:
    matchLabels:
      app: phi4-mini
  template:
    metadata:
      labels:
        app: phi4-mini
    spec:
      volumes:
      - name: cache-volume
        persistentVolumeClaim:
          claimName: phi4-mini
      # vLLM needs to access the host's shared memory for tensor parallel inference.
      # - name: shm
      #   emptyDir:
      #     medium: Memory
      #     sizeLimit: "2Gi"
      containers:
      - name: phi4-mini
        image: vllm/vllm-openai:latest
        command: ["/bin/sh", "-c"]
        args: [
          "vllm serve microsoft/Phi-4-mini-instruct --trust-remote-code --enable-chunked-prefill"
        ]
        env:
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token
              key: token
        ports:
        - containerPort: 8000
        resources:
          limits:
            # cpu: "10"
            # memory: 40G
            nvidia.com/gpu: "1"
          requests:
            # cpu: "10"
            # memory: 40Gi
            nvidia.com/gpu: "1"
        volumeMounts:
        - mountPath: /root/.cache/huggingface
          name: cache-volume
        # - name: shm
        #   mountPath: /dev/shm
        livenessProbe:
          httpGet:
            path: /health
            port: 8000
          initialDelaySeconds: 600
          periodSeconds: 10
        readinessProbe:
          httpGet:
            path: /health
            port: 8000
          initialDelaySeconds: 600
          periodSeconds: 5
---
apiVersion: v1
kind: Service
metadata:
  name: phi4-mini
  namespace: default
spec:
  ports:
  - name: http-phi4-mini
    port: 80
    protocol: TCP
    targetPort: 8000
  # The label selector should match the deployment labels & it is useful for prefix caching feature
  selector:
    app: phi4-mini
  sessionAffinity: None
  type: ClusterIP
```
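
Before wiring this Deployment into an InferencePool, it can be sanity-checked directly. A quick smoke test, assuming the manifest above has been applied in the `default` namespace and the pod has become ready:

```shell
# Forward local port 8000 to the phi4-mini Service (port 80 -> targetPort 8000).
kubectl port-forward svc/phi4-mini 8000:80 &

# vLLM's OpenAI-compatible server should list microsoft/Phi-4-mini-instruct.
curl http://localhost:8000/v1/models
```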

site-src/guides/index.md

Lines changed: 5 additions & 1 deletion
````diff
@@ -178,7 +178,7 @@ Tooling:
 2. Install Istio
 
 ```
-TAG=$(curl https://storage.googleapis.com/istio-build/dev/1.27-dev)
+TAG=$(curl https://storage.googleapis.com/istio-build/dev/1.28-dev)
 # on Linux
 wget https://storage.googleapis.com/istio-build/dev/$TAG/istioctl-$TAG-linux-amd64.tar.gz
 tar -xvf istioctl-$TAG-linux-amd64.tar.gz
@@ -319,6 +319,10 @@ Tooling:
 kubectl get httproute llm-route -o yaml
 ```
 
+### Deploy the Body Based Router Extension (Optional)
+
+This guide shows how to get started serving a single base model per L7 URL path. If you additionally want model-aware routing, where more than one base model is served at the same L7 URL path, you will need the optional Body Based Routing (BBR) extension, described in the [`Serving Multiple GenAI Models`](serve-multiple-genai-models.md) section later in this guide.
+
 ### Deploy InferenceObjective (Optional)
 
 Deploy the sample InferenceObjective which allows you to specify priority of requests.
````