
Commit f1a34ab

committed
Add BBR user guide, yaml for model-aware routing
1 parent 1457f63 commit f1a34ab

File tree

5 files changed
+332 -93 lines changed
Lines changed: 51 additions & 0 deletions

```yaml
---
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: llm-llama-route
spec:
  parentRefs:
  - group: gateway.networking.k8s.io
    kind: Gateway
    name: inference-gateway
  rules:
  - backendRefs:
    - group: inference.networking.k8s.io
      kind: InferencePool
      name: vllm-llama3-8b-instruct
    matches:
    - path:
        type: PathPrefix
        value: /
      headers:
      - type: Exact
        name: X-Gateway-Model-Name
        value: 'meta-llama/Llama-3.1-8B-Instruct'
    timeouts:
      request: 300s
---
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: llm-phi4-route
spec:
  parentRefs:
  - group: gateway.networking.k8s.io
    kind: Gateway
    name: inference-gateway
  rules:
  - backendRefs:
    - group: inference.networking.k8s.io
      kind: InferencePool
      name: vllm-phi4-mini-instruct
    matches:
    - path:
        type: PathPrefix
        value: /
      headers:
      - type: Exact
        name: X-Gateway-Model-Name
        value: 'microsoft/Phi-4-mini-instruct'
    timeouts:
      request: 300s
---
```
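
Together these two routes implement model-aware routing: both match the same `/` path prefix, and dispatch is decided by the `X-Gateway-Model-Name` header, which the BBR extension populates from the `model` field of the request body. A minimal sketch of how a client would exercise this, assuming BBR is deployed and the gateway address is exported in a hypothetical `GATEWAY_IP` variable:

```shell
# BBR copies the "model" field from the JSON body into the
# X-Gateway-Model-Name header, so this request should be routed
# to the vllm-phi4-mini-instruct InferencePool.
curl -i http://${GATEWAY_IP}/v1/completions \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "microsoft/Phi-4-mini-instruct",
    "prompt": "What is the capital of France?",
    "max_tokens": 100
  }'
```

Sending the same request with `"model": "meta-llama/Llama-3.1-8B-Instruct"` should instead match `llm-llama-route` and land on the `vllm-llama3-8b-instruct` pool.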
Lines changed: 99 additions & 0 deletions

```yaml
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: phi4-mini
  namespace: default
spec:
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 20Gi
  volumeMode: Filesystem
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: phi4-mini
  namespace: default
  labels:
    app: phi4-mini
spec:
  replicas: 1
  selector:
    matchLabels:
      app: phi4-mini
  template:
    metadata:
      labels:
        app: phi4-mini
    spec:
      volumes:
      - name: cache-volume
        persistentVolumeClaim:
          claimName: phi4-mini
      # vLLM needs to access the host's shared memory for tensor parallel inference.
      # - name: shm
      #   emptyDir:
      #     medium: Memory
      #     sizeLimit: "2Gi"
      containers:
      - name: phi4-mini
        image: vllm/vllm-openai:latest
        command: ["/bin/sh", "-c"]
        args: [
          "vllm serve microsoft/Phi-4-mini-instruct --trust-remote-code --enable-chunked-prefill"
        ]
        env:
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token
              key: token
        ports:
        - containerPort: 8000
        resources:
          limits:
            # cpu: "10"
            # memory: 40G
            nvidia.com/gpu: "1"
          requests:
            # cpu: "10"
            # memory: 40Gi
            nvidia.com/gpu: "1"
        volumeMounts:
        - mountPath: /root/.cache/huggingface
          name: cache-volume
        # - name: shm
        #   mountPath: /dev/shm
        livenessProbe:
          httpGet:
            path: /health
            port: 8000
          initialDelaySeconds: 600
          periodSeconds: 10
        readinessProbe:
          httpGet:
            path: /health
            port: 8000
          initialDelaySeconds: 600
          periodSeconds: 5
---
apiVersion: v1
kind: Service
metadata:
  name: phi4-mini
  namespace: default
spec:
  ports:
  - name: http-phi4-mini
    port: 80
    protocol: TCP
    targetPort: 8000
  # The label selector should match the deployment labels & it is useful for prefix caching feature
  selector:
    app: phi4-mini
  sessionAffinity: None
  type: ClusterIP
```
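
Before wiring this Deployment into an InferencePool, it can be sanity-checked directly. A quick smoke test, assuming the manifest above has been applied in the `default` namespace and the pod has become ready:

```shell
# Forward local port 8000 to the phi4-mini Service (port 80 -> targetPort 8000).
kubectl port-forward svc/phi4-mini 8000:80 &

# vLLM's OpenAI-compatible server should list microsoft/Phi-4-mini-instruct.
curl http://localhost:8000/v1/models
```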

site-src/guides/index.md

Lines changed: 5 additions & 1 deletion
````diff
@@ -178,7 +178,7 @@ Tooling:
 2. Install Istio
 
 ```
-TAG=$(curl https://storage.googleapis.com/istio-build/dev/1.27-dev)
+TAG=$(curl https://storage.googleapis.com/istio-build/dev/1.28-dev)
 # on Linux
 wget https://storage.googleapis.com/istio-build/dev/$TAG/istioctl-$TAG-linux-amd64.tar.gz
 tar -xvf istioctl-$TAG-linux-amd64.tar.gz
@@ -319,6 +319,10 @@ Tooling:
 kubectl get httproute llm-route -o yaml
 ```
 
+### Deploy the Body Based Router Extension (Optional)
+
+This guide shows how to get started serving a single base model per L7 URL path. If you additionally want model-aware routing, where more than one base model is served at the same L7 URL path, you will need the optional Body Based Routing (BBR) extension, described in the [`Serving Multiple GenAI Models`](serve-multiple-genai-models.md) section later in this guide.
+
 ### Deploy InferenceObjective (Optional)
 
 Deploy the sample InferenceObjective which allows you to specify priority of requests.
````