From 72ededd632fee6ffa406dffc3e39e0d28e37aac2 Mon Sep 17 00:00:00 2001
From: Ernest Wong
Date: Tue, 2 Sep 2025 13:31:49 -0700
Subject: [PATCH] docs: update BBR guide

Signed-off-by: Ernest Wong
---
 mkdocs.yml                                    |   5 +-
 .../guides/serve-multiple-genai-models.md     | 126 ++++++++++++++----
 2 files changed, 101 insertions(+), 30 deletions(-)

diff --git a/mkdocs.yml b/mkdocs.yml
index 982efbf12..78f1cb81c 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -12,6 +12,7 @@ theme:
   logo: images/logo/logo-text-large-horizontal-white.png
   favicon: images/favicon-64.png
   features:
+    - content.code.annotate
     - search.highlight
     - navigation.tabs
     - navigation.top
@@ -55,7 +56,7 @@ nav:
       Design Principles: concepts/design-principles.md
      Conformance: concepts/conformance.md
      Roles and Personas: concepts/roles-and-personas.md
-  - Implementations: 
+  - Implementations:
     - Gateways: implementations/gateways.md
     - Model Servers: implementations/model-servers.md
   - FAQ: faq.md
@@ -70,7 +71,7 @@ nav:
     - InferencePool Rollout: guides/inferencepool-rollout.md
     - Metrics and Observability: guides/metrics-and-observability.md
     - Configuration Guide:
-      - Configuring the plugins via configuration files or text: guides/epp-configuration/config-text.md 
+      - Configuring the plugins via configuration files or text: guides/epp-configuration/config-text.md
       - Prefix Cache Aware Plugin: guides/epp-configuration/prefix-aware.md
     - Troubleshooting Guide: guides/troubleshooting.md
     - Implementer Guides:
diff --git a/site-src/guides/serve-multiple-genai-models.md b/site-src/guides/serve-multiple-genai-models.md
index 4eb120a4b..ea4281d98 100644
--- a/site-src/guides/serve-multiple-genai-models.md
+++ b/site-src/guides/serve-multiple-genai-models.md
@@ -1,18 +1,53 @@
 # Serve multiple generative AI models
-A company wants to deploy multiple large language models (LLMs) to serve different workloads.
-For example, they might want to deploy a Gemma3 model for a chatbot interface and a Deepseek model for a recommendation application.
+
+A company wants to deploy multiple large language models (LLMs) to a cluster to serve different workloads.
+For example, they might want to deploy a Gemma3 model for a chatbot interface and a DeepSeek model for a recommendation application.
 The company needs to ensure optimal serving performance for these LLMs.
 By using an Inference Gateway, you can deploy these LLMs on your cluster with your chosen accelerator configuration in an `InferencePool`.
-You can then route requests based on the model name (such as "chatbot" and "recommender") and the `Criticality` property.
+You can then route requests based on the model name (such as `chatbot` and `recommender`) and the `Criticality` property.
 
 ## How
+
 The following diagram illustrates how an Inference Gateway routes requests to different models based on the model name.
-The model name is extracted by [Body-Based routing](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md)
-from the request body to the header. The header is then matched to dispatch requests to different
-`InferencePool` (and their EPPs) instances.
+The model name is extracted by [Body-Based routing](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) (BBR)
+from the request body and copied into a request header. The header is then matched to dispatch
+requests to different `InferencePool` instances (and their EPPs).
 
 ![Serving multiple generative AI models](../images/serve-mul-gen-AI-models.png)
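+
+For example, when BBR sees the following request body, it adds the header
+`X-Gateway-Model-Name: chatbot` to the request before route matching happens (an illustrative
+OpenAI-style body; only the `model` field matters for routing):
+
+```json
+{
+  "model": "chatbot",
+  "messages": [{"role": "user", "content": "What is the color of the sky?"}]
+}
+```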
+
+### Deploy Body-Based Routing
+
+To enable body-based routing, deploy the Body-Based Routing ExtProc server with Helm. Use the
+command that matches your Gateway provider:
+
+=== "GKE"
+
+    ```bash
+    helm install body-based-router \
+    --set provider.name=gke \
+    --version v0.5.1 \
+    oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing
+    ```
+
+=== "Istio"
+
+    ```bash
+    helm install body-based-router \
+    --set provider.name=istio \
+    --version v0.5.1 \
+    oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing
+    ```
+
+=== "Other"
+
+    ```bash
+    helm install body-based-router \
+    --version v0.5.1 \
+    oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing
+    ```
+
-This example illustrates a conceptual example regarding how to use the `HTTPRoute` object to route based on model name like “chatbot” or “recommender” to `InferencePool`.
+### Configure HTTPRoute
+
+The following example shows how an `HTTPRoute` object routes requests to different `InferencePool`
+instances based on the model name, such as `chatbot` or `recommender`.
+
 ```yaml
 apiVersion: gateway.networking.k8s.io/v1
 kind: HTTPRoute
 metadata:
   name: routes-to-llms
 spec:
   parentRefs:
   - name: inference-gateway
   rules:
@@ -25,8 +60,7 @@ spec:
   - matches:
     - headers:
       - type: Exact
-        #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header.
-        name: X-Gateway-Model-Name
+        name: X-Gateway-Model-Name # (1)!
       value: chatbot
       path:
         type: PathPrefix
         value: /
     backendRefs:
     - name: gemma3
       kind: InferencePool
@@ -37,38 +71,74 @@ spec:
   - matches:
     - headers:
       - type: Exact
-        #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header.
-        name: X-Gateway-Model-Name
+        name: X-Gateway-Model-Name # (2)!
       value: recommender
       path:
         type: PathPrefix
         value: /
     backendRefs:
     - name: deepseek-r1
-      kind: InferencePool 
+      kind: InferencePool
 ```
+
+1.  [BBR](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) copies the model name from the request body into the `X-Gateway-Model-Name` request header. The `HTTPRoute` then matches on that header, sending requests for `chatbot` to the `gemma3` `InferencePool`.
+2.  As in (1), BBR copies the model name into the `X-Gateway-Model-Name` header; requests for `recommender` are routed to the `deepseek-r1` `InferencePool`.
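+
+You can apply the route and confirm that the Gateway accepted it. A minimal check, assuming the
+manifest above is saved as `httproute.yaml`:
+
+```bash
+kubectl apply -f httproute.yaml
+# The route's status should report Accepted and ResolvedRefs conditions for the Gateway.
+kubectl get httproute routes-to-llms -o yaml
+```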
+
 ## Try it out
 
 1. Get the gateway IP:
 ```bash
 IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=80
 ```
-2. Send a few requests to model "chatbot" as follows:
-```bash
-curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
-"model": "chatbot",
-"prompt": "What is the color of the sky",
-"max_tokens": 100,
-"temperature": 0
-}'
-```
-3. Send a few requests to model "recommender" as follows:
-```bash
-curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
-"model": "recommender",
-"prompt": "Give me restaurant recommendations in Paris",
-"max_tokens": 100,
-"temperature": 0
-}'
-```
+
+=== "Chat Completions API"
+
+    1. Send a few requests to model `chatbot` as follows:
+    ```bash
+    curl -X POST -i ${IP}:${PORT}/v1/chat/completions \
+      -H "Content-Type: application/json" \
+      -d '{
+        "model": "chatbot",
+        "messages": [{"role": "user", "content": "What is the color of the sky?"}],
+        "max_tokens": 100,
+        "temperature": 0
+      }'
+    ```
+
+    2. Send a few requests to model `recommender` as follows:
+    ```bash
+    curl -X POST -i ${IP}:${PORT}/v1/chat/completions \
+      -H "Content-Type: application/json" \
+      -d '{
+        "model": "recommender",
+        "messages": [{"role": "user", "content": "Give me restaurant recommendations in Paris"}],
+        "max_tokens": 100,
+        "temperature": 0
+      }'
+    ```
+
+=== "Completions API"
+
+    1. Send a few requests to model `chatbot` as follows:
+    ```bash
+    curl -X POST -i ${IP}:${PORT}/v1/completions \
+      -H "Content-Type: application/json" \
+      -d '{
+        "model": "chatbot",
+        "prompt": "What is the color of the sky",
+        "max_tokens": 100,
+        "temperature": 0
+      }'
+    ```
+
+    2. Send a few requests to model `recommender` as follows:
+    ```bash
+    curl -X POST -i ${IP}:${PORT}/v1/completions \
+      -H "Content-Type: application/json" \
+      -d '{
+        "model": "recommender",
+        "prompt": "Give me restaurant recommendations in Paris",
+        "max_tokens": 100,
+        "temperature": 0
+      }'
+    ```
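+
+In both cases, the `model` field of each JSON response should match the model name you sent,
+confirming that BBR and the `HTTPRoute` dispatched the request to the intended `InferencePool`.
+An abbreviated chat response might look like the following (illustrative; the exact fields depend
+on your model server):
+
+```json
+{
+  "object": "chat.completion",
+  "model": "chatbot",
+  "choices": [{"index": 0, "message": {"role": "assistant", "content": "The sky is blue..."}}]
+}
+```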