From 300fbbe8decd9d867bfbcc8da7deb0eb1c95ee26 Mon Sep 17 00:00:00 2001
From: David Breitgand
Date: Thu, 13 Nov 2025 20:18:35 +0200
Subject: [PATCH] Extend the "serve multiple AI models" guide with an example
 of serving multiple LoRAs (many LoRAs per model, while serving multiple
 models)

---
 .../bbr-example/httproute_bbr_lora.yaml       |  93 ++++++
 config/manifests/vllm/sim-deployment-1.yaml   |  44 +++
 .../guides/serve-multiple-genai-models.md     | 301 +++++++++++++++++-
 3 files changed, 435 insertions(+), 3 deletions(-)
 create mode 100644 config/manifests/bbr-example/httproute_bbr_lora.yaml
 create mode 100644 config/manifests/vllm/sim-deployment-1.yaml

diff --git a/config/manifests/bbr-example/httproute_bbr_lora.yaml b/config/manifests/bbr-example/httproute_bbr_lora.yaml
new file mode 100644
index 000000000..957859787
--- /dev/null
+++ b/config/manifests/bbr-example/httproute_bbr_lora.yaml
@@ -0,0 +1,93 @@
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: llm-llama-route
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  rules:
+  - backendRefs:
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: vllm-llama3-8b-instruct
+    matches:
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        # Body-Based routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is used to copy the model name from the request body to this header.
+        name: X-Gateway-Model-Name
+        value: 'meta-llama/Llama-3.1-8B-Instruct'
+    timeouts:
+      request: 300s
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: llm-deepseek-route # give this HTTPRoute any name that helps you group and track the matchers
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  rules:
+  - backendRefs:
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: vllm-deepseek-r1
+    matches:
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        # Body-Based routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is used to copy the model name from the request body to this header.
+        name: X-Gateway-Model-Name
+        value: 'deepseek/vllm-deepseek-r1'
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        # Body-Based routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is used to copy the model name from the request body to this header.
+        name: X-Gateway-Model-Name
+        value: 'food-review'
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        # Body-Based routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is used to copy the model name from the request body to this header.
+        name: X-Gateway-Model-Name
+        value: 'movie-critique'
+    timeouts:
+      request: 300s
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: vllm-llama3-8b-instruct-lora-food-review-1 # give this HTTPRoute any name that helps you group and track the routes
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  rules:
+  - backendRefs:
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: vllm-llama3-8b-instruct
+    matches:
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        # Body-Based routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is used to copy the model name from the request body to this header.
+        name: X-Gateway-Model-Name
+        value: 'food-review-1' # this is the name of the LoRA as defined in the vLLM deployment
+    timeouts:
+      request: 300s
diff --git a/config/manifests/vllm/sim-deployment-1.yaml b/config/manifests/vllm/sim-deployment-1.yaml
new file mode 100644
index 000000000..9798077ca
--- /dev/null
+++ b/config/manifests/vllm/sim-deployment-1.yaml
@@ -0,0 +1,44 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-deepseek-r1
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-deepseek-r1
+  template:
+    metadata:
+      labels:
+        app: vllm-deepseek-r1
+    spec:
+      containers:
+      - name: vllm-sim
+        image: ghcr.io/llm-d/llm-d-inference-sim:v0.4.0
+        imagePullPolicy: Always
+        args:
+        - --model
+        - deepseek/vllm-deepseek-r1
+        - --port
+        - "8000"
+        - --max-loras
+        - "2"
+        - --lora-modules
+        - '{"name": "food-review"}'
+        - '{"name": "movie-critique"}'
+        env:
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        ports:
+        - containerPort: 8000
+          name: http
+          protocol: TCP
+        resources:
+          requests:
+            cpu: 10m
diff --git a/site-src/guides/serve-multiple-genai-models.md b/site-src/guides/serve-multiple-genai-models.md
index 25198e88f..fa359e4bb 100644
--- a/site-src/guides/serve-multiple-genai-models.md
+++ b/site-src/guides/serve-multiple-genai-models.md
@@ -1,9 +1,11 @@
-# Serve multiple generative AI models
+# Serve multiple generative AI models and multiple LoRAs per base model
 A company wants to deploy multiple large language models (LLMs) to a cluster to serve different workloads. For example, they might want to deploy a Gemma3 model for a chatbot interface and a DeepSeek model for a recommendation application (or as in the example in this guide, a combination of a Llama3 model and a smaller Phi4 model)..
 You may choose to locate these 2 models at 2 different L7 url paths and follow the steps described in the [`Getting started`](index.md) guide for each such model as already described.
 However you may also need to serve multiple models located at the same L7 url path and rely on parsing information such as the Model name in the LLM prompt requests as defined in the OpenAI API format which is commonly used by most models.
 For such Model-aware routing, you can use the Body-Based Routing feature as described in this guide.
+In addition, multiple [Low-Rank Adaptations (LoRAs)](https://www.ibm.com/think/topics/lora) can be defined for each base AI model. LoRAs defined for the same base model are served by the same backend inference server that serves the base model. A LoRA name is specified as the model name in the body of LLM prompt requests. LoRA naming is not standardised; therefore, the base model name cannot be inferred from the LoRA name.
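+
+For illustration, here is a minimal sketch of what BBR contributes (the model and LoRA names are the ones used later in this guide): the client sends a standard OpenAI-style request, and BBR copies the `model` field from the request body into the `X-Gateway-Model-Name` header, so that ordinary HTTPRoute header matchers can route on it.
+
+```bash
+# The "model" field may name a base model or a LoRA; BBR does not care which.
+curl -X POST -i ${IP}:${PORT}/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "food-review",
+    "prompt": "Write as if you were a food critic",
+    "max_tokens": 20
+  }'
+# After BBR inspects the body, the request carries the header
+#   X-Gateway-Model-Name: food-review
+# which the HTTPRoute rules shown later in this guide match on.
+```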
+
 ## How
 The following diagram illustrates how an Inference Gateway routes requests to different models based on the model name.
@@ -13,7 +15,7 @@ The model name is extracted by [Body-Based routing](https://github.com/kubernete
 
 ### Example Model-Aware Routing using Body-Based Routing (BBR)
 
-This guide assumes you have already setup the cluster for basic model serving as described in the [`Getting started`](index.md) guide and this guide describes the additional steps needed from that point onwards in order to deploy and exercise an example of routing across multiple models.
+This guide assumes you have already set up the cluster for basic model serving as described in the [`Getting started`](index.md) guide. It describes the additional steps needed from that point onwards to deploy and exercise an example of routing across multiple models and multiple LoRAs, with a many-to-one relationship of LoRAs to base models.
 
 ### Deploy Body-Based Routing Extension
@@ -59,7 +61,7 @@ kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-i
 Once this is installed, and after allowing for model download and startup time which can last several minutes, verify that the pod with this 2nd LLM phi4-mini, is running without errors using the command `kubectl get pods`.
 
 ### Deploy the 2nd InferencePool and Endpoint Picker Extension
-We also want to use an InferencePool and EndPoint Picker for this second model in addition to the Body Based Router in order to be able to schedule across multiple endpoints or LORA adapters within each base model. Hence we create these for our second model as follows.
+We also want to use an InferencePool and Endpoint Picker for this second model, in addition to the Body-Based Router, in order to schedule across multiple endpoints.
 
 === "GKE"
@@ -242,3 +244,296 @@ kubectl get httproute llm-phi4-route -o yaml
   }'
 ```
 
+### Serving multiple LoRAs per base AI model
+
+> **⚠️ Known Limitation:** LoRA names must be unique across base AI models (i.e., across the backend inference server deployments).
+
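+Where do LoRA names come from? They are registered by the serving deployment itself. As a sketch, for a real (non-simulated) vLLM server the adapters are typically registered at startup via vLLM's LoRA flags, and the names given there are exactly the names clients put in the `model` field (the adapter path below is hypothetical):
+
+```bash
+# Sketch only: a real vLLM server registering one LoRA adapter at startup.
+# "food-review-1" becomes the model name that BBR extracts and routes on.
+vllm serve meta-llama/Llama-3.1-8B-Instruct \
+  --enable-lora \
+  --max-loras 2 \
+  --lora-modules food-review-1=/path/to/food-review-1-adapter
+```
+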
+Deploy the third base model, which is used to demonstrate a multiple-LoRA configuration per base model. The example uses a vLLM simulator, since this is the least-common-denominator configuration that can run in every environment. The model, `deepseek/vllm-deepseek-r1`, will be served from the same `/` L7 path as in the previous examples.
+
+```bash
+kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/sim-deployment-1.yaml
+```
+
+Once this is installed, verify that the `vllm-deepseek-r1` pod is running without errors using the command `kubectl get pods`.
+
+### Deploy the 3rd InferencePool and Endpoint Picker Extension
+We also want to use an InferencePool and Endpoint Picker for this third model.
+
+=== "GKE"
+
+    ```bash
+    export GATEWAY_PROVIDER=gke
+    helm install vllm-deepseek-r1 \
+      --set inferencePool.modelServers.matchLabels.app=vllm-deepseek-r1 \
+      --set provider.name=$GATEWAY_PROVIDER \
+      --version $IGW_CHART_VERSION \
+      oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
+    ```
+
+=== "Istio"
+
+    ```bash
+    export GATEWAY_PROVIDER=istio
+    helm install vllm-deepseek-r1 \
+      --set inferencePool.modelServers.matchLabels.app=vllm-deepseek-r1 \
+      --set provider.name=$GATEWAY_PROVIDER \
+      --version $IGW_CHART_VERSION \
+      oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
+    ```
+
+### Configure HTTPRoute
+
+Now configure new HTTPRoutes for the two models and their LoRAs that we want to serve via BBR, using the following command, which configures all three routes.
+
+```bash
+kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/bbr-example/httproute_bbr_lora.yaml
+```
+
+Examine this manifest (listed below for convenience) to see how the `X-Gateway-Model-Name` header is used in the header matches of the Gateway's rules to route requests to the correct backend based on the model name. Note that the manifest uses two different ways of defining routes to LoRAs: (1) adding match clauses to the base model's HTTPRoute, or (2) defining separate HTTPRoutes. There is no functional difference between the two methods, apart from the limitation on the number of matchers per route imposed by the Gateway API.
+
+> **⚠️ Known Limitation:** [The Kubernetes Gateway API limits the total number of matchers per HTTPRoute to fewer than 128](https://github.com/kubernetes-sigs/gateway-api/blob/df8c96c254e1ac6d5f5e0d70617f36143723d479/apis/v1/httproute_types.go#L128).
+
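+If you keep adding LoRA matchers to a single HTTPRoute, you can count how many matchers a route currently carries before hitting that limit. A small sketch using `jq` (the route name is one of the routes defined below):
+
+```bash
+# Sum the number of match clauses across all rules of the route.
+kubectl get httproute llm-deepseek-route -o json \
+  | jq '[.spec.rules[].matches | length] | add'
+```
+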
+```yaml
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: llm-llama-route
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  rules:
+  - backendRefs:
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: vllm-llama3-8b-instruct
+    matches:
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        # Body-Based routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is used to copy the model name from the request body to this header.
+        name: X-Gateway-Model-Name
+        value: 'meta-llama/Llama-3.1-8B-Instruct'
+    timeouts:
+      request: 300s
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: llm-deepseek-route # give this HTTPRoute any name that helps you group and track the matchers
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  rules:
+  - backendRefs:
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: vllm-deepseek-r1
+    matches:
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        # Body-Based routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is used to copy the model name from the request body to this header.
+        name: X-Gateway-Model-Name
+        value: 'deepseek/vllm-deepseek-r1'
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        # Body-Based routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is used to copy the model name from the request body to this header.
+        name: X-Gateway-Model-Name
+        value: 'food-review'
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        # Body-Based routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is used to copy the model name from the request body to this header.
+        name: X-Gateway-Model-Name
+        value: 'movie-critique'
+    timeouts:
+      request: 300s
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: vllm-llama3-8b-instruct-lora-food-review-1 # give this HTTPRoute any name that helps you to group and track the routes
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  rules:
+  - backendRefs:
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: vllm-llama3-8b-instruct
+    matches:
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        # Body-Based routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is used to copy the model name from the request body to this header.
+        name: X-Gateway-Model-Name
+        value: 'food-review-1' # this is the name of the LoRA as defined in the vLLM deployment
+    timeouts:
+      request: 300s
+```
+
+Before testing the setup, confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True` for all three routes using the following commands.
+
+```bash
+kubectl get httproute llm-llama-route -o yaml
+```
+
+```bash
+kubectl get httproute llm-deepseek-route -o yaml
+```
+
+```bash
+kubectl get httproute vllm-llama3-8b-instruct-lora-food-review-1 -o yaml
+```
+
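+If `${IP}` and `${PORT}` are no longer exported in your shell, re-derive them as in the [`Getting started`](index.md) guide. A minimal sketch, assuming the Gateway is named `inference-gateway` and listens on port 80:
+
+```bash
+# The Gateway address as published in its status (may be a hostname on some providers).
+IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
+PORT=80
+```
+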
+### Try it out
+
+=== "Chat Completions API"
+
+    1. Send a few requests to the Llama model to test that it works as before, as follows:
+
+        ```bash
+        curl -X POST -i ${IP}:${PORT}/v1/chat/completions \
+          -H "Content-Type: application/json" \
+          -d '{
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "max_tokens": 100,
+            "temperature": 0,
+            "messages": [
+              {
+                "role": "developer",
+                "content": "You are a helpful assistant."
+              },
+              {
+                "role": "user",
+                "content": "Linux is said to be an open source kernel because "
+              }
+            ]
+          }'
+        ```
+
+    2. Send a few requests to the LoRA of the Llama model as follows:
+
+        ```bash
+        curl -X POST -i ${IP}:${PORT}/v1/chat/completions \
+          -H "Content-Type: application/json" \
+          -d '{
+            "model": "food-review-1",
+            "max_tokens": 100,
+            "temperature": 0,
+            "messages": [
+              {
+                "role": "reviewer",
+                "content": "You are a helpful assistant."
+              },
+              {
+                "role": "user",
+                "content": "Write a review of the best restaurants in San Francisco"
+              }
+            ]
+          }'
+        ```
+
+    3. Send a few requests to one LoRA of the DeepSeek model as follows:
+
+        ```bash
+        curl -X POST -i ${IP}:${PORT}/v1/chat/completions \
+          -H "Content-Type: application/json" \
+          -d '{
+            "model": "movie-critique",
+            "max_tokens": 100,
+            "temperature": 0,
+            "messages": [
+              {
+                "role": "reviewer",
+                "content": "You are a helpful assistant."
+              },
+              {
+                "role": "user",
+                "content": "The best movies of 2025 are"
+              }
+            ]
+          }'
+        ```
+
+    4. Send a few requests to another LoRA of the DeepSeek model as follows:
+
+        ```bash
+        curl -X POST -i ${IP}:${PORT}/v1/chat/completions \
+          -H "Content-Type: application/json" \
+          -d '{
+            "model": "food-review",
+            "max_tokens": 100,
+            "temperature": 0,
+            "messages": [
+              {
+                "role": "reviewer",
+                "content": "You are a helpful assistant."
+              },
+              {
+                "role": "user",
+                "content": "The best restaurants of 2025 are"
+              }
+            ]
+          }'
+        ```
+
+=== "Completions API"
+
+    1. Send a few requests to the Llama model's LoRA as follows:
+
+        ```bash
+        curl -X POST -i ${IP}:${PORT}/v1/completions \
+          -H "Content-Type: application/json" \
+          -d '{
+            "model": "food-review-1",
+            "prompt": "Linux is said to be an open source kernel because ",
+            "max_tokens": 100,
+            "temperature": 0
+          }'
+        ```
+
+    2. Send a few requests to the first DeepSeek LoRA as follows:
+
+        ```bash
+        curl -X POST -i ${IP}:${PORT}/v1/completions \
+          -H "Content-Type: application/json" \
+          -d '{
+            "model": "food-review",
+            "prompt": "Write as if you were a food critic",
+            "max_tokens": 20,
+            "temperature": 0
+          }'
+        ```
+
+    3. Send a few requests to the second DeepSeek LoRA as follows:
+
+        ```bash
+        curl -X POST -i ${IP}:${PORT}/v1/completions \
+          -H "Content-Type: application/json" \
+          -d '{
+            "model": "movie-critique",
+            "prompt": "Write as if you were a movie critic",
+            "max_tokens": 20,
+            "temperature": 0
+          }'
+        ```
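+
+To sanity-check which model names a backend actually registers (and therefore which values BBR can place into `X-Gateway-Model-Name`), you can query the backend directly, bypassing the gateway. A minimal sketch, assuming the simulator exposes the standard OpenAI `/v1/models` endpoint:
+
+```bash
+# Port-forward the simulated DeepSeek backend and list the models it serves;
+# the output should include the base model and both of its LoRAs.
+kubectl port-forward deployment/vllm-deepseek-r1 8000:8000 &
+curl -s localhost:8000/v1/models | jq -r '.data[].id'
+# expected: deepseek/vllm-deepseek-r1, food-review, movie-critique
+```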