-
Notifications
You must be signed in to change notification settings - Fork 0
Support for multiple LoRAs with multiple models via BBR using convent… #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 3 commits
5ff3c5c
309637b
ba041ea
37e75ad
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Which YAMLs are used by the current guides? Those are not checked in, so you created a new one? The same applies to the other YAMLs in this PR. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: routes-to-llms
spec:
  parentRefs:
    - name: inference-gateway
  rules:
    # Body-Based Routing (BBR) copies the model name from the request body
    # into the X-Gateway-Model-Name header; each rule below matches on that
    # header to pick the InferencePool serving the requested model. See:
    # https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md
    - matches:
        - headers:
            - type: Exact
              name: X-Gateway-Model-Name
              value: meta-llama/Llama-3.1-8B-Instruct
          path:
            type: PathPrefix
            value: /
      backendRefs:
        - name: vllm-llama3-8b-instruct
          group: inference.networking.x-k8s.io
          kind: InferencePool
    - matches:
        - headers:
            - type: Exact
              name: X-Gateway-Model-Name
              value: deepseek/deepseek-r1
          path:
            type: PathPrefix
            value: /
      backendRefs:
        - name: deepseek-r1
          group: inference.networking.x-k8s.io
          kind: InferencePool
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Missing newline at end of file. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-deepseek-r1
spec:
  replicas: 3
  selector:
    matchLabels:
      app: vllm-deepseek-r1
  template:
    metadata:
      labels:
        app: vllm-deepseek-r1
    spec:
      containers:
        # Inference simulator standing in for a real vLLM server; serves the
        # base model plus two LoRA adapters from the same pod.
        - name: vllm-sim
          image: ghcr.io/llm-d/llm-d-inference-sim:v0.3.0
          imagePullPolicy: Always
          args:
            - --model
            - deepseek/deepseek-r1
            - --port
            - "8000"
            - --max-loras
            - "2"
            - --lora-modules
            - '{"name": "food-review"}'
            - '{"name": "movie-critique"}'
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
          ports:
            - containerPort: 8000
              name: http
              protocol: TCP
          resources:
            requests:
              cpu: 10m
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama3-8b-instruct
spec:
  replicas: 3
  selector:
    matchLabels:
      app: vllm-llama3-8b-instruct
  template:
    metadata:
      labels:
        app: vllm-llama3-8b-instruct
    spec:
      containers:
        # Inference simulator standing in for a real vLLM server; serves the
        # base model plus two LoRA adapters from the same pod.
        - name: vllm-sim
          image: ghcr.io/llm-d/llm-d-inference-sim:v0.3.0
          imagePullPolicy: Always
          args:
            - --model
            - meta-llama/Llama-3.1-8B-Instruct
            - --port
            - "8000"
            - --max-loras
            - "2"
            - --lora-modules
            - '{"name": "food-review-1"}'
            - '{"name": "food-review-2"}'
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
          ports:
            - containerPort: 8000
              name: http
              protocol: TCP
          resources:
            requests:
              cpu: 10m
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,8 +17,10 @@ limitations under the License. | |
package handlers | ||
|
||
import ( | ||
"bytes" | ||
"context" | ||
"encoding/json" | ||
"os" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There's an IGW package for interacting with the environment, including setting of defaults, casting to types, etc. |
||
|
||
basepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" | ||
eppb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" | ||
|
@@ -45,6 +47,14 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestBodyBytes []byte) | |
return nil, err | ||
} | ||
|
||
//The reason for this additional unmarshal is that I change the model name and then re-marshal RequestBody struct. But it has only one field, and I need to preserve original message at re-marshalling. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. missing space after |
||
//This can be done more efficiently if a full "official" struct by OpenAI is used. In OpenAI v2 it should be ChatCompletionNewParams | ||
var raw map[string]json.RawMessage | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you could unmarshal from/to |
||
if err := json.Unmarshal(requestBodyBytes, &raw); err != nil { | ||
metrics.RecordModelNotParsedCounter() | ||
return nil, err | ||
} | ||
|
||
if requestBody.Model == "" { | ||
metrics.RecordModelNotInBodyCounter() | ||
logger.V(logutil.DEFAULT).Info("Request body does not contain model parameter") | ||
|
@@ -68,6 +78,48 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestBodyBytes []byte) | |
|
||
metrics.RecordSuccessCounter() | ||
|
||
//Mutate model name if it contains reserved keyword "lora" indicating that the requested model is lora (served from the same vLLM as the base model and from the same inferencepool) | ||
//Convention: [model-family]/<model-name>/lora/<lora-name> | ||
//Model name definition (the vLLM side) does not change: <my-arbitrary-lora-name> | ||
//Model name in request (the client side): lora-name (no change from before) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
loraTag := os.Getenv("LORA_TAG") //set via environment | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it would be preferred, IMO, to make that change in a separate function and not mutate the current implementation. |
||
if loraTag == "" { | ||
loraTag = "lora" | ||
} | ||
|
||
orig := []byte(requestBody.Model) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. might be easier to parse using |
||
prefix := []byte(requestBody.Model) | ||
var suffix []byte | ||
|
||
logger.V(logutil.DEFAULT).Info("Orig: " + string(orig)) | ||
|
||
if idx := bytes.Index(orig, []byte(loraTag)); idx != -1 { | ||
lastSlash := bytes.LastIndex(orig[:idx], []byte("/")) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this assuming adapter name doesn't contain There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, I take everything before |
||
if lastSlash != -1 { | ||
afterTag := orig[idx+len(loraTag):] // slice after "lora" | ||
nextSlash := bytes.Index(afterTag, []byte("/")) // index relative to afterTag | ||
if nextSlash != -1 { | ||
prefix = orig[:lastSlash] // safe: based on orig | ||
// skip the slash itself by adding +1 so suffix doesn't start with '/' | ||
suffix = afterTag[nextSlash+1:] | ||
logger.V(logutil.DEFAULT).Info("Model name after mutation:" + string(suffix)) | ||
requestBody.Model = string(suffix) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you should update the content-length header when changing anything in the content There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point |
||
// update only the "model" field in the original raw map so other fields (e.g. prompt) are preserved | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you explain the convention in the PR message? |
||
modelBytes, merr := json.Marshal(requestBody.Model) | ||
if merr != nil { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why not reuse err instead of merr and merr2? |
||
logger.V(logutil.DEFAULT).Info("failed to marshal new model value: " + merr.Error()) | ||
} else { | ||
raw["model"] = json.RawMessage(modelBytes) | ||
if updatedBodyBytes, merr2 := json.Marshal(raw); merr2 != nil { | ||
logger.V(logutil.DEFAULT).Info("failed to marshal updated request body: " + merr2.Error()) | ||
} else { | ||
requestBodyBytes = updatedBodyBytes | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
if s.streaming { | ||
ret = append(ret, &eppb.ProcessingResponse{ | ||
Response: &eppb.ProcessingResponse_RequestHeaders{ | ||
|
@@ -78,8 +130,9 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestBodyBytes []byte) | |
SetHeaders: []*basepb.HeaderValueOption{ | ||
{ | ||
Header: &basepb.HeaderValue{ | ||
Key: modelHeader, | ||
RawValue: []byte(requestBody.Model), | ||
Key: modelHeader, | ||
//RawValue: []byte(requestBody.Model), | ||
RawValue: prefix, | ||
}, | ||
}, | ||
}, | ||
|
@@ -103,8 +156,9 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestBodyBytes []byte) | |
SetHeaders: []*basepb.HeaderValueOption{ | ||
{ | ||
Header: &basepb.HeaderValue{ | ||
Key: modelHeader, | ||
RawValue: []byte(requestBody.Model), | ||
Key: modelHeader, | ||
//RawValue: []byte(requestBody.Model), | ||
RawValue: prefix, | ||
}, | ||
}, | ||
}, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A few comments: keep `imagePullPolicy: Always` (even when running on kind), since you want newly built images to be pulled down to the worker nodes (into kind, not just the local Docker cache).