diff --git a/.gitignore b/.gitignore
index 82afc2e40..4754b915a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,9 @@ site
 # MacOS generated files
 **/.DS_Store
+
+# archives
+*.gz
+
+# binaries
+istioctl
diff --git a/bbr.Dockerfile b/bbr.Dockerfile
index 36ae378cc..1c294c4c2 100644
--- a/bbr.Dockerfile
+++ b/bbr.Dockerfile
@@ -18,6 +18,7 @@ RUN go mod download
 COPY cmd/bbr ./cmd
 COPY pkg ./pkg
 COPY internal ./internal
+COPY api ./api
 WORKDIR /src/cmd
 RUN go build -o /bbr
diff --git a/config/charts/body-based-routing/values.yaml b/config/charts/body-based-routing/values.yaml
index 0b88dc432..630d0abe4 100644
--- a/config/charts/body-based-routing/values.yaml
+++ b/config/charts/body-based-routing/values.yaml
@@ -4,8 +4,10 @@ bbr:
   image:
     name: bbr
     hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension
-    tag: main
-    pullPolicy: Always
+    #tag: main
+    tag: 309637b-dirty # output of `git describe --tags --dirty --always`
+    #pullPolicy: Always
+    pullPolicy: IfNotPresent # for local development; the image is loaded into the kind cluster rather than pulled from a registry
     port: 9004
     healthCheckPort: 9005
diff --git a/config/manifests/bbr/bbr-multi-model.yaml b/config/manifests/bbr/bbr-multi-model.yaml
new file mode 100644
index 000000000..a12508840
--- /dev/null
+++ b/config/manifests/bbr/bbr-multi-model.yaml
@@ -0,0 +1,33 @@
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: routes-to-llms
+spec:
+  parentRefs:
+  - name: inference-gateway
+  rules:
+  - matches:
+    - headers:
+      - type: Exact
+        # Body-Based Routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) copies the model name from the request body into this header.
+        name: X-Gateway-Model-Name
+        value: 'meta-llama/Llama-3.1-8B-Instruct'
+      path:
+        type: PathPrefix
+        value: /
+    backendRefs:
+    - name: vllm-llama3-8b-instruct
+      group: inference.networking.x-k8s.io
+      kind: InferencePool
+  - matches:
+    - headers:
+      - type: Exact
+        name: X-Gateway-Model-Name
+        value: 'deepseek/deepseek-r1'
+      path:
+        type: PathPrefix
+        value: /
+    backendRefs:
+    - name: deepseek-r1
+      group: inference.networking.x-k8s.io
+      kind: InferencePool
\ No newline at end of file
diff --git a/config/manifests/bbr/multi-model-route.yaml b/config/manifests/bbr/multi-model-route.yaml
new file mode 100644
index 000000000..c20746209
--- /dev/null
+++ b/config/manifests/bbr/multi-model-route.yaml
@@ -0,0 +1,94 @@
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: llm-llama-route
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  rules:
+  - backendRefs:
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: vllm-llama3-8b-instruct
+    matches:
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        # Body-Based Routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) copies the model name from the request body into this header.
+        name: X-Gateway-Model-Name # (1)!
+        value: 'meta-llama/Llama-3.1-8B-Instruct'
+    timeouts:
+      request: 300s
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: llm-deepseek-route
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  rules:
+  - backendRefs:
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: vllm-deepseek-r1
+    matches:
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        # Body-Based Routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) copies the model name from the request body into this header.
+        name: X-Gateway-Model-Name
+        value: 'deepseek/deepseek-r1'
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        # Body-Based Routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) copies the model name from the request body into this header.
+        name: X-Gateway-Model-Name
+        value: 'food-review'
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        # Body-Based Routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) copies the model name from the request body into this header.
+        name: X-Gateway-Model-Name
+        value: 'movie-critique'
+    timeouts:
+      request: 300s
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: vllm-llama3-8b-instruct-lora-food-review-1 # give this HTTPRoute any name that helps you group and track the routes
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  rules:
+  - backendRefs:
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: vllm-llama3-8b-instruct
+    matches:
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        # Body-Based Routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) copies the model name from the request body into this header.
+        name: X-Gateway-Model-Name # (1)!
+        value: 'food-review-1' # the LoRA adapter name as defined in the vLLM deployment
+    timeouts:
+      request: 300s
\ No newline at end of file
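Usage sketch (not part of the diff): with BBR deployed, a client only sets `model` in the request body; BBR copies it into the `X-Gateway-Model-Name` header and the HTTPRoutes above select the InferencePool. The gateway address and the OpenAI-style `/v1/completions` path below are placeholders/assumptions for illustration only.

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Placeholder address: substitute the external IP/port of the inference-gateway.
	gateway := "http://<GATEWAY_IP>/v1/completions"

	// BBR copies "food-review-1" from the body into X-Gateway-Model-Name, so the
	// vllm-llama3-8b-instruct-lora-food-review-1 HTTPRoute above sends this request
	// to the vllm-llama3-8b-instruct InferencePool.
	body := []byte(`{"model": "food-review-1", "prompt": "Write a short restaurant review.", "max_tokens": 100}`)

	resp, err := http.Post(gateway, "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.Status, string(out))
}
```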
diff --git a/config/manifests/vllm/sim-deployment-deepseek.yaml b/config/manifests/vllm/sim-deployment-deepseek.yaml
new file mode 100644
index 000000000..d1cbb5827
--- /dev/null
+++ b/config/manifests/vllm/sim-deployment-deepseek.yaml
@@ -0,0 +1,44 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-deepseek-r1
+spec:
+  replicas: 3
+  selector:
+    matchLabels:
+      app: vllm-deepseek-r1
+  template:
+    metadata:
+      labels:
+        app: vllm-deepseek-r1
+    spec:
+      containers:
+      - name: vllm-sim
+        image: ghcr.io/llm-d/llm-d-inference-sim:v0.4.0
+        imagePullPolicy: Always
+        args:
+        - --model
+        - deepseek/deepseek-r1
+        - --port
+        - "8000"
+        - --max-loras
+        - "2"
+        - --lora-modules
+        - '{"name": "food-review"}'
+        - '{"name": "movie-critique"}'
+        env:
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        ports:
+        - containerPort: 8000
+          name: http
+          protocol: TCP
+        resources:
+          requests:
+            cpu: 10m
diff --git a/config/manifests/vllm/sim-deployment-llama.yaml b/config/manifests/vllm/sim-deployment-llama.yaml
new file mode 100644
index 000000000..666fdbcb5
--- /dev/null
+++ b/config/manifests/vllm/sim-deployment-llama.yaml
@@ -0,0 +1,44 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-llama3-8b-instruct
+spec:
+  replicas: 3
+  selector:
+    matchLabels:
+      app: vllm-llama3-8b-instruct
+  template:
+    metadata:
+      labels:
+        app: vllm-llama3-8b-instruct
+    spec:
+      containers:
+      - name: vllm-sim
+        image: ghcr.io/llm-d/llm-d-inference-sim:v0.4.0
+        imagePullPolicy: Always
+        args:
+        - --model
+        - meta-llama/Llama-3.1-8B-Instruct
+        - --port
+        - "8000"
+        - --max-loras
+        - "2"
+        - --lora-modules
+        - '{"name": "food-review-1"}'
+        - '{"name": "food-review-2"}'
+        env:
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        ports:
+        - containerPort: 8000
+          name: http
+          protocol: TCP
+        resources:
+          requests:
+            cpu: 10m
diff --git a/conformance/scripts/istio/Makefile b/conformance/scripts/istio/Makefile
index 309d8c459..d0d9f1298 100644
--- a/conformance/scripts/istio/Makefile
+++ b/conformance/scripts/istio/Makefile
@@ -2,7 +2,8 @@
 # Example: make all ISTIO_VERSION=1.28.0
 GATEWAY_API_VERSION ?= v1.3.0
 INFERENCE_EXTENSION_VERSION ?= v0.4.0
-ISTIO_VERSION ?= 1.27-alpha.0551127f00634403cddd4634567e65a8ecc499a7
+#ISTIO_VERSION ?= 1.27-alpha.0551127f00634403cddd4634567e65a8ecc499a7
+ISTIO_VERSION ?= 1.27-alpha.e5a2085c2d34299580f6bb2c124752ba4b1aa651
 ISTIO_HUB ?=
 ISTIO_PROFILE ?= minimal
diff --git a/go.mod b/go.mod
index 9fcbf1b99..092fe5984 100644
--- a/go.mod
+++ b/go.mod
@@ -85,6 +85,7 @@ require (
 	github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect
+	github.com/openai/openai-go/v2 v2.4.2 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
 	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
@@ -92,6 +93,10 @@
 	github.com/spf13/cobra v1.9.1 // indirect
 	github.com/spf13/pflag v1.0.7 // indirect
 	github.com/stoewer/go-strcase v1.3.0 // indirect
+	github.com/tidwall/gjson v1.18.0 // indirect
+	github.com/tidwall/match v1.2.0 // indirect
+	github.com/tidwall/pretty v1.2.1 // indirect
+	github.com/tidwall/sjson v1.2.5 // indirect
 	github.com/x448/float16 v0.8.4 // indirect
 	go.opentelemetry.io/auto/sdk v1.1.0 // indirect
 	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect
diff --git a/go.sum b/go.sum
index 1f9a3bb1f..97620ecfe 100644
--- a/go.sum
+++ b/go.sum
@@ -200,6 +200,8 @@ github.com/onsi/ginkgo/v2 v2.25.1 h1:Fwp6crTREKM+oA6Cz4MsO8RhKQzs2/gOIVOUscMAfZY
 github.com/onsi/ginkgo/v2 v2.25.1/go.mod h1:ppTWQ1dh9KM/F1XgpeRqelR+zHVwV81DGRSDnFxK7Sk=
 github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A=
 github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k=
+github.com/openai/openai-go/v2 v2.4.2 h1:TF37Vjq2rX2FmPlnn38rPgfa80V4eKvsmSQz1GeB1M0=
+github.com/openai/openai-go/v2 v2.4.2/go.mod h1:sIUkR+Cu/PMUVkSKhkk742PRURkQOCFhiwJ7eRSBqmk=
 github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ=
 github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU=
 github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
@@ -244,6 +246,17 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO
 github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
 github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
 github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
+github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
+github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY=
+github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
+github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
+github.com/tidwall/match v1.2.0 h1:0pt8FlkOwjN2fPt4bIl4BoNxb98gGHN2ObFEDkrfZnM=
+github.com/tidwall/match v1.2.0/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
+github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
+github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4=
+github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
+github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY=
+github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28=
 github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
 github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
 github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
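Side note on the new dependencies above: `github.com/tidwall/sjson` (added as an indirect dependency alongside `openai-go`) can rewrite a single JSON field while leaving the rest of the payload byte-for-byte intact. A hedged sketch of that alternative follows; it is not what the handler change below implements (the handler re-marshals the body via a raw map).

```go
package main

import (
	"fmt"

	"github.com/tidwall/sjson"
)

func main() {
	body := []byte(`{"model":"meta-llama/Llama-3.1-8B-Instruct/lora/food-review-1","prompt":"hi"}`)

	// Rewrite only the "model" field; every other field in the body is left untouched.
	updated, err := sjson.SetBytes(body, "model", "food-review-1")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(updated))
}
```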
diff --git a/pkg/bbr/handlers/request.go b/pkg/bbr/handlers/request.go
index e7baec6ef..56812fcfe 100644
--- a/pkg/bbr/handlers/request.go
+++ b/pkg/bbr/handlers/request.go
@@ -17,8 +17,10 @@ limitations under the License.
 package handlers
 
 import (
+	"bytes"
 	"context"
 	"encoding/json"
+	"os"
 
 	basepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
 	eppb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
@@ -45,6 +47,14 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestBodyBytes []byte)
 		return nil, err
 	}
 
+	// This additional unmarshal into a raw map is needed because the model name may be rewritten below and the body re-marshalled; RequestBody carries only the model field, so the raw map preserves the rest of the original message.
+	// This could be done more efficiently with the full official OpenAI request struct (ChatCompletionNewParams in openai-go v2).
+	var raw map[string]json.RawMessage
+	if err := json.Unmarshal(requestBodyBytes, &raw); err != nil {
+		metrics.RecordModelNotParsedCounter()
+		return nil, err
+	}
+
 	if requestBody.Model == "" {
 		metrics.RecordModelNotInBodyCounter()
 		logger.V(logutil.DEFAULT).Info("Request body does not contain model parameter")
@@ -68,6 +78,48 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestBodyBytes []byte)
 
 	metrics.RecordSuccessCounter()
 
+	// Mutate the model name if it contains the reserved keyword "lora", indicating that the requested model is a LoRA adapter served by the same vLLM deployment (and the same InferencePool) as its base model.
+	// Convention for the client-side model name: [model-family]/[model-name]/lora/[lora-name]
+	// The model name definition on the vLLM side does not change: the LoRA module is still registered as [lora-name].
+	// The model name in the request (the client side) follows the convention above, e.g. meta-llama/Llama-3.1-8B-Instruct/lora/food-review-1.
+	loraTag := os.Getenv("LORA_TAG") // set via environment
+	if loraTag == "" {
+		loraTag = "lora"
+	}
+
+	orig := []byte(requestBody.Model)
+	prefix := []byte(requestBody.Model)
+	var suffix []byte
+
+	logger.V(logutil.DEFAULT).Info("Orig: " + string(orig))
+
+	if idx := bytes.Index(orig, []byte(loraTag)); idx != -1 {
+		lastSlash := bytes.LastIndex(orig[:idx], []byte("/"))
+		if lastSlash != -1 {
+			afterTag := orig[idx+len(loraTag):]             // slice after "lora"
+			nextSlash := bytes.Index(afterTag, []byte("/")) // index relative to afterTag
+			if nextSlash != -1 {
+				prefix = orig[:lastSlash] // safe: based on orig
+				// skip the slash itself by adding +1 so suffix doesn't start with '/'
+				suffix = afterTag[nextSlash+1:]
+				logger.V(logutil.DEFAULT).Info("Model name after mutation: " + string(suffix))
+				requestBody.Model = string(suffix)
+				// update only the "model" field in the original raw map so other fields (e.g. prompt) are preserved
+				modelBytes, merr := json.Marshal(requestBody.Model)
+				if merr != nil {
+					logger.V(logutil.DEFAULT).Info("failed to marshal new model value: " + merr.Error())
+				} else {
+					raw["model"] = json.RawMessage(modelBytes)
+					if updatedBodyBytes, merr2 := json.Marshal(raw); merr2 != nil {
+						logger.V(logutil.DEFAULT).Info("failed to marshal updated request body: " + merr2.Error())
+					} else {
+						requestBodyBytes = updatedBodyBytes
+					}
+				}
+			}
+		}
+	}
+
 	if s.streaming {
 		ret = append(ret, &eppb.ProcessingResponse{
 			Response: &eppb.ProcessingResponse_RequestHeaders{
@@ -78,8 +130,9 @@
 						SetHeaders: []*basepb.HeaderValueOption{
 							{
 								Header: &basepb.HeaderValue{
-									Key:      modelHeader,
-									RawValue: []byte(requestBody.Model),
+									Key: modelHeader,
+									//RawValue: []byte(requestBody.Model),
+									RawValue: prefix,
 								},
 							},
 						},
@@ -103,8 +156,9 @@
 						SetHeaders: []*basepb.HeaderValueOption{
 							{
 								Header: &basepb.HeaderValue{
-									Key:      modelHeader,
-									RawValue: []byte(requestBody.Model),
+									Key: modelHeader,
+									//RawValue: []byte(requestBody.Model),
+									RawValue: prefix,
 								},
 							},
 						},
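To make the intended model-name mutation in HandleRequestBody easier to verify, here is a minimal, self-contained sketch of the same splitting logic using strings instead of bytes; splitLoRAModel is a hypothetical helper for illustration, not part of the package API.

```go
package main

import (
	"fmt"
	"strings"
)

// splitLoRAModel mirrors the mutation above: if the requested model name embeds the
// reserved tag (default "lora"), everything before the slash preceding the tag becomes
// the routing header value, and everything after the tag's trailing slash becomes the
// model name forwarded to vLLM. Otherwise both values stay equal to the original name.
func splitLoRAModel(model, loraTag string) (header, forwarded string) {
	idx := strings.Index(model, loraTag)
	if idx == -1 {
		return model, model
	}
	lastSlash := strings.LastIndex(model[:idx], "/")
	if lastSlash == -1 {
		return model, model
	}
	afterTag := model[idx+len(loraTag):]
	nextSlash := strings.Index(afterTag, "/")
	if nextSlash == -1 {
		return model, model
	}
	return model[:lastSlash], afterTag[nextSlash+1:]
}

func main() {
	// Expected output:
	//   header:    meta-llama/Llama-3.1-8B-Instruct
	//   forwarded: food-review-1
	h, m := splitLoRAModel("meta-llama/Llama-3.1-8B-Instruct/lora/food-review-1", "lora")
	fmt.Println("header:", h)
	fmt.Println("forwarded:", m)
}
```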