-
Notifications
You must be signed in to change notification settings - Fork 0
Support for multiple LoRAs with multiple models via BBR using convent… #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 3 commits
5ff3c5c
309637b
ba041ea
37e75ad
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Which YAMLs are used by the current guides? Those are not checked in, so you created a new one? The same applies to the other YAMLs in this PR. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: routes-to-llms
spec:
  parentRefs:
    - name: inference-gateway
  rules:
    # Body-Based Routing (BBR) copies the model name from the request body
    # into the X-Gateway-Model-Name header; each rule below matches on that
    # header to pick the InferencePool serving the requested model. See:
    # https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md
    - matches:
        - headers:
            - type: Exact
              name: X-Gateway-Model-Name
              value: meta-llama/Llama-3.1-8B-Instruct
          path:
            type: PathPrefix
            value: /
      backendRefs:
        - name: vllm-llama3-8b-instruct
          group: inference.networking.x-k8s.io
          kind: InferencePool
    - matches:
        - headers:
            - type: Exact
              name: X-Gateway-Model-Name
              value: deepseek/deepseek-r1
          path:
            type: PathPrefix
            value: /
      backendRefs:
        - name: deepseek-r1
          group: inference.networking.x-k8s.io
          kind: InferencePool
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Missing newline at end of file. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-deepseek-r1
spec:
  replicas: 3
  selector:
    matchLabels:
      app: vllm-deepseek-r1
  template:
    metadata:
      labels:
        app: vllm-deepseek-r1
    spec:
      containers:
        # Inference simulator standing in for a real vLLM server; serves the
        # base model plus two LoRA adapters from the same pod.
        - name: vllm-sim
          image: ghcr.io/llm-d/llm-d-inference-sim:v0.3.0
          imagePullPolicy: Always
          args:
            - --model
            - deepseek/deepseek-r1
            - --port
            - "8000"
            - --max-loras
            - "2"
            - --lora-modules
            - '{"name": "food-review"}'
            - '{"name": "movie-critique"}'
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
          ports:
            - containerPort: 8000
              name: http
              protocol: TCP
          resources:
            requests:
              cpu: 10m
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama3-8b-instruct
spec:
  replicas: 3
  selector:
    matchLabels:
      app: vllm-llama3-8b-instruct
  template:
    metadata:
      labels:
        app: vllm-llama3-8b-instruct
    spec:
      containers:
        # Inference simulator standing in for a real vLLM server; serves the
        # base model plus two LoRA adapters from the same pod.
        - name: vllm-sim
          image: ghcr.io/llm-d/llm-d-inference-sim:v0.3.0
          imagePullPolicy: Always
          args:
            - --model
            - meta-llama/Llama-3.1-8B-Instruct
            - --port
            - "8000"
            - --max-loras
            - "2"
            - --lora-modules
            - '{"name": "food-review-1"}'
            - '{"name": "food-review-2"}'
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
          ports:
            - containerPort: 8000
              name: http
              protocol: TCP
          resources:
            requests:
              cpu: 10m
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,8 +17,10 @@ limitations under the License. | |
package handlers | ||
|
||
import ( | ||
"bytes" | ||
"context" | ||
"encoding/json" | ||
"os" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There's an IGW package for interacting with the environment, including setting of defaults, casting to types, etc. |
||
|
||
basepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" | ||
eppb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" | ||
|
@@ -45,6 +47,14 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestBodyBytes []byte) | |
return nil, err | ||
} | ||
|
||
//The reason for this additional unmarshal is that I change the model name and then re-marshal RequestBody struct. But it has only one field, and I need to preserve original message at re-marshalling. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. missing space after |
||
//This can be done more efficiently if a full "official" struct by OpenAI is used. In OpenAI v2 it should be ChatCompletionNewParams | ||
var raw map[string]json.RawMessage | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you could unmarshal from/to |
||
if err := json.Unmarshal(requestBodyBytes, &raw); err != nil { | ||
metrics.RecordModelNotParsedCounter() | ||
return nil, err | ||
} | ||
|
||
if requestBody.Model == "" { | ||
metrics.RecordModelNotInBodyCounter() | ||
logger.V(logutil.DEFAULT).Info("Request body does not contain model parameter") | ||
|
@@ -68,6 +78,48 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestBodyBytes []byte) | |
|
||
metrics.RecordSuccessCounter() | ||
|
||
//Mutate model name if it contains reserved keyword "lora" indicating that the requested model is lora (served from the same vLLM as the base model and from the same inferencepool) | ||
//Convention: [model-family]/<model-name>/lora/<lora-name> | ||
//Model name definition (the vLLM side) does not change: <my-arbitrary-lora-name> | ||
//Model name in request (the client side): lora-name (no change from before) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
loraTag := os.Getenv("LORA_TAG") //set via environment | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it would be preferred, IMO, to make that change in a separate function and not mutate the current implementation. |
||
if loraTag == "" { | ||
loraTag = "lora" | ||
} | ||
|
||
orig := []byte(requestBody.Model) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. might be easier to parse using |
||
prefix := []byte(requestBody.Model) | ||
var suffix []byte | ||
|
||
logger.V(logutil.DEFAULT).Info("Orig: " + string(orig)) | ||
|
||
if idx := bytes.Index(orig, []byte(loraTag)); idx != -1 { | ||
lastSlash := bytes.LastIndex(orig[:idx], []byte("/")) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this assuming adapter name doesn't contain There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, I take everything before |
||
if lastSlash != -1 { | ||
afterTag := orig[idx+len(loraTag):] // slice after "lora" | ||
nextSlash := bytes.Index(afterTag, []byte("/")) // index relative to afterTag | ||
if nextSlash != -1 { | ||
prefix = orig[:lastSlash] // safe: based on orig | ||
// skip the slash itself by adding +1 so suffix doesn't start with '/' | ||
suffix = afterTag[nextSlash+1:] | ||
logger.V(logutil.DEFAULT).Info("Model name after mutation:" + string(suffix)) | ||
requestBody.Model = string(suffix) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you should update the content-length header when changing anything in the content There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point |
||
// update only the "model" field in the original raw map so other fields (e.g. prompt) are preserved | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you explain the convention in the PR message? |
||
modelBytes, merr := json.Marshal(requestBody.Model) | ||
if merr != nil { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why not reuse err instead of merr and merr2? |
||
logger.V(logutil.DEFAULT).Info("failed to marshal new model value: " + merr.Error()) | ||
} else { | ||
raw["model"] = json.RawMessage(modelBytes) | ||
if updatedBodyBytes, merr2 := json.Marshal(raw); merr2 != nil { | ||
logger.V(logutil.DEFAULT).Info("failed to marshal updated request body: " + merr2.Error()) | ||
} else { | ||
requestBodyBytes = updatedBodyBytes | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
if s.streaming { | ||
ret = append(ret, &eppb.ProcessingResponse{ | ||
Response: &eppb.ProcessingResponse_RequestHeaders{ | ||
|
@@ -78,8 +130,9 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestBodyBytes []byte) | |
SetHeaders: []*basepb.HeaderValueOption{ | ||
{ | ||
Header: &basepb.HeaderValue{ | ||
Key: modelHeader, | ||
RawValue: []byte(requestBody.Model), | ||
Key: modelHeader, | ||
//RawValue: []byte(requestBody.Model), | ||
RawValue: prefix, | ||
}, | ||
}, | ||
}, | ||
|
@@ -103,8 +156,9 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestBodyBytes []byte) | |
SetHeaders: []*basepb.HeaderValueOption{ | ||
{ | ||
Header: &basepb.HeaderValue{ | ||
Key: modelHeader, | ||
RawValue: []byte(requestBody.Model), | ||
Key: modelHeader, | ||
//RawValue: []byte(requestBody.Model), | ||
RawValue: prefix, | ||
}, | ||
}, | ||
}, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A few comments: keep `imagePullPolicy: Always` (even when running on kind), since you want newly built images to be pulled down to the worker nodes (into kind, not just the local Docker cache).