Skip to content

Commit 7dc1fd6

Browse files
authored
Updating model name rewrite to be done by header key (#1331)
1 parent d27a716 commit 7dc1fd6

File tree

18 files changed

+55
-377
lines changed

18 files changed

+55
-377
lines changed

apix/v1alpha2/inferenceobjective_types.go

Lines changed: 0 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -75,15 +75,6 @@ type InferenceObjectiveSpec struct {
7575
// +optional
7676
Criticality *Criticality `json:"criticality,omitempty"`
7777

78-
// TargetModels allow multiple versions of a model for traffic splitting.
79-
// If not specified, the target model name is defaulted to the modelName parameter.
80-
// modelName is often in reference to a LoRA adapter.
81-
//
82-
// +optional
83-
// +kubebuilder:validation:MaxItems=10
84-
// +kubebuilder:validation:XValidation:message="Weights should be set for all models, or none of the models.",rule="self.all(model, has(model.weight)) || self.all(model, !has(model.weight))"
85-
TargetModels []TargetModel `json:"targetModels,omitempty"`
86-
8778
// PoolRef is a reference to the inference pool, the pool must exist in the same namespace.
8879
//
8980
// +kubebuilder:validation:Required
@@ -131,39 +122,6 @@ const (
131122
Sheddable Criticality = "Sheddable"
132123
)
133124

134-
// TargetModel represents a deployed model or a LoRA adapter. The
135-
// Name field is expected to match the name of the LoRA adapter
136-
// (or base model) as it is registered within the model server. Inference
137-
// Gateway assumes that the model exists on the model server and it's the
138-
// responsibility of the user to validate a correct match. Should a model fail
139-
// to exist at request time, the error is processed by the Inference Gateway
140-
// and emitted on the appropriate InferenceObjective object.
141-
type TargetModel struct {
142-
// Name is the name of the adapter or base model, as expected by the ModelServer.
143-
//
144-
// +kubebuilder:validation:MaxLength=253
145-
// +kubebuilder:validation:Required
146-
Name string `json:"name"`
147-
148-
// Weight is used to determine the proportion of traffic that should be
149-
// sent to this model when multiple target models are specified.
150-
//
151-
// Weight defines the proportion of requests forwarded to the specified
152-
// model. This is computed as weight/(sum of all weights in this
153-
// TargetModels list). For non-zero values, there may be some epsilon from
154-
// the exact proportion defined here depending on the precision an
155-
// implementation supports. Weight is not a percentage and the sum of
156-
// weights does not need to equal 100.
157-
//
158-
// If a weight is set for any targetModel, it must be set for all targetModels.
159-
// Conversely weights are optional, so long as ALL targetModels do not specify a weight.
160-
//
161-
// +optional
162-
// +kubebuilder:validation:Minimum=1
163-
// +kubebuilder:validation:Maximum=1000000
164-
Weight *int32 `json:"weight,omitempty"`
165-
}
166-
167125
// InferenceObjectiveStatus defines the observed state of InferenceObjective
168126
type InferenceObjectiveStatus struct {
169127
// Conditions track the state of the InferenceObjective.

apix/v1alpha2/zz_generated.deepcopy.go

Lines changed: 0 additions & 27 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

client-go/applyconfiguration/apix/v1alpha2/inferenceobjectivespec.go

Lines changed: 2 additions & 16 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

client-go/applyconfiguration/apix/v1alpha2/targetmodel.go

Lines changed: 0 additions & 48 deletions
This file was deleted.

client-go/applyconfiguration/utils.go

Lines changed: 0 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/inference.networking.x-k8s.io_inferenceobjectives.yaml

Lines changed: 0 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -108,52 +108,6 @@ spec:
108108
required:
109109
- name
110110
type: object
111-
targetModels:
112-
description: |-
113-
TargetModels allow multiple versions of a model for traffic splitting.
114-
If not specified, the target model name is defaulted to the modelName parameter.
115-
modelName is often in reference to a LoRA adapter.
116-
items:
117-
description: |-
118-
TargetModel represents a deployed model or a LoRA adapter. The
119-
Name field is expected to match the name of the LoRA adapter
120-
(or base model) as it is registered within the model server. Inference
121-
Gateway assumes that the model exists on the model server and it's the
122-
responsibility of the user to validate a correct match. Should a model fail
123-
to exist at request time, the error is processed by the Inference Gateway
124-
and emitted on the appropriate InferenceObjective object.
125-
properties:
126-
name:
127-
description: Name is the name of the adapter or base model,
128-
as expected by the ModelServer.
129-
maxLength: 253
130-
type: string
131-
weight:
132-
description: |-
133-
Weight is used to determine the proportion of traffic that should be
134-
sent to this model when multiple target models are specified.
135-
136-
Weight defines the proportion of requests forwarded to the specified
137-
model. This is computed as weight/(sum of all weights in this
138-
TargetModels list). For non-zero values, there may be some epsilon from
139-
the exact proportion defined here depending on the precision an
140-
implementation supports. Weight is not a percentage and the sum of
141-
weights does not need to equal 100.
142-
143-
If a weight is set for any targetModel, it must be set for all targetModels.
144-
Conversely weights are optional, so long as ALL targetModels do not specify a weight.
145-
format: int32
146-
maximum: 1000000
147-
minimum: 1
148-
type: integer
149-
required:
150-
- name
151-
type: object
152-
maxItems: 10
153-
type: array
154-
x-kubernetes-validations:
155-
- message: Weights should be set for all models, or none of the models.
156-
rule: self.all(model, has(model.weight)) || self.all(model, !has(model.weight))
157111
required:
158112
- poolRef
159113
type: object

config/manifests/inferenceobjective.yaml

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -6,9 +6,6 @@ spec:
66
criticality: Standard
77
poolRef:
88
name: vllm-llama3-8b-instruct
9-
targetModels:
10-
- name: food-review-1
11-
weight: 100
129
---
1310
apiVersion: inference.networking.x-k8s.io/v1alpha2
1411
kind: InferenceObjective

config/manifests/regression-testing/inferenceobjective.yaml

Lines changed: 0 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -6,9 +6,6 @@ spec:
66
criticality: Critical
77
poolRef:
88
name: vllm-llama3-8b-instruct
9-
targetModels:
10-
- name: adapter-0
11-
weight: 100
129

1310
---
1411

@@ -20,9 +17,6 @@ spec:
2017
criticality: Critical
2118
poolRef:
2219
name: vllm-llama3-8b-instruct
23-
targetModels:
24-
- name: adapter-1
25-
weight: 100
2620

2721
---
2822

@@ -34,9 +28,6 @@ spec:
3428
criticality: Critical
3529
poolRef:
3630
name: vllm-llama3-8b-instruct
37-
targetModels:
38-
- name: adapter-2
39-
weight: 100
4031

4132
---
4233

@@ -48,9 +39,6 @@ spec:
4839
criticality: Critical
4940
poolRef:
5041
name: vllm-llama3-8b-instruct
51-
targetModels:
52-
- name: adapter-3
53-
weight: 100
5442

5543
---
5644

@@ -62,9 +50,6 @@ spec:
6250
criticality: Critical
6351
poolRef:
6452
name: vllm-llama3-8b-instruct
65-
targetModels:
66-
- name: adapter-4
67-
weight: 100
6853

6954
---
7055

@@ -76,9 +61,6 @@ spec:
7661
criticality: Critical
7762
poolRef:
7863
name: vllm-llama3-8b-instruct
79-
targetModels:
80-
- name: adapter-5
81-
weight: 100
8264

8365
---
8466

@@ -90,9 +72,6 @@ spec:
9072
criticality: Critical
9173
poolRef:
9274
name: vllm-llama3-8b-instruct
93-
targetModels:
94-
- name: adapter-6
95-
weight: 100
9675

9776
---
9877

@@ -104,9 +83,6 @@ spec:
10483
criticality: Critical
10584
poolRef:
10685
name: vllm-llama3-8b-instruct
107-
targetModels:
108-
- name: adapter-7
109-
weight: 100
11086

11187
---
11288

@@ -118,9 +94,6 @@ spec:
11894
criticality: Critical
11995
poolRef:
12096
name: vllm-llama3-8b-instruct
121-
targetModels:
122-
- name: adapter-8
123-
weight: 100
12497

12598
---
12699

@@ -132,9 +105,6 @@ spec:
132105
criticality: Critical
133106
poolRef:
134107
name: vllm-llama3-8b-instruct
135-
targetModels:
136-
- name: adapter-9
137-
weight: 100
138108

139109
---
140110

@@ -146,9 +116,6 @@ spec:
146116
criticality: Critical
147117
poolRef:
148118
name: vllm-llama3-8b-instruct
149-
targetModels:
150-
- name: adapter-10
151-
weight: 100
152119

153120
---
154121

@@ -160,9 +127,6 @@ spec:
160127
criticality: Critical
161128
poolRef:
162129
name: vllm-llama3-8b-instruct
163-
targetModels:
164-
- name: adapter-11
165-
weight: 100
166130

167131
---
168132

@@ -174,9 +138,6 @@ spec:
174138
criticality: Critical
175139
poolRef:
176140
name: vllm-llama3-8b-instruct
177-
targetModels:
178-
- name: adapter-12
179-
weight: 100
180141

181142

182143
---
@@ -189,9 +150,6 @@ spec:
189150
criticality: Critical
190151
poolRef:
191152
name: vllm-llama3-8b-instruct
192-
targetModels:
193-
- name: adapter-13
194-
weight: 100
195153

196154

197155
---
@@ -204,9 +162,6 @@ spec:
204162
criticality: Critical
205163
poolRef:
206164
name: vllm-llama3-8b-instruct
207-
targetModels:
208-
- name: adapter-14
209-
weight: 100
210165

211166
---
212167

0 commit comments

Comments
 (0)