Skip to content

Commit 928e051

Browse files
authored
Modifying Criticality to an int (#1348)
1 parent 8eb1e31 commit 928e051

19 files changed

+80
-97
lines changed

apix/v1alpha2/inferenceobjective_types.go

Lines changed: 12 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -64,16 +64,20 @@ type InferenceObjectiveList struct {
6464
// condition, one will be selected at random.
6565
type InferenceObjectiveSpec struct {
6666

67-
// Criticality defines how important it is to serve the model compared to other models referencing the same pool.
68-
// Criticality impacts how traffic is handled in resource constrained situations. It handles this by
69-
// queuing or rejecting requests of lower criticality. InferenceObjectives of an equivalent Criticality will
70-
// fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
71-
// and the proportionality of fairness will be configurable.
67+
// Criticality defines how important it is to serve the request compared to other requests in the same pool.
68+
// Criticality is an integer value that defines the priority of the request.
69+
// The higher the value, the more critical the request is; negative values _are_ allowed.
70+
// No default value is set for this field, allowing for future additions of new fields that may 'one of' with this field.
71+
// However, implementations that consume this field (such as the Endpoint Picker) will treat an unset value as '0'.
72+
// Criticality is used in flow control, primarily in the event of resource scarcity (requests need to be queued).
73+
// All requests will be queued, and flow control will _always_ allow requests of higher criticality to be served first.
74+
// Fairness is only enforced and tracked between requests of the same criticality.
7275
//
73-
// Default values for this field will not be set, to allow for future additions of new field that may 'one of' with this field.
74-
// Any implementations that may consume this field may treat an unset value as the 'Standard' range.
76+
// Example: requests with Criticality 10 will always be served before
77+
// requests with Criticality of 0 (the value used if Criticality is unset or no InferenceObjective is specified).
78+
// Similarly requests with a Criticality of -10 will always be served after requests with Criticality of 0.
7579
// +optional
76-
Criticality *Criticality `json:"criticality,omitempty"`
80+
Criticality *int `json:"criticality,omitempty"`
7781

7882
// PoolRef is a reference to the inference pool, the pool must exist in the same namespace.
7983
//
@@ -102,26 +106,6 @@ type PoolObjectReference struct {
102106
Name ObjectName `json:"name"`
103107
}
104108

105-
// Criticality defines how important it is to serve the model compared to other models.
106-
// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional (use a pointer), and set no default.
107-
// This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior.
108-
// +kubebuilder:validation:Enum=Critical;Standard;Sheddable
109-
type Criticality string
110-
111-
const (
112-
// Critical defines the highest level of criticality. Requests to this band will be shed last.
113-
Critical Criticality = "Critical"
114-
115-
// Standard defines the base criticality level and is more important than Sheddable but less
116-
// important than Critical. Requests in this band will be shed before critical traffic.
117-
// Most models are expected to fall within this band.
118-
Standard Criticality = "Standard"
119-
120-
// Sheddable defines the lowest level of criticality. Requests to this band will be shed before
121-
// all other bands.
122-
Sheddable Criticality = "Sheddable"
123-
)
124-
125109
// InferenceObjectiveStatus defines the observed state of InferenceObjective
126110
type InferenceObjectiveStatus struct {
127111
// Conditions track the state of the InferenceObjective.

apix/v1alpha2/zz_generated.deepcopy.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

client-go/applyconfiguration/apix/v1alpha2/inferenceobjectivespec.go

Lines changed: 2 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/inference.networking.x-k8s.io_inferenceobjectives.yaml

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -70,19 +70,19 @@ spec:
7070
properties:
7171
criticality:
7272
description: |-
73-
Criticality defines how important it is to serve the model compared to other models referencing the same pool.
74-
Criticality impacts how traffic is handled in resource constrained situations. It handles this by
75-
queuing or rejecting requests of lower criticality. InferenceObjectives of an equivalent Criticality will
76-
fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
77-
and the proportionality of fairness will be configurable.
73+
Criticality defines how important it is to serve the request compared to other requests in the same pool.
74+
Criticality is an integer value that defines the priority of the request.
75+
The higher the value, the more critical the request is; negative values _are_ allowed.
76+
No default value is set for this field, allowing for future additions of new fields that may 'one of' with this field.
77+
However, implementations that consume this field (such as the Endpoint Picker) will treat an unset value as '0'.
78+
Criticality is used in flow control, primarily in the event of resource scarcity (requests need to be queued).
79+
All requests will be queued, and flow control will _always_ allow requests of higher criticality to be served first.
80+
Fairness is only enforced and tracked between requests of the same criticality.
7881
79-
Default values for this field will not be set, to allow for future additions of new field that may 'one of' with this field.
80-
Any implementations that may consume this field may treat an unset value as the 'Standard' range.
81-
enum:
82-
- Critical
83-
- Standard
84-
- Sheddable
85-
type: string
82+
Example: requests with Criticality 10 will always be served before
83+
requests with Criticality of 0 (the value used if Criticality is unset or no InferenceObjective is specified).
84+
Similarly requests with a Criticality of -10 will always be served after requests with Criticality of 0.
85+
type: integer
8686
poolRef:
8787
description: PoolRef is a reference to the inference pool, the pool
8888
must exist in the same namespace.

config/manifests/inferenceobjective.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ kind: InferenceObjective
33
metadata:
44
name: food-review
55
spec:
6-
criticality: Standard
6+
criticality: 1
77
poolRef:
88
name: vllm-llama3-8b-instruct
99
---
@@ -12,7 +12,7 @@ kind: InferenceObjective
1212
metadata:
1313
name: base-model
1414
spec:
15-
criticality: Critical
15+
criticality: 2
1616
poolRef:
1717
name: vllm-llama3-8b-instruct
1818
---
@@ -21,6 +21,6 @@ kind: InferenceObjective
2121
metadata:
2222
name: base-model-cpu
2323
spec:
24-
criticality: Critical
24+
criticality: 2
2525
poolRef:
2626
name: vllm-llama3-8b-instruct

config/manifests/regression-testing/inferenceobjective.yaml

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ kind: InferenceObjective
33
metadata:
44
name: adapter-0
55
spec:
6-
criticality: Critical
6+
criticality: 2
77
poolRef:
88
name: vllm-llama3-8b-instruct
99

@@ -14,7 +14,7 @@ kind: InferenceObjective
1414
metadata:
1515
name: adapter-1
1616
spec:
17-
criticality: Critical
17+
criticality: 2
1818
poolRef:
1919
name: vllm-llama3-8b-instruct
2020

@@ -25,7 +25,7 @@ kind: InferenceObjective
2525
metadata:
2626
name: adapter-2
2727
spec:
28-
criticality: Critical
28+
criticality: 2
2929
poolRef:
3030
name: vllm-llama3-8b-instruct
3131

@@ -36,7 +36,7 @@ kind: InferenceObjective
3636
metadata:
3737
name: adapter-3
3838
spec:
39-
criticality: Critical
39+
criticality: 2
4040
poolRef:
4141
name: vllm-llama3-8b-instruct
4242

@@ -47,7 +47,7 @@ kind: InferenceObjective
4747
metadata:
4848
name: adapter-4
4949
spec:
50-
criticality: Critical
50+
criticality: 2
5151
poolRef:
5252
name: vllm-llama3-8b-instruct
5353

@@ -58,7 +58,7 @@ kind: InferenceObjective
5858
metadata:
5959
name: adapter-5
6060
spec:
61-
criticality: Critical
61+
criticality: 2
6262
poolRef:
6363
name: vllm-llama3-8b-instruct
6464

@@ -69,7 +69,7 @@ kind: InferenceObjective
6969
metadata:
7070
name: adapter-6
7171
spec:
72-
criticality: Critical
72+
criticality: 2
7373
poolRef:
7474
name: vllm-llama3-8b-instruct
7575

@@ -80,7 +80,7 @@ kind: InferenceObjective
8080
metadata:
8181
name: adapter-7
8282
spec:
83-
criticality: Critical
83+
criticality: 2
8484
poolRef:
8585
name: vllm-llama3-8b-instruct
8686

@@ -91,7 +91,7 @@ kind: InferenceObjective
9191
metadata:
9292
name: adapter-8
9393
spec:
94-
criticality: Critical
94+
criticality: 2
9595
poolRef:
9696
name: vllm-llama3-8b-instruct
9797

@@ -102,7 +102,7 @@ kind: InferenceObjective
102102
metadata:
103103
name: adapter-9
104104
spec:
105-
criticality: Critical
105+
criticality: 2
106106
poolRef:
107107
name: vllm-llama3-8b-instruct
108108

@@ -113,7 +113,7 @@ kind: InferenceObjective
113113
metadata:
114114
name: adapter-10
115115
spec:
116-
criticality: Critical
116+
criticality: 2
117117
poolRef:
118118
name: vllm-llama3-8b-instruct
119119

@@ -124,7 +124,7 @@ kind: InferenceObjective
124124
metadata:
125125
name: adapter-11
126126
spec:
127-
criticality: Critical
127+
criticality: 2
128128
poolRef:
129129
name: vllm-llama3-8b-instruct
130130

@@ -135,7 +135,7 @@ kind: InferenceObjective
135135
metadata:
136136
name: adapter-12
137137
spec:
138-
criticality: Critical
138+
criticality: 2
139139
poolRef:
140140
name: vllm-llama3-8b-instruct
141141

@@ -147,7 +147,7 @@ kind: InferenceObjective
147147
metadata:
148148
name: adapter-13
149149
spec:
150-
criticality: Critical
150+
criticality: 2
151151
poolRef:
152152
name: vllm-llama3-8b-instruct
153153

@@ -159,7 +159,7 @@ kind: InferenceObjective
159159
metadata:
160160
name: adapter-14
161161
spec:
162-
criticality: Critical
162+
criticality: 2
163163
poolRef:
164164
name: vllm-llama3-8b-instruct
165165

@@ -171,6 +171,6 @@ kind: InferenceObjective
171171
metadata:
172172
name: base-model
173173
spec:
174-
criticality: Critical
174+
criticality: 2
175175
poolRef:
176176
name: vllm-llama3-8b-instruct

conformance/tests/epp_unavailable_fail_open.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ metadata:
66
name: conformance-fake-model-server
77
namespace: gateway-conformance-app-backend
88
spec:
9-
criticality: Critical # Mark it as critical to bypass the saturation check since the model server is fake and don't have such metrics.
9+
criticality: 2 # Mark criticality high enough to bypass the saturation check since the model server is fake and don't have such metrics.
1010
poolRef:
1111
name: secondary-inference-pool
1212
---

conformance/tests/gateway_following_epp_routing.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ metadata:
66
name: conformance-fake-model-server
77
namespace: gateway-conformance-app-backend
88
spec:
9-
criticality: Critical # Mark it as critical to bypass the saturation check since the model server is fake and don't have such metrics.
9+
criticality: 2 # Mark criticality high enough to bypass the saturation check since the model server is fake and don't have such metrics.
1010
poolRef:
1111
name: primary-inference-pool
1212
---

pkg/epp/controller/inferenceobjective_reconciler_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ var (
4343
pool = utiltest.MakeInferencePool("test-pool1").Namespace("ns1").ObjRef()
4444
infObjective1 = utiltest.MakeInferenceObjective("model1").
4545
Namespace(pool.Namespace).
46-
Criticality(v1alpha2.Standard).
46+
Criticality(1).
4747
CreationTimestamp(metav1.Unix(1000, 0)).
4848
PoolName(pool.Name).
4949
PoolGroup("inference.networking.k8s.io").ObjRef()
@@ -55,7 +55,7 @@ var (
5555
PoolGroup("inference.networking.k8s.io").ObjRef()
5656
infObjective1Critical = utiltest.MakeInferenceObjective(infObjective1.Name).
5757
Namespace(infObjective1.Namespace).
58-
Criticality(v1alpha2.Critical).
58+
Criticality(2).
5959
CreationTimestamp(metav1.Unix(1003, 0)).
6060
PoolName(pool.Name).
6161
PoolGroup("inference.networking.k8s.io").ObjRef()
@@ -67,7 +67,7 @@ var (
6767
PoolGroup("inference.networking.k8s.io").ObjRef()
6868
infObjective1DiffGroup = utiltest.MakeInferenceObjective(infObjective1.Name).
6969
Namespace(pool.Namespace).
70-
Criticality(v1alpha2.Standard).
70+
Criticality(1).
7171
CreationTimestamp(metav1.Unix(1005, 0)).
7272
PoolName(pool.Name).
7373
PoolGroup("inference.networking.x-k8s.io").ObjRef()

pkg/epp/datastore/datastore_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ func TestObjective(t *testing.T) {
114114
model2ts := testutil.MakeInferenceObjective("model2").ObjRef()
115115
// Same model name as model1ts, newer timestamp
116116
model1tsCritical := testutil.MakeInferenceObjective("model1").
117-
Criticality(v1alpha2.Critical).ObjRef()
117+
Criticality(2).ObjRef()
118118
// Same object name as model2ts, different model name.
119119
model2chat := testutil.MakeInferenceObjective(model2ts.Name).ObjRef()
120120

0 commit comments

Comments
 (0)