Skip to content

Commit 928e051

Browse files
authored
Modifying Criticality to an int (#1348)
1 parent 8eb1e31 commit 928e051

19 files changed

+80
-97
lines changed

apix/v1alpha2/inferenceobjective_types.go

Lines changed: 12 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -64,16 +64,20 @@ type InferenceObjectiveList struct {
6464
// condition, one will be selected at random.
6565
type InferenceObjectiveSpec struct {
6666

67-
// Criticality defines how important it is to serve the model compared to other models referencing the same pool.
68-
// Criticality impacts how traffic is handled in resource constrained situations. It handles this by
69-
// queuing or rejecting requests of lower criticality. InferenceObjectives of an equivalent Criticality will
70-
// fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
71-
// and the proportionality of fairness will be configurable.
67+
// Criticality defines how important it is to serve the request compared to other requests in the same pool.
68+
// Criticality is an integer value that defines the priority of the request.
69+
// The higher the value, the more critical the request is; negative values _are_ allowed.
70+
// No default value is set for this field, allowing for future additions of new fields that may 'one of' with this field.
71+
// However, implementations that consume this field (such as the Endpoint Picker) will treat an unset value as '0'.
72+
// Criticality is used in flow control, primarily in the event of resource scarcity (requests need to be queued).
73+
// All requests will be queued, and flow control will _always_ allow requests of higher criticality to be served first.
74+
// Fairness is only enforced and tracked between requests of the same criticality.
7275
//
73-
// Default values for this field will not be set, to allow for future additions of new field that may 'one of' with this field.
74-
// Any implementations that may consume this field may treat an unset value as the 'Standard' range.
76+
// Example: requests with Criticality 10 will always be served before
77+
// requests with Criticality of 0 (the value used if Criticality is unset or no InferenceObjective is specified).
78+
// Similarly requests with a Criticality of -10 will always be served after requests with Criticality of 0.
7579
// +optional
76-
Criticality *Criticality `json:"criticality,omitempty"`
80+
Criticality *int `json:"criticality,omitempty"`
7781

7882
// PoolRef is a reference to the inference pool, the pool must exist in the same namespace.
7983
//
@@ -102,26 +106,6 @@ type PoolObjectReference struct {
102106
Name ObjectName `json:"name"`
103107
}
104108

105-
// Criticality defines how important it is to serve the model compared to other models.
106-
// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional (use a pointer), and set no default.
107-
// This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior.
108-
// +kubebuilder:validation:Enum=Critical;Standard;Sheddable
109-
type Criticality string
110-
111-
const (
112-
// Critical defines the highest level of criticality. Requests to this band will be shed last.
113-
Critical Criticality = "Critical"
114-
115-
// Standard defines the base criticality level and is more important than Sheddable but less
116-
// important than Critical. Requests in this band will be shed before critical traffic.
117-
// Most models are expected to fall within this band.
118-
Standard Criticality = "Standard"
119-
120-
// Sheddable defines the lowest level of criticality. Requests to this band will be shed before
121-
// all other bands.
122-
Sheddable Criticality = "Sheddable"
123-
)
124-
125109
// InferenceObjectiveStatus defines the observed state of InferenceObjective
126110
type InferenceObjectiveStatus struct {
127111
// Conditions track the state of the InferenceObjective.

apix/v1alpha2/zz_generated.deepcopy.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

client-go/applyconfiguration/apix/v1alpha2/inferenceobjectivespec.go

Lines changed: 2 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/inference.networking.x-k8s.io_inferenceobjectives.yaml

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -70,19 +70,19 @@ spec:
7070
properties:
7171
criticality:
7272
description: |-
73-
Criticality defines how important it is to serve the model compared to other models referencing the same pool.
74-
Criticality impacts how traffic is handled in resource constrained situations. It handles this by
75-
queuing or rejecting requests of lower criticality. InferenceObjectives of an equivalent Criticality will
76-
fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
77-
and the proportionality of fairness will be configurable.
73+
Criticality defines how important it is to serve the request compared to other requests in the same pool.
74+
Criticality is an integer value that defines the priority of the request.
75+
The higher the value, the more critical the request is; negative values _are_ allowed.
76+
No default value is set for this field, allowing for future additions of new fields that may 'one of' with this field.
77+
However, implementations that consume this field (such as the Endpoint Picker) will treat an unset value as '0'.
78+
Criticality is used in flow control, primarily in the event of resource scarcity (requests need to be queued).
79+
All requests will be queued, and flow control will _always_ allow requests of higher criticality to be served first.
80+
Fairness is only enforced and tracked between requests of the same criticality.
7881
79-
Default values for this field will not be set, to allow for future additions of new field that may 'one of' with this field.
80-
Any implementations that may consume this field may treat an unset value as the 'Standard' range.
81-
enum:
82-
- Critical
83-
- Standard
84-
- Sheddable
85-
type: string
82+
Example: requests with Criticality 10 will always be served before
83+
requests with Criticality of 0 (the value used if Criticality is unset or no InferenceObjective is specified).
84+
Similarly requests with a Criticality of -10 will always be served after requests with Criticality of 0.
85+
type: integer
8686
poolRef:
8787
description: PoolRef is a reference to the inference pool, the pool
8888
must exist in the same namespace.

config/manifests/inferenceobjective.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ kind: InferenceObjective
33
metadata:
44
name: food-review
55
spec:
6-
criticality: Standard
6+
criticality: 1
77
poolRef:
88
name: vllm-llama3-8b-instruct
99
---
@@ -12,7 +12,7 @@ kind: InferenceObjective
1212
metadata:
1313
name: base-model
1414
spec:
15-
criticality: Critical
15+
criticality: 2
1616
poolRef:
1717
name: vllm-llama3-8b-instruct
1818
---
@@ -21,6 +21,6 @@ kind: InferenceObjective
2121
metadata:
2222
name: base-model-cpu
2323
spec:
24-
criticality: Critical
24+
criticality: 2
2525
poolRef:
2626
name: vllm-llama3-8b-instruct

config/manifests/regression-testing/inferenceobjective.yaml

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ kind: InferenceObjective
33
metadata:
44
name: adapter-0
55
spec:
6-
criticality: Critical
6+
criticality: 2
77
poolRef:
88
name: vllm-llama3-8b-instruct
99

@@ -14,7 +14,7 @@ kind: InferenceObjective
1414
metadata:
1515
name: adapter-1
1616
spec:
17-
criticality: Critical
17+
criticality: 2
1818
poolRef:
1919
name: vllm-llama3-8b-instruct
2020

@@ -25,7 +25,7 @@ kind: InferenceObjective
2525
metadata:
2626
name: adapter-2
2727
spec:
28-
criticality: Critical
28+
criticality: 2
2929
poolRef:
3030
name: vllm-llama3-8b-instruct
3131

@@ -36,7 +36,7 @@ kind: InferenceObjective
3636
metadata:
3737
name: adapter-3
3838
spec:
39-
criticality: Critical
39+
criticality: 2
4040
poolRef:
4141
name: vllm-llama3-8b-instruct
4242

@@ -47,7 +47,7 @@ kind: InferenceObjective
4747
metadata:
4848
name: adapter-4
4949
spec:
50-
criticality: Critical
50+
criticality: 2
5151
poolRef:
5252
name: vllm-llama3-8b-instruct
5353

@@ -58,7 +58,7 @@ kind: InferenceObjective
5858
metadata:
5959
name: adapter-5
6060
spec:
61-
criticality: Critical
61+
criticality: 2
6262
poolRef:
6363
name: vllm-llama3-8b-instruct
6464

@@ -69,7 +69,7 @@ kind: InferenceObjective
6969
metadata:
7070
name: adapter-6
7171
spec:
72-
criticality: Critical
72+
criticality: 2
7373
poolRef:
7474
name: vllm-llama3-8b-instruct
7575

@@ -80,7 +80,7 @@ kind: InferenceObjective
8080
metadata:
8181
name: adapter-7
8282
spec:
83-
criticality: Critical
83+
criticality: 2
8484
poolRef:
8585
name: vllm-llama3-8b-instruct
8686

@@ -91,7 +91,7 @@ kind: InferenceObjective
9191
metadata:
9292
name: adapter-8
9393
spec:
94-
criticality: Critical
94+
criticality: 2
9595
poolRef:
9696
name: vllm-llama3-8b-instruct
9797

@@ -102,7 +102,7 @@ kind: InferenceObjective
102102
metadata:
103103
name: adapter-9
104104
spec:
105-
criticality: Critical
105+
criticality: 2
106106
poolRef:
107107
name: vllm-llama3-8b-instruct
108108

@@ -113,7 +113,7 @@ kind: InferenceObjective
113113
metadata:
114114
name: adapter-10
115115
spec:
116-
criticality: Critical
116+
criticality: 2
117117
poolRef:
118118
name: vllm-llama3-8b-instruct
119119

@@ -124,7 +124,7 @@ kind: InferenceObjective
124124
metadata:
125125
name: adapter-11
126126
spec:
127-
criticality: Critical
127+
criticality: 2
128128
poolRef:
129129
name: vllm-llama3-8b-instruct
130130

@@ -135,7 +135,7 @@ kind: InferenceObjective
135135
metadata:
136136
name: adapter-12
137137
spec:
138-
criticality: Critical
138+
criticality: 2
139139
poolRef:
140140
name: vllm-llama3-8b-instruct
141141

@@ -147,7 +147,7 @@ kind: InferenceObjective
147147
metadata:
148148
name: adapter-13
149149
spec:
150-
criticality: Critical
150+
criticality: 2
151151
poolRef:
152152
name: vllm-llama3-8b-instruct
153153

@@ -159,7 +159,7 @@ kind: InferenceObjective
159159
metadata:
160160
name: adapter-14
161161
spec:
162-
criticality: Critical
162+
criticality: 2
163163
poolRef:
164164
name: vllm-llama3-8b-instruct
165165

@@ -171,6 +171,6 @@ kind: InferenceObjective
171171
metadata:
172172
name: base-model
173173
spec:
174-
criticality: Critical
174+
criticality: 2
175175
poolRef:
176176
name: vllm-llama3-8b-instruct

conformance/tests/epp_unavailable_fail_open.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ metadata:
66
name: conformance-fake-model-server
77
namespace: gateway-conformance-app-backend
88
spec:
9-
criticality: Critical # Mark it as critical to bypass the saturation check since the model server is fake and don't have such metrics.
9+
criticality: 2 # Mark criticality high enough to bypass the saturation check since the model server is fake and don't have such metrics.
1010
poolRef:
1111
name: secondary-inference-pool
1212
---

conformance/tests/gateway_following_epp_routing.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ metadata:
66
name: conformance-fake-model-server
77
namespace: gateway-conformance-app-backend
88
spec:
9-
criticality: Critical # Mark it as critical to bypass the saturation check since the model server is fake and don't have such metrics.
9+
criticality: 2 # Mark criticality high enough to bypass the saturation check since the model server is fake and don't have such metrics.
1010
poolRef:
1111
name: primary-inference-pool
1212
---

pkg/epp/controller/inferenceobjective_reconciler_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ var (
4343
pool = utiltest.MakeInferencePool("test-pool1").Namespace("ns1").ObjRef()
4444
infObjective1 = utiltest.MakeInferenceObjective("model1").
4545
Namespace(pool.Namespace).
46-
Criticality(v1alpha2.Standard).
46+
Criticality(1).
4747
CreationTimestamp(metav1.Unix(1000, 0)).
4848
PoolName(pool.Name).
4949
PoolGroup("inference.networking.k8s.io").ObjRef()
@@ -55,7 +55,7 @@ var (
5555
PoolGroup("inference.networking.k8s.io").ObjRef()
5656
infObjective1Critical = utiltest.MakeInferenceObjective(infObjective1.Name).
5757
Namespace(infObjective1.Namespace).
58-
Criticality(v1alpha2.Critical).
58+
Criticality(2).
5959
CreationTimestamp(metav1.Unix(1003, 0)).
6060
PoolName(pool.Name).
6161
PoolGroup("inference.networking.k8s.io").ObjRef()
@@ -67,7 +67,7 @@ var (
6767
PoolGroup("inference.networking.k8s.io").ObjRef()
6868
infObjective1DiffGroup = utiltest.MakeInferenceObjective(infObjective1.Name).
6969
Namespace(pool.Namespace).
70-
Criticality(v1alpha2.Standard).
70+
Criticality(1).
7171
CreationTimestamp(metav1.Unix(1005, 0)).
7272
PoolName(pool.Name).
7373
PoolGroup("inference.networking.x-k8s.io").ObjRef()

pkg/epp/datastore/datastore_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ func TestObjective(t *testing.T) {
114114
model2ts := testutil.MakeInferenceObjective("model2").ObjRef()
115115
// Same model name as model1ts, newer timestamp
116116
model1tsCritical := testutil.MakeInferenceObjective("model1").
117-
Criticality(v1alpha2.Critical).ObjRef()
117+
Criticality(2).ObjRef()
118118
// Same object name as model2ts, different model name.
119119
model2chat := testutil.MakeInferenceObjective(model2ts.Name).ObjRef()
120120

0 commit comments

Comments
 (0)