@@ -172,22 +254,39 @@ type InferenceModelSpec struct {
   // If not specified, the target model name is defaulted to the ModelName parameter.
   // ModelName is often in reference to a LoRA adapter.
   TargetModels []TargetModel
-  // Reference to the InferencePool that the model registers to. It must exist in the same namespace.
-  PoolReference *LocalObjectReference
+  // PoolRef is a reference to the inference pool, the pool must exist in the same namespace.
+  PoolRef PoolObjectReference
+}
+
+// PoolObjectReference identifies an API object within the namespace of the
+// referrer.
+type PoolObjectReference struct {
+  // Group is the group of the referent.
+  Group Group
+
+  // Kind is kind of the referent. For example "InferencePool".
+  Kind Kind
+
+  // Name is the name of the referent.
+  Name ObjectName
 }
 
 // Defines how important it is to serve the model compared to other models.
 // Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field should ALWAYS be optional (use a pointer), and set no default.
 // This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior.
 type Criticality string
 const (
-  // Most important. Requests to this band will be shed last.
-  Critical Criticality = "Critical"
-  // More important than Sheddable, less important than Critical.
-  // Requests in this band will be shed before critical traffic.
-  Default Criticality = "Default"
-  // Least important. Requests to this band will be shed before all other bands.
-  Sheddable Criticality = "Sheddable"
+  // Critical defines the highest level of criticality. Requests to this band will be shed last.
+  Critical Criticality = "Critical"
+
+  // Standard defines the base criticality level and is more important than Sheddable but less
+  // important than Critical. Requests in this band will be shed before critical traffic.
+  // Most models are expected to fall within this band.
+  Standard Criticality = "Standard"
+
+  // Sheddable defines the lowest level of criticality. Requests to this band will be shed before
+  // all other bands.
+  Sheddable Criticality = "Sheddable"
 )
 
 // TargetModel represents a deployed model or a LoRA adapter. The
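Because any reference to Criticality must stay an optional pointer with no default, each consumer of the API decides for itself how to resolve an unset value. A minimal sketch of that pattern, assuming the enclosing InferenceModelSpec declares a `Criticality *Criticality` field above this hunk, and choosing, for illustration only, to treat nil as Standard (the band most models are expected to fall within):

```go
// effectiveCriticality resolves an optional criticality for one consumer.
// The nil-handling policy here (fall back to Standard) is an illustrative
// assumption of this sketch, not a default mandated by the API.
func effectiveCriticality(c *Criticality) Criticality {
  if c == nil {
    return Standard
  }
  return *c
}
```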
@@ -200,64 +299,62 @@ const (
 type TargetModel struct {
   // The name of the adapter as expected by the ModelServer.
   Name string
-  // Weight is used to determine the percentage of traffic that should be
+  // Weight is used to determine the percentage of traffic that should be
   // sent to this target model when multiple versions of the model are specified.
-  Weight *int
+  Weight *int32
 }
 
-// LocalObjectReference identifies an API object within the namespace of the
-// referrer.
-type LocalObjectReference struct {
-  // Group is the group of the referent.
-  Group Group
-
-  // Kind is kind of the referent. For example "InferencePool".
-  Kind Kind
-
-  // Name is the name of the referent.
-  Name ObjectName
+// InferenceModelStatus defines the observed state of InferenceModel
+type InferenceModelStatus struct {
+  // Conditions track the state of the InferenceModel.
+  Conditions []metav1.Condition
 }
-
 ```
 
 ### Yaml Examples
 
 #### InferencePool(s)
 Here we create a pool that selects the appropriate pods
 ```yaml
-apiVersion: inference.x-k8s.io/v1alpha1
+apiVersion: inference.x-k8s.io/v1alpha2
 kind: InferencePool
 metadata:
   name: base-model-pool
-  modelServerSelector:
-  - app: llm-server
+spec:
+  selector:
+    app: llm-server
+  targetPortNumber: 8080
+  extensionRef:
+    name: infra-backend-v1-app
 ```
 
 #### InferenceModel
 
 Here we consume the pool with two InferenceModels, where `sql-code-assist` is both the name of the model and the name of the LoRA adapter on the model server, and `npc-bot` has a layer of indirection for those names, as well as a specified criticality. Both `sql-code-assist` and `npc-bot` have available LoRA adapters on the InferencePool, and routing to each InferencePool happens earlier (at the K8s Gateway).
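A sketch of how these two manifests could look against the v1alpha2 API above; the adapter names, the weights, and the choice of `Critical` for `npc-bot` are illustrative assumptions, only the field names come from the spec:

```yaml
apiVersion: inference.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: sql-code-assist
spec:
  modelName: sql-code-assist   # also the adapter name, so no targetModels needed
  poolRef:
    name: base-model-pool
---
apiVersion: inference.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: npc-bot
spec:
  modelName: npc-bot
  criticality: Critical        # illustrative; any Criticality value could be set
  poolRef:
    name: base-model-pool
  targetModels:                # indirection: adapter names differ from modelName
  - name: npc-bot-v1           # hypothetical adapter names and weights
    weight: 60
  - name: npc-bot-v2
    weight: 40
```

Omitting `targetModels` on `sql-code-assist` relies on the defaulting noted in the spec comments: the target model name falls back to the ModelName parameter.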