diff --git a/Makefile b/Makefile
index f2e6ab913..023ed0e5f 100644
--- a/Makefile
+++ b/Makefile
@@ -298,16 +298,16 @@ live-docs:
docker run --rm -it -p 3000:3000 -v ${PWD}:/docs gaie/mkdocs
.PHONY: apix-ref-docs
-apix-ref-docs:
- crd-ref-docs \
+apix-ref-docs: crd-ref-docs
+ ${CRD_REF_DOCS} \
--source-path=${PWD}/apix/v1alpha2 \
--config=crd-ref-docs.yaml \
--renderer=markdown \
--output-path=${PWD}/site-src/reference/x-spec.md
.PHONY: api-ref-docs
-api-ref-docs:
- crd-ref-docs \
+api-ref-docs: crd-ref-docs
+ ${CRD_REF_DOCS} \
--source-path=${PWD}/api \
--config=crd-ref-docs.yaml \
--renderer=markdown \
@@ -364,6 +364,7 @@ KUBECTL ?= kubectl
KUSTOMIZE ?= $(LOCALBIN)/kustomize
CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen
ENVTEST ?= $(LOCALBIN)/setup-envtest
+CRD_REF_DOCS ?= $(LOCALBIN)/crd-ref-docs
GOLANGCI_LINT = $(LOCALBIN)/golangci-lint
HELM = $(PROJECT_DIR)/bin/helm
YQ = $(PROJECT_DIR)/bin/yq
@@ -374,6 +375,7 @@ GCI = $(LOCALBIN)/gci
KUSTOMIZE_VERSION ?= v5.4.3
CONTROLLER_TOOLS_VERSION ?= v0.16.1
ENVTEST_VERSION ?= release-0.19
+CRD_REF_DOCS_VERSION ?= v0.2.0
GOLANGCI_LINT_VERSION ?= v2.3.0
HELM_VERSION ?= v3.17.1
KUBECTL_VALIDATE_VERSION ?= v0.0.4
@@ -390,6 +392,11 @@ controller-gen: $(CONTROLLER_GEN) ## Download controller-gen locally if necessar
$(CONTROLLER_GEN): $(LOCALBIN)
$(call go-install-tool,$(CONTROLLER_GEN),sigs.k8s.io/controller-tools/cmd/controller-gen,$(CONTROLLER_TOOLS_VERSION))
+.PHONY: crd-ref-docs
+crd-ref-docs: $(CRD_REF_DOCS) ## Download crd-ref-docs locally if necessary.
+$(CRD_REF_DOCS): $(LOCALBIN)
+ $(call go-install-tool,$(CRD_REF_DOCS),github.com/elastic/crd-ref-docs,$(CRD_REF_DOCS_VERSION))
+
.PHONY: envtest
envtest: $(ENVTEST) ## Download setup-envtest locally if necessary.
$(ENVTEST): $(LOCALBIN)
diff --git a/site-src/reference/spec.md b/site-src/reference/spec.md
index 4260597f2..9b31f4470 100644
--- a/site-src/reference/spec.md
+++ b/site-src/reference/spec.md
@@ -15,23 +15,6 @@ inference.networking.k8s.io API group.
-#### EndpointPickerConfig
-
-
-
-EndpointPickerConfig specifies the configuration needed by the proxy to discover and connect to the endpoint picker extension.
-This type is intended to be a union of mutually exclusive configuration options that we may add in the future.
-
-
-
-_Appears in:_
-- [InferencePoolSpec](#inferencepoolspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `extensionRef` _[Extension](#extension)_ | Extension configures an endpoint picker as an extension service. | | Required: \{\}
|
-
-
#### Extension
@@ -41,34 +24,17 @@ Extension specifies how to configure an extension that runs the endpoint picker.
_Appears in:_
-- [EndpointPickerConfig](#endpointpickerconfig)
- [InferencePoolSpec](#inferencepoolspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
-| `group` _[Group](#group)_ | Group is the group of the referent.
The default value is "", representing the Core API group. | | MaxLength: 253
Pattern: `^$\|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
|
+| `group` _[Group](#group)_ | Group is the group of the referent.
The default value is "", representing the Core API group. | | MaxLength: 253
MinLength: 0
Pattern: `^$\|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
|
| `kind` _[Kind](#kind)_ | Kind is the Kubernetes resource kind of the referent.
Defaults to "Service" when not specified.
ExternalName services can refer to CNAME DNS records that may live
outside of the cluster and as such are difficult to reason about in
terms of conformance. They also may not be safe to forward to (see
CVE-2021-25740 for more information). Implementations MUST NOT
support ExternalName Services. | Service | MaxLength: 63
MinLength: 1
Pattern: `^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$`
|
-| `name` _[ObjectName](#objectname)_ | Name is the name of the referent. | | MaxLength: 253
MinLength: 1
Required: \{\}
|
+| `name` _[ObjectName](#objectname)_ | Name is the name of the referent. | | MaxLength: 253
MinLength: 1
|
| `portNumber` _[PortNumber](#portnumber)_ | The port number on the service running the extension. When unspecified,
implementations SHOULD infer a default value of 9002 when the Kind is
Service. | | Maximum: 65535
Minimum: 1
|
| `failureMode` _[ExtensionFailureMode](#extensionfailuremode)_ | Configures how the gateway handles the case when the extension is not responsive.
Defaults to failClose. | FailClose | Enum: [FailOpen FailClose]
|
-#### ExtensionConnection
-
-
-
-ExtensionConnection encapsulates options that configures the connection to the extension.
-
-
-
-_Appears in:_
-- [Extension](#extension)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `failureMode` _[ExtensionFailureMode](#extensionfailuremode)_ | Configures how the gateway handles the case when the extension is not responsive.
Defaults to failClose. | FailClose | Enum: [FailOpen FailClose]
|
-
-
#### ExtensionFailureMode
_Underlying type:_ _string_
@@ -81,7 +47,6 @@ _Validation:_
_Appears in:_
- [Extension](#extension)
-- [ExtensionConnection](#extensionconnection)
| Field | Description |
| --- | --- |
@@ -89,29 +54,6 @@ _Appears in:_
| `FailClose` | FailClose specifies that the proxy should drop the request when the Endpoint Picker fails.
|
-#### ExtensionReference
-
-
-
-ExtensionReference is a reference to the extension.
-
-If a reference is invalid, the implementation MUST update the `ResolvedRefs`
-Condition on the InferencePool's status to `status: False`. A 5XX status code MUST be returned
-for the request that would have otherwise been routed to the invalid backend.
-
-
-
-_Appears in:_
-- [Extension](#extension)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `group` _[Group](#group)_ | Group is the group of the referent.
The default value is "", representing the Core API group. | | MaxLength: 253
Pattern: `^$\|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
|
-| `kind` _[Kind](#kind)_ | Kind is the Kubernetes resource kind of the referent.
Defaults to "Service" when not specified.
ExternalName services can refer to CNAME DNS records that may live
outside of the cluster and as such are difficult to reason about in
terms of conformance. They also may not be safe to forward to (see
CVE-2021-25740 for more information). Implementations MUST NOT
support ExternalName Services. | Service | MaxLength: 63
MinLength: 1
Pattern: `^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$`
|
-| `name` _[ObjectName](#objectname)_ | Name is the name of the referent. | | MaxLength: 253
MinLength: 1
Required: \{\}
|
-| `portNumber` _[PortNumber](#portnumber)_ | The port number on the service running the extension. When unspecified,
implementations SHOULD infer a default value of 9002 when the Kind is
Service. | | Maximum: 65535
Minimum: 1
|
-
-
#### Group
_Underlying type:_ _string_
@@ -134,11 +76,11 @@ Invalid values include:
_Validation:_
- MaxLength: 253
+- MinLength: 0
- Pattern: `^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
_Appears in:_
- [Extension](#extension)
-- [ExtensionReference](#extensionreference)
- [ParentGatewayReference](#parentgatewayreference)
@@ -160,7 +102,7 @@ InferencePool is the Schema for the InferencePools API.
| `kind` _string_ | `InferencePool` | | |
| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | |
| `spec` _[InferencePoolSpec](#inferencepoolspec)_ | | | |
-| `status` _[InferencePoolStatus](#inferencepoolstatus)_ | Status defines the observed state of InferencePool. | \{ parent:[map[conditions:[map[lastTransitionTime:1970-01-01T00:00:00Z message:Waiting for controller reason:Pending status:Unknown type:Accepted]] parentRef:map[kind:Status name:default]]] \} | |
+| `status` _[InferencePoolStatus](#inferencepoolstatus)_ | Status defines the observed state of InferencePool. | \{ parent:[map[conditions:[map[lastTransitionTime:1970-01-01T00:00:00Z message:Waiting for controller reason:Pending status:Unknown type:Accepted]] parentRef:map[kind:Status name:default]]] \} | MinProperties: 1
|
@@ -180,9 +122,9 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
-| `selector` _object (keys:[LabelKey](#labelkey), values:[LabelValue](#labelvalue))_ | Selector defines a map of labels to watch model server Pods
that should be included in the InferencePool.
In some cases, implementations may translate this field to a Service selector, so this matches the simple
map used for Service selectors instead of the full Kubernetes LabelSelector type.
If specified, it will be applied to match the model server pods in the same namespace as the InferencePool.
Cross namesoace selector is not supported. | | Required: \{\}
|
-| `targetPortNumber` _integer_ | TargetPortNumber defines the port number to access the selected model server Pods.
The number must be in the range 1 to 65535. | | Maximum: 65535
Minimum: 1
Required: \{\}
|
-| `extensionRef` _[Extension](#extension)_ | Extension configures an endpoint picker as an extension service. | | Required: \{\}
|
+| `selector` _[LabelSelector](#labelselector)_ | Selector determines which Pods are members of this inference pool.
It matches Pods by their labels only within the same namespace; cross-namespace
selection is not supported.
The structure of this LabelSelector is intentionally simple to be compatible
with Kubernetes Service selectors, as some implementations may translate
this configuration into a Service resource. | | |
+| `targetPorts` _[Port](#port) array_ | TargetPorts defines a list of ports that are exposed by this InferencePool.
Currently, the list may only include a single port definition. | | MaxItems: 1
MinItems: 1
|
+| `extensionRef` _[Extension](#extension)_ | Extension configures an endpoint picker as an extension service. | | |
#### InferencePoolStatus
@@ -191,7 +133,8 @@ _Appears in:_
InferencePoolStatus defines the observed state of InferencePool.
-
+_Validation:_
+- MinProperties: 1
_Appears in:_
- [InferencePool](#inferencepool)
@@ -223,7 +166,6 @@ _Validation:_
_Appears in:_
- [Extension](#extension)
-- [ExtensionReference](#extensionreference)
- [ParentGatewayReference](#parentgatewayreference)
@@ -256,9 +198,26 @@ _Validation:_
- MinLength: 1
- Pattern: `^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9]$`
+_Appears in:_
+- [LabelSelector](#labelselector)
+
+
+
+#### LabelSelector
+
+
+
+LabelSelector defines a query for resources based on their labels.
+This simplified version uses only the matchLabels field.
+
+
+
_Appears in:_
- [InferencePoolSpec](#inferencepoolspec)
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `matchLabels` _object (keys:[LabelKey](#labelkey), values:[LabelValue](#labelvalue))_ | matchLabels contains a set of required \{key,value\} pairs.
An object must match every label in this map to be selected.
The matching logic is an AND operation on all entries. | | MaxItems: 64
|
#### LabelValue
@@ -283,7 +242,7 @@ _Validation:_
- Pattern: `^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$`
_Appears in:_
-- [InferencePoolSpec](#inferencepoolspec)
+- [LabelSelector](#labelselector)
@@ -331,7 +290,6 @@ _Validation:_
_Appears in:_
- [Extension](#extension)
-- [ExtensionReference](#extensionreference)
- [ParentGatewayReference](#parentgatewayreference)
@@ -350,7 +308,7 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
-| `group` _[Group](#group)_ | Group is the group of the referent. | gateway.networking.k8s.io | MaxLength: 253
Pattern: `^$\|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
|
+| `group` _[Group](#group)_ | Group is the group of the referent. | gateway.networking.k8s.io | MaxLength: 253
MinLength: 0
Pattern: `^$\|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
|
| `kind` _[Kind](#kind)_ | Kind is kind of the referent. For example "Gateway". | Gateway | MaxLength: 63
MinLength: 1
Pattern: `^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$`
|
| `name` _[ObjectName](#objectname)_ | Name is the name of the referent. | | MaxLength: 253
MinLength: 1
|
| `namespace` _[Namespace](#namespace)_ | Namespace is the namespace of the referent. If not present,
the namespace of the referent is assumed to be the same as
the namespace of the referring object. | | MaxLength: 63
MinLength: 1
Pattern: `^[a-z0-9]([-a-z0-9]*[a-z0-9])?$`
|
@@ -369,8 +327,24 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
-| `parentRef` _[ParentGatewayReference](#parentgatewayreference)_ | GatewayRef indicates the gateway that observed state of InferencePool. | | |
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#condition-v1-meta) array_ | Conditions track the state of the InferencePool.
Known condition types are:
* "Accepted"
* "ResolvedRefs" | [map[lastTransitionTime:1970-01-01T00:00:00Z message:Waiting for controller reason:Pending status:Unknown type:Accepted]] | MaxItems: 8
|
+| `parentRef` _[ParentGatewayReference](#parentgatewayreference)_ | GatewayRef indicates the gateway that observed state of InferencePool. | | |
+
+
+#### Port
+
+
+
+Port defines the network port that will be exposed by this InferencePool.
+
+
+
+_Appears in:_
+- [InferencePoolSpec](#inferencepoolspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `number` _[PortNumber](#portnumber)_ | Number defines the port number to access the selected model server Pods.
The number must be in the range 1 to 65535. | | Maximum: 65535
Minimum: 1
|
#### PortNumber
@@ -385,7 +359,7 @@ _Validation:_
_Appears in:_
- [Extension](#extension)
-- [ExtensionReference](#extensionreference)
+- [Port](#port)
diff --git a/site-src/reference/x-spec.md b/site-src/reference/x-spec.md
index 5791e8df7..bb91323db 100644
--- a/site-src/reference/x-spec.md
+++ b/site-src/reference/x-spec.md
@@ -11,49 +11,11 @@ inference.networking.x-k8s.io API group.
### Resource Types
-- [InferenceModel](#inferencemodel)
+- [InferenceObjective](#inferenceobjective)
- [InferencePool](#inferencepool)
-#### Criticality
-
-_Underlying type:_ _string_
-
-Criticality defines how important it is to serve the model compared to other models.
-Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional (use a pointer), and set no default.
-This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior.
-
-_Validation:_
-- Enum: [Critical Standard Sheddable]
-
-_Appears in:_
-- [InferenceModelSpec](#inferencemodelspec)
-
-| Field | Description |
-| --- | --- |
-| `Critical` | Critical defines the highest level of criticality. Requests to this band will be shed last.
|
-| `Standard` | Standard defines the base criticality level and is more important than Sheddable but less
important than Critical. Requests in this band will be shed before critical traffic.
Most models are expected to fall within this band.
|
-| `Sheddable` | Sheddable defines the lowest level of criticality. Requests to this band will be shed before
all other bands.
|
-
-
-#### EndpointPickerConfig
-
-
-
-EndpointPickerConfig specifies the configuration needed by the proxy to discover and connect to the endpoint picker extension.
-This type is intended to be a union of mutually exclusive configuration options that we may add in the future.
-
-
-
-_Appears in:_
-- [InferencePoolSpec](#inferencepoolspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `extensionRef` _[Extension](#extension)_ | Extension configures an endpoint picker as an extension service. | | Required: \{\}
|
-
-
#### Extension
@@ -63,7 +25,6 @@ Extension specifies how to configure an extension that runs the endpoint picker.
_Appears in:_
-- [EndpointPickerConfig](#endpointpickerconfig)
- [InferencePoolSpec](#inferencepoolspec)
| Field | Description | Default | Validation |
@@ -75,22 +36,6 @@ _Appears in:_
| `failureMode` _[ExtensionFailureMode](#extensionfailuremode)_ | Configures how the gateway handles the case when the extension is not responsive.
Defaults to failClose. | FailClose | Enum: [FailOpen FailClose]
|
-#### ExtensionConnection
-
-
-
-ExtensionConnection encapsulates options that configures the connection to the extension.
-
-
-
-_Appears in:_
-- [Extension](#extension)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `failureMode` _[ExtensionFailureMode](#extensionfailuremode)_ | Configures how the gateway handles the case when the extension is not responsive.
Defaults to failClose. | FailClose | Enum: [FailOpen FailClose]
|
-
-
#### ExtensionFailureMode
_Underlying type:_ _string_
@@ -103,7 +48,6 @@ _Validation:_
_Appears in:_
- [Extension](#extension)
-- [ExtensionConnection](#extensionconnection)
| Field | Description |
| --- | --- |
@@ -111,34 +55,6 @@ _Appears in:_
| `FailClose` | FailClose specifies that the proxy should drop the request when the Endpoint Picker fails.
|
-#### ExtensionReference
-
-
-
-ExtensionReference is a reference to the extension.
-
-Connections to this extension MUST use TLS by default. Implementations MAY
-provide a way to customize this connection to use cleartext, a different
-protocol, or custom TLS configuration.
-
-If a reference is invalid, the implementation MUST update the `ResolvedRefs`
-Condition on the InferencePool's status to `status: False`. A 5XX status code
-MUST be returned for the request that would have otherwise been routed to the
-invalid backend.
-
-
-
-_Appears in:_
-- [Extension](#extension)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `group` _[Group](#group)_ | Group is the group of the referent.
The default value is "", representing the Core API group. | | MaxLength: 253
Pattern: `^$\|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
|
-| `kind` _[Kind](#kind)_ | Kind is the Kubernetes resource kind of the referent.
Defaults to "Service" when not specified.
ExternalName services can refer to CNAME DNS records that may live
outside of the cluster and as such are difficult to reason about in
terms of conformance. They also may not be safe to forward to (see
CVE-2021-25740 for more information). Implementations MUST NOT
support ExternalName Services. | Service | MaxLength: 63
MinLength: 1
Pattern: `^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$`
|
-| `name` _[ObjectName](#objectname)_ | Name is the name of the referent. | | MaxLength: 253
MinLength: 1
Required: \{\}
|
-| `portNumber` _[PortNumber](#portnumber)_ | The port number on the service running the extension. When unspecified,
implementations SHOULD infer a default value of 9002 when the Kind is
Service. | | Maximum: 65535
Minimum: 1
|
-
-
#### Group
_Underlying type:_ _string_
@@ -165,17 +81,16 @@ _Validation:_
_Appears in:_
- [Extension](#extension)
-- [ExtensionReference](#extensionreference)
- [ParentGatewayReference](#parentgatewayreference)
- [PoolObjectReference](#poolobjectreference)
-#### InferenceModel
+#### InferenceObjective
-InferenceModel is the Schema for the InferenceModels API.
+InferenceObjective is the Schema for the InferenceObjectives API.
@@ -184,21 +99,21 @@ InferenceModel is the Schema for the InferenceModels API.
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `apiVersion` _string_ | `inference.networking.x-k8s.io/v1alpha2` | | |
-| `kind` _string_ | `InferenceModel` | | |
+| `kind` _string_ | `InferenceObjective` | | |
| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | |
-| `spec` _[InferenceModelSpec](#inferencemodelspec)_ | | | |
-| `status` _[InferenceModelStatus](#inferencemodelstatus)_ | | | |
+| `spec` _[InferenceObjectiveSpec](#inferenceobjectivespec)_ | | | |
+| `status` _[InferenceObjectiveStatus](#inferenceobjectivestatus)_ | | | |
-#### InferenceModelSpec
+#### InferenceObjectiveSpec
-InferenceModelSpec represents the desired state of a specific model use case. This resource is
+InferenceObjectiveSpec represents the desired state of a specific model use case. This resource is
managed by the "Inference Workload Owner" persona.
The Inference Workload Owner persona is someone that trains, verifies, and
@@ -206,41 +121,39 @@ leverages a large language model from a model frontend, drives the lifecycle
and rollout of new versions of those models, and defines the specific
performance and latency goals for the model. These workloads are
expected to operate within an InferencePool sharing compute capacity with other
-InferenceModels, defined by the Inference Platform Admin.
+InferenceObjectives, defined by the Inference Platform Admin.
-InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool,
+InferenceObjective's modelName (not the ObjectMeta name) is unique for a given InferencePool,
if the name is reused, an error will be shown on the status of a
-InferenceModel that attempted to reuse. The oldest InferenceModel, based on
+InferenceObjective that attempted to reuse. The oldest InferenceObjective, based on
creation timestamp, will be selected to remain valid. In the event of a race
condition, one will be selected at random.
_Appears in:_
-- [InferenceModel](#inferencemodel)
+- [InferenceObjective](#inferenceobjective)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
-| `modelName` _string_ | ModelName is the name of the model as it will be set in the "model" parameter for an incoming request.
ModelNames must be unique for a referencing InferencePool
(names can be reused for a different pool in the same cluster).
The modelName with the oldest creation timestamp is retained, and the incoming
InferenceModel's Ready status is set to false with a corresponding reason.
In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
Names can be reserved without an underlying model configured in the pool.
This can be done by specifying a target model and setting the weight to zero,
an error will be returned specifying that no valid target model is found. | | MaxLength: 256
Required: \{\}
|
-| `criticality` _[Criticality](#criticality)_ | Criticality defines how important it is to serve the model compared to other models referencing the same pool.
Criticality impacts how traffic is handled in resource constrained situations. It handles this by
queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
and the proportionality of fairness will be configurable.
Default values for this field will not be set, to allow for future additions of new field that may 'one of' with this field.
Any implementations that may consume this field may treat an unset value as the 'Standard' range. | | Enum: [Critical Standard Sheddable]
|
-| `targetModels` _[TargetModel](#targetmodel) array_ | TargetModels allow multiple versions of a model for traffic splitting.
If not specified, the target model name is defaulted to the modelName parameter.
modelName is often in reference to a LoRA adapter. | | MaxItems: 10
|
+| `priority` _integer_ | Priority defines how important it is to serve the request compared to other requests in the same pool.
Priority is an integer value that defines the priority of the request.
The higher the value, the more critical the request is; negative values _are_ allowed.
No default value is set for this field, allowing for future additions of new fields that may 'one of' with this field.
However, implementations that consume this field (such as the Endpoint Picker) will treat an unset value as '0'.
Priority is used in flow control, primarily in the event of resource scarcity (requests need to be queued).
All requests will be queued, and flow control will _always_ allow requests of higher priority to be served first.
Fairness is only enforced and tracked between requests of the same priority.
Example: requests with Priority 10 will always be served before
requests with Priority of 0 (the value used if Priority is unset or no InferenceObjective is specified).
Similarly requests with a Priority of -10 will always be served after requests with Priority of 0. | | |
| `poolRef` _[PoolObjectReference](#poolobjectreference)_ | PoolRef is a reference to the inference pool, the pool must exist in the same namespace. | | Required: \{\}
|
-#### InferenceModelStatus
+#### InferenceObjectiveStatus
-InferenceModelStatus defines the observed state of InferenceModel
+InferenceObjectiveStatus defines the observed state of InferenceObjective
_Appears in:_
-- [InferenceModel](#inferencemodel)
+- [InferenceObjective](#inferenceobjective)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
-| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#condition-v1-meta) array_ | Conditions track the state of the InferenceModel.
Known condition types are:
* "Accepted" | [map[lastTransitionTime:1970-01-01T00:00:00Z message:Waiting for controller reason:Pending status:Unknown type:Ready]] | MaxItems: 8
|
+| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#condition-v1-meta) array_ | Conditions track the state of the InferenceObjective.
Known condition types are:
* "Accepted" | [map[lastTransitionTime:1970-01-01T00:00:00Z message:Waiting for controller reason:Pending status:Unknown type:Ready]] | MaxItems: 8
|
#### InferencePool
@@ -253,7 +166,6 @@ InferencePool is the Schema for the InferencePools API.
-
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `apiVersion` _string_ | `inference.networking.x-k8s.io/v1alpha2` | | |
@@ -282,7 +194,7 @@ _Appears in:_
| --- | --- | --- | --- |
| `selector` _object (keys:[LabelKey](#labelkey), values:[LabelValue](#labelvalue))_ | Selector defines a map of labels to watch model server Pods
that should be included in the InferencePool.
In some cases, implementations may translate this field to a Service selector, so this matches the simple
map used for Service selectors instead of the full Kubernetes LabelSelector type.
If specified, it will be applied to match the model server pods in the same namespace as the InferencePool.
Cross namesoace selector is not supported. | | Required: \{\}
|
| `targetPortNumber` _integer_ | TargetPortNumber defines the port number to access the selected model server Pods.
The number must be in the range 1 to 65535. | | Maximum: 65535
Minimum: 1
Required: \{\}
|
-| `extensionRef` _[Extension](#extension)_ | Extension configures an endpoint picker as an extension service. | | Required: \{\}
|
+| `extensionRef` _[Extension](#extension)_ | Extension configures an endpoint picker as an extension service. | | |
#### InferencePoolStatus
@@ -323,7 +235,6 @@ _Validation:_
_Appears in:_
- [Extension](#extension)
-- [ExtensionReference](#extensionreference)
- [ParentGatewayReference](#parentgatewayreference)
- [PoolObjectReference](#poolobjectreference)
@@ -432,7 +343,6 @@ _Validation:_
_Appears in:_
- [Extension](#extension)
-- [ExtensionReference](#extensionreference)
- [ParentGatewayReference](#parentgatewayreference)
- [PoolObjectReference](#poolobjectreference)
@@ -468,11 +378,11 @@ referrer.
_Appears in:_
-- [InferenceModelSpec](#inferencemodelspec)
+- [InferenceObjectiveSpec](#inferenceobjectivespec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
-| `group` _[Group](#group)_ | Group is the group of the referent. | inference.networking.x-k8s.io | MaxLength: 253
Pattern: `^$\|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
|
+| `group` _[Group](#group)_ | Group is the group of the referent. | inference.networking.k8s.io | MaxLength: 253
Pattern: `^$\|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
|
| `kind` _[Kind](#kind)_ | Kind is kind of the referent. For example "InferencePool". | InferencePool | MaxLength: 63
MinLength: 1
Pattern: `^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$`
|
| `name` _[ObjectName](#objectname)_ | Name is the name of the referent. | | MaxLength: 253
MinLength: 1
Required: \{\}
|
@@ -506,30 +416,6 @@ _Validation:_
_Appears in:_
- [Extension](#extension)
-- [ExtensionReference](#extensionreference)
-
-
-
-#### TargetModel
-
-TargetModel represents a deployed model or a LoRA adapter. The
-Name field is expected to match the name of the LoRA adapter
-(or base model) as it is registered within the model server. Inference
-Gateway assumes that the model exists on the model server and it's the
-responsibility of the user to validate a correct match. Should a model fail
-to exist at request time, the error is processed by the Inference Gateway
-and emitted on the appropriate InferenceModel object.
-
-
-
-_Appears in:_
-- [InferenceModelSpec](#inferencemodelspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `name` _string_ | Name is the name of the adapter or base model, as expected by the ModelServer. | | MaxLength: 253
Required: \{\}
|
-| `weight` _integer_ | Weight is used to determine the proportion of traffic that should be
sent to this model when multiple target models are specified.
Weight defines the proportion of requests forwarded to the specified
model. This is computed as weight/(sum of all weights in this
TargetModels list). For non-zero values, there may be some epsilon from
the exact proportion defined here depending on the precision an
implementation supports. Weight is not a percentage and the sum of
weights does not need to equal 100.
If a weight is set for any targetModel, it must be set for all targetModels.
Conversely weights are optional, so long as ALL targetModels do not specify a weight. | | Maximum: 1e+06
Minimum: 1
|
-