@@ -20,12 +20,32 @@ import (
2020 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2121)
2222
23- // NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
23+ // InferenceModel is the Schema for the InferenceModels API.
24+ //
25+ // +kubebuilder:object:root=true
26+ // +kubebuilder:subresource:status
27+ // +genclient
28+ type InferenceModel struct {
29+ metav1.TypeMeta `json:",inline"`
30+ metav1.ObjectMeta `json:"metadata,omitempty"`
31+
32+ Spec InferenceModelSpec `json:"spec,omitempty"`
33+ Status InferenceModelStatus `json:"status,omitempty"`
34+ }
35+
36+ // InferenceModelList contains a list of InferenceModel.
37+ //
38+ // +kubebuilder:object:root=true
39+ type InferenceModelList struct {
40+ metav1.TypeMeta `json:",inline"`
41+ metav1.ListMeta `json:"metadata,omitempty"`
42+ Items []InferenceModel `json:"items"`
43+ }
2444
25- // InferenceModelSpec represents a specific model use case. This resource is
45+ // InferenceModelSpec represents the desired state of a specific model use case. This resource is
2646// managed by the "Inference Workload Owner" persona.
2747//
28- // The Inference Workload Owner persona is: a team that trains, verifies, and
48+ // The Inference Workload Owner persona is someone who trains, verifies, and
2949// leverages a large language model from a model frontend, drives the lifecycle
3050// and rollout of new versions of those models, and defines the specific
3151// performance and latency goals for the model. These workloads are
@@ -38,7 +58,7 @@ import (
3858// creation timestamp, will be selected to remain valid. In the event of a race
3959// condition, one will be selected at random.
4060type InferenceModelSpec struct {
41- // The name of the model as the users set in the "model" parameter in the requests.
61+ // ModelName is the name of the model as users set it in the "model" parameter in requests.
4262 // The name should be unique among the workloads that reference the same backend pool.
4363 // This is the parameter that will be used to match the request with. In the future, we may
4464 // allow matching on other request parameters. The other approach to support matching
@@ -47,22 +67,25 @@ type InferenceModelSpec struct {
4767 // This can be done by specifying a target model and setting the weight to zero,
4868 // an error will be returned specifying that no valid target model is found.
4969 //
50- // +optional
5170 // +kubebuilder:validation:MaxLength=253
52- ModelName string `json:"modelName,omitempty"`
53- // Defines how important it is to serve the model compared to other models referencing the same pool.
71+ // +kubebuilder:validation:Required
72+ ModelName string `json:"modelName"`
73+
74+ // Criticality defines how important it is to serve the model compared to other models referencing the same pool.
5475 //
5576 // +optional
5677 // +kubebuilder:default="Default"
5778 Criticality *Criticality `json:"criticality,omitempty"`
58- // Allow multiple versions of a model for traffic splitting.
79+
80+ // TargetModels allow multiple versions of a model for traffic splitting.
5981 // If not specified, the target model name is defaulted to the modelName parameter.
6082 // modelName is often in reference to a LoRA adapter.
6183 //
6284 // +optional
6385 // +kubebuilder:validation:MaxItems=10
6486 TargetModels []TargetModel `json:"targetModels,omitempty"`
65- // Reference to the inference pool, the pool must exist in the same namespace.
87+
88+ // PoolRef is a reference to the inference pool; the pool must exist in the same namespace.
6689 //
6790 // +kubebuilder:validation:Required
6891 PoolRef PoolObjectReference `json:"poolRef"`
@@ -93,39 +116,54 @@ type PoolObjectReference struct {
93116 // +kubebuilder:validation:MinLength=1
94117 // +kubebuilder:validation:MaxLength=253
95118 // +kubebuilder:validation:Required
96- Name string `json:"name,omitempty "`
119+ Name string `json:"name"`
97120}
98121
99- // Defines how important it is to serve the model compared to other models.
122+ // Criticality defines how important it is to serve the model compared to other models.
100123// +kubebuilder:validation:Enum=Critical;Default;Sheddable
101124type Criticality string
102125
103126const (
104- // Most important . Requests to this band will be shed last.
127+ // Critical defines the highest level of criticality. Requests to this band will be shed last.
105128 Critical Criticality = "Critical"
106- // More important than Sheddable, less important than Critical.
107- // Requests in this band will be shed before critical traffic.
108- // +kubebuilder:default=Default
129+
130+ // Default defines the default criticality level and is more important than Sheddable but less
131+ // important than Critical. Requests in this band will be shed before critical traffic.
109132 Default Criticality = "Default"
110- // Least important. Requests to this band will be shed before all other bands.
133+
134+ // Sheddable defines the lowest level of criticality. Requests to this band will be shed before
135+ // all other bands.
111136 Sheddable Criticality = "Sheddable"
112137)
113138
114139// TargetModel represents a deployed model or a LoRA adapter. The
115140// Name field is expected to match the name of the LoRA adapter
116141// (or base model) as it is registered within the model server. Inference
117- // Gateway assumes that the model exists on the model server and is the
142+ // Gateway assumes that the model exists on the model server and it's the
118143// responsibility of the user to validate a correct match. Should a model fail
119- // to exist at request time, the error is processed by the Instance Gateway,
120- // and then emitted on the appropriate InferenceModel object.
144+ // to exist at request time, the error is processed by the Inference Gateway
145+ // and emitted on the appropriate InferenceModel object.
121146type TargetModel struct {
122- // The name of the adapter as expected by the ModelServer.
147+ // Name is the name of the adapter as expected by the ModelServer.
123148 //
124- // +optional
125149 // +kubebuilder:validation:MaxLength=253
126- Name string `json:"name,omitempty"`
150+ // +kubebuilder:validation:Required
151+ Name string `json:"name"`
152+
127153 // Weight is used to determine the proportion of traffic that should be
128- // sent to this target model when multiple versions of the model are specified.
154+ // sent to this model when multiple target models are specified.
155+ //
156+ // Weight defines the proportion of requests forwarded to the specified
157+ // model. This is computed as weight/(sum of all weights in this
158+ // TargetModels list). For non-zero values, there may be some epsilon from
159+ // the exact proportion defined here depending on the precision an
160+ // implementation supports. Weight is not a percentage and the sum of
161+ // weights does not need to equal 100.
162+ //
163+ // If only one model is specified and it has a weight greater than 0, 100%
164+ // of the traffic is forwarded to that model. If weight is set to 0, no
165+ // traffic should be forwarded for this model. If unspecified, weight
166+ // defaults to 1.
129167 //
130168 // +optional
131169 // +kubebuilder:default=1
@@ -140,28 +178,6 @@ type InferenceModelStatus struct {
140178 Conditions []metav1.Condition `json:"conditions,omitempty"`
141179}
142180
143- // +kubebuilder:object:root=true
144- // +kubebuilder:subresource:status
145- // +genclient
146-
147- // InferenceModel is the Schema for the InferenceModels API
148- type InferenceModel struct {
149- metav1.TypeMeta `json:",inline"`
150- metav1.ObjectMeta `json:"metadata,omitempty"`
151-
152- Spec InferenceModelSpec `json:"spec,omitempty"`
153- Status InferenceModelStatus `json:"status,omitempty"`
154- }
155-
156- // +kubebuilder:object:root=true
157-
158- // InferenceModelList contains a list of InferenceModel
159- type InferenceModelList struct {
160- metav1.TypeMeta `json:",inline"`
161- metav1.ListMeta `json:"metadata,omitempty"`
162- Items []InferenceModel `json:"items"`
163- }
164-
165181func init() {
166182 SchemeBuilder.Register(&InferenceModel{}, &InferenceModelList{})
167183}
0 commit comments