Skip to content

Commit 856af6a

Browse files
authored
Update 002-api-proposal/ to reflect api/v1alpha2 inferencePool and InferenceModel (#870)
* Update docs about InferencePool * Update docs about InferenceModel
1 parent 440ca87 commit 856af6a

File tree

1 file changed

+133
-36
lines changed

1 file changed

+133
-36
lines changed

docs/proposals/002-api-proposal/README.md

Lines changed: 133 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -122,13 +122,94 @@ type InferencePool struct {
122122
metav1.ObjectMeta
123123
metav1.TypeMeta
124124

125-
Spec InferencePoolSpec
125+
Spec InferencePoolSpec
126+
Status InferencePoolStatus
126127
}
127128

128129
type InferencePoolSpec struct {
129-
// ModelServerSelector uses label selection to watch model server pods
130+
// Selector defines a map of labels to watch model server pods
130131
// that should be included in the InferencePool.
131-
ModelServerSelector map[string]string `json:"modelServerSelector,omitempty"`
132+
// In some cases, implementations may translate this field to a Service selector, so this matches the simple
133+
// map used for Service selectors instead of the full Kubernetes LabelSelector type.
134+
// If specified, it will be applied to match the model server pods in the same namespace as the InferencePool.
135+
// Cross-namespace selectors are not supported.
136+
Selector map[LabelKey]LabelValue `json:"selector"`
137+
138+
// TargetPortNumber defines the port number to access the selected model servers.
139+
// The number must be in the range 1 to 65535.
140+
TargetPortNumber int32 `json:"targetPortNumber"`
141+
142+
// EndpointPickerConfig specifies the configuration needed by the proxy to discover and connect to the endpoint
143+
// picker service that picks endpoints for the requests routed to this pool.
144+
EndpointPickerConfig `json:",inline"`
145+
}
146+
147+
// EndpointPickerConfig specifies the configuration needed by the proxy to discover and connect to the endpoint picker extension.
148+
// This type is intended to be a union of mutually exclusive configuration options that we may add in the future.
149+
type EndpointPickerConfig struct {
150+
// ExtensionRef configures an endpoint picker as an extension service.
151+
ExtensionRef *Extension `json:"extensionRef,omitempty"`
152+
}
153+
154+
// Extension specifies how to configure an extension that runs the endpoint picker.
155+
type Extension struct {
156+
// Reference is a reference to a service extension.
157+
ExtensionReference `json:",inline"`
158+
159+
// ExtensionConnection configures the connection between the gateway and the extension.
160+
ExtensionConnection `json:",inline"`
161+
}
162+
163+
// ExtensionReference is a reference to the extension deployment.
164+
type ExtensionReference struct {
165+
// Group is the group of the referent.
166+
// The default value is "", representing the Core API group.
167+
Group *Group `json:"group,omitempty"`
168+
169+
// Kind is the Kubernetes resource kind of the referent. For example
170+
// "Service".
171+
//
172+
// Defaults to "Service" when not specified.
173+
//
174+
// ExternalName services can refer to CNAME DNS records that may live
175+
// outside of the cluster and as such are difficult to reason about in
176+
// terms of conformance. They also may not be safe to forward to (see
177+
// CVE-2021-25740 for more information). Implementations MUST NOT
178+
// support ExternalName Services.
179+
Kind *Kind `json:"kind,omitempty"`
180+
181+
// Name is the name of the referent.
182+
Name ObjectName `json:"name"`
183+
184+
// The port number on the service running the extension. When unspecified,
185+
// implementations SHOULD infer a default value of 9002 when the Kind is
186+
// Service.
187+
PortNumber *PortNumber `json:"portNumber,omitempty"`
188+
}
189+
190+
// ExtensionConnection encapsulates options that configure the connection to the extension.
191+
type ExtensionConnection struct {
192+
// Configures how the gateway handles the case when the extension is not responsive.
193+
// Defaults to failClose.
194+
FailureMode *ExtensionFailureMode `json:"failureMode"`
195+
}
196+
197+
// ExtensionFailureMode defines the options for how the gateway handles the case when the extension is not responsive.
198+
type ExtensionFailureMode string
199+
200+
201+
// PoolStatus defines the observed state of InferencePool from a Gateway.
202+
type PoolStatus struct {
203+
// GatewayRef indicates the gateway that observed the state of the InferencePool.
204+
GatewayRef corev1.ObjectReference `json:"parentRef"`
205+
206+
// Conditions track the state of the InferencePool.
207+
//
208+
// Known condition types are:
209+
//
210+
// * "Accepted"
211+
// * "ResolvedRefs"
212+
Conditions []metav1.Condition `json:"conditions,omitempty"`
132213
}
133214
```
134215

@@ -147,6 +228,7 @@ type InferenceModel struct {
147228
metav1.TypeMeta
148229

149230
Spec InferenceModelSpec
231+
Status InferenceModelStatus
150232
}
151233

152234
type InferenceModelSpec struct {
@@ -172,22 +254,39 @@ type InferenceModelSpec struct {
172254
// If not specified, the target model name is defaulted to the ModelName parameter.
173255
// ModelName is often in reference to a LoRA adapter.
174256
TargetModels []TargetModel
175-
// Reference to the InferencePool that the model registers to. It must exist in the same namespace.
176-
PoolReference *LocalObjectReference
257+
// PoolRef is a reference to the inference pool. The pool must exist in the same namespace.
258+
PoolRef PoolObjectReference
259+
}
260+
261+
// PoolObjectReference identifies an API object within the namespace of the
262+
// referrer.
263+
type PoolObjectReference struct {
264+
// Group is the group of the referent.
265+
Group Group
266+
267+
// Kind is the kind of the referent. For example "InferencePool".
268+
Kind Kind
269+
270+
// Name is the name of the referent.
271+
Name ObjectName
177272
}
178273

179274
// Defines how important it is to serve the model compared to other models.
180275
// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field should ALWAYS be optional (use a pointer) and should set no default.
181276
// This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior.
182277
type Criticality string
183278
const (
184-
// Most important. Requests to this band will be shed last.
185-
Critical Criticality = "Critical"
186-
// More important than Sheddable, less important than Critical.
187-
// Requests in this band will be shed before critical traffic.
188-
Default Criticality = "Default"
189-
// Least important. Requests to this band will be shed before all other bands.
190-
Sheddable Criticality = "Sheddable"
279+
// Critical defines the highest level of criticality. Requests to this band will be shed last.
280+
Critical Criticality = "Critical"
281+
282+
// Standard defines the base criticality level and is more important than Sheddable but less
283+
// important than Critical. Requests in this band will be shed before critical traffic.
284+
// Most models are expected to fall within this band.
285+
Standard Criticality = "Standard"
286+
287+
// Sheddable defines the lowest level of criticality. Requests to this band will be shed before
288+
// all other bands.
289+
Sheddable Criticality = "Sheddable"
191290
)
192291

193292
// TargetModel represents a deployed model or a LoRA adapter. The
@@ -200,64 +299,62 @@ const (
200299
type TargetModel struct {
201300
// The name of the adapter as expected by the ModelServer.
202301
Name string
203-
// Weight is used to determine the percentage of traffic that should be
302+
// Weight is used to determine the percentage of traffic that should be
204303
// sent to this target model when multiple versions of the model are specified.
205-
Weight *int
304+
Weight *int32
206305
}
207306

208-
// LocalObjectReference identifies an API object within the namespace of the
209-
// referrer.
210-
type LocalObjectReference struct {
211-
// Group is the group of the referent.
212-
Group Group
213-
214-
// Kind is kind of the referent. For example "InferencePool".
215-
Kind Kind
216-
217-
// Name is the name of the referent.
218-
Name ObjectName
307+
// InferenceModelStatus defines the observed state of InferenceModel
308+
type InferenceModelStatus struct {
309+
// Conditions track the state of the InferenceModel.
310+
Conditions []metav1.Condition
219311
}
220-
221312
```
222313

223314
### Yaml Examples
224315

225316
#### InferencePool(s)
226317
Here we create a pool that selects the appropriate pods:
227318
```yaml
228-
apiVersion: inference.x-k8s.io/v1alpha1
319+
apiVersion: inference.x-k8s.io/v1alpha2
229320
kind: InferencePool
230321
metadata:
231322
name: base-model-pool
232-
modelServerSelector:
233-
- app: llm-server
323+
spec:
324+
selector:
325+
app: llm-server
326+
targetPortNumber: 8080
327+
extensionRef:
328+
name: infra-backend-v1-app
234329
```
235330
236331
#### InferenceModel
237332
238333
Here we consume the pool with two InferenceModels: `sql-code-assist` is both the name of the model and the name of the LoRA adapter on the model server, while `npc-bot` has a layer of indirection for those names, as well as a specified criticality. Both `sql-code-assist` and `npc-bot` have available LoRA adapters on the InferencePool, and routing to each InferencePool happens earlier (at the K8s Gateway).
239334
```yaml
240-
apiVersion: inference.x-k8s.io/v1alpha1
335+
apiVersion: inference.x-k8s.io/v1alpha2
241336
kind: InferenceModel
242337
metadata:
243338
name: sql-code-assist
244339
spec:
245340
modelName: sql-code-assist
246-
poolRef: base-model-pool
341+
poolRef:
342+
name: base-model-pool
247343
---
248-
apiVersion: inference.x-k8s.io/v1alpha1
344+
apiVersion: inference.x-k8s.io/v1alpha2
249345
kind: InferenceModel
250346
metadata:
251347
name: npc-bot
252348
spec:
253349
modelName: npc-bot
254350
criticality: Critical
255351
targetModels:
256-
targetModelName: npc-bot-v1
352+
- name: npc-bot-v1
353+
weight: 50
354+
- name: npc-bot-v2
257355
weight: 50
258-
targetModelName: npc-bot-v2
259-
weight: 50
260-
poolRef: base-model-pool
356+
poolRef:
357+
name: base-model-pool
261358
```
262359

263360

0 commit comments

Comments
 (0)