@@ -169,22 +169,23 @@ type Requirement struct {
169169 Values []string `json:"values,omitempty"`
170170}
171171
// NodeRequirementKey is a string-typed key whose allowed values are limited by
// the Enum marker below to the label keys declared as constants in this file.
// Every Enum entry must match its constant byte-for-byte (no embedded
// whitespace), otherwise a value built from the constant fails validation.
// +kubebuilder:validation:Enum=node.kubernetes.io/instance-type;kubernetes.io/arch;kubernetes.io/os;topology.kubernetes.io/region;topology.kubernetes.io/zone;karpenter.sh/capacity-type;tensor-fusion.ai/gpu-arch;tensor-fusion.ai/gpu-instance-family;tensor-fusion.ai/gpu-instance-size
type NodeRequirementKey string

// Supported requirement keys. Keep this list in sync with the Enum marker on
// NodeRequirementKey: adding or renaming a constant requires the same change
// in the marker.
const (
	NodeRequirementKeyInstanceType    NodeRequirementKey = "node.kubernetes.io/instance-type"
	NodeRequirementKeyArchitecture    NodeRequirementKey = "kubernetes.io/arch"
	NodeRequirementKeyGPUArchitecture NodeRequirementKey = "tensor-fusion.ai/gpu-arch"

	NodeRequirementKeyOS     NodeRequirementKey = "kubernetes.io/os"
	NodeRequirementKeyRegion NodeRequirementKey = "topology.kubernetes.io/region"
	NodeRequirementKeyZone   NodeRequirementKey = "topology.kubernetes.io/zone"

	// NodeRequirementKeyCapacityType is the charging method; it can be
	// spot/preemptive or on-demand.
	NodeRequirementKeyCapacityType NodeRequirementKey = "karpenter.sh/capacity-type"

	NodeRequirementKeyInstanceFamily NodeRequirementKey = "tensor-fusion.ai/gpu-instance-family"
	NodeRequirementKeyInstanceSize   NodeRequirementKey = "tensor-fusion.ai/gpu-instance-size"
)
189190
190191type Taint struct {
@@ -363,11 +364,6 @@ type GPUPoolStatus struct {
363364 AvailableTFlops resource.Quantity `json:"availableTFlops"`
364365 AvailableVRAM resource.Quantity `json:"availableVRAM"`
365366
366- // If using provisioner, GPU nodes could be outside of the K8S cluster.
367- // The GPUNodes custom resource will be created and deleted automatically.
368- // ProvisioningStatus is to track the status of those outside GPU nodes.
369- ProvisioningStatus PoolProvisioningStatus `json:"provisioningStatus"`
370-
371367 // when updating any component version or config, pool controller will perform rolling update.
372368 // the status will be updated periodically, default to 5s, progress will be 0-100.
373369 // when the progress is 100, the component version or config is fully updated.
@@ -388,6 +384,7 @@ type GPUPoolStatus struct {
388384 // +kubebuilder:default=""
389385 // If the budget is exceeded, this is set to a comma-separated string indicating which period(s) caused the budget to be exceeded.
390386 // If this field is not empty, scheduler will not schedule new AI workloads and stop scaling-up check.
387+ // TODO not implemented yet
391388 BudgetExceeded string `json:"budgetExceeded,omitempty"`
392389}
393390
@@ -427,6 +424,7 @@ type PoolComponentStatus struct {
427424// +kubebuilder:subresource:status
428425// +kubebuilder:resource:scope=Cluster
429426
427+ // +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase"
430428// +kubebuilder:printcolumn:name="TFlops Oversubscription",type="string",JSONPath=".spec.capacityConfig.oversubscription.tflopsOversellRatio"
431429// +kubebuilder:printcolumn:name="Mode",type="string",JSONPath=".status.mode"
432430// +kubebuilder:printcolumn:name="Default Scheduling Strategy",type="string",JSONPath=".spec.schedulingConfigTemplate"
0 commit comments