Skip to content

Commit 634e1ff

Browse files
authored
feat: support progressive migration, add more print columns, avoid nvidia operator schedule to tensor fusion managed nodes (#273)
* feat: support progressive migration, add more print columns * fix: avoid nvidia operator schedule to tensor fusion managed nodes * fix: optimize webhook performance for non tf pod
1 parent 8b0c01f commit 634e1ff

34 files changed

+465
-31
lines changed

.vscode/settings.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
"certgen",
2121
"certificaterequests",
2222
"certmanager",
23+
"clientcmd",
2324
"clientgoscheme",
2425
"clientset",
2526
"cloudnative",

api/v1/gpu_types.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,29 @@ type GPUStatus struct {
3535
NodeSelector map[string]string `json:"nodeSelector"`
3636
GPUModel string `json:"gpuModel"`
3737

38+
// GPU is used by tensor-fusion or nvidia-device-plugin
39+
// This is the key to be compatible with nvidia-device-plugin to avoid resource overlap
40+
// Hypervisor will watch kubelet device plugin to report all GPUs already used by nvidia-device-plugin
41+
// GPUs will be grouped by usedBy to be used by different Pods,
42+
// tensor-fusion annotation or nvidia-device-plugin resource block
43+
// +optional
44+
UsedBy UsedBySystem `json:"usedBy,omitempty"`
45+
3846
Message string `json:"message"`
3947

4048
// +optional
4149
RunningApps []*RunningAppDetail `json:"runningApps,omitempty"`
4250
}
4351

52+
// +kubebuilder:validation:Enum=tensor-fusion;nvidia-device-plugin
53+
// +default="tensor-fusion"
54+
type UsedBySystem string
55+
56+
const (
57+
UsedByTensorFusion UsedBySystem = "tensor-fusion"
58+
UsedByNvidiaDevicePlugin UsedBySystem = "nvidia-device-plugin"
59+
)
60+
4461
type RunningAppDetail struct {
4562
// Workload name namespace
4663
Name string `json:"name,omitempty"`
@@ -72,6 +89,8 @@ const (
7289
// +kubebuilder:printcolumn:name="Available TFlops",type="string",JSONPath=".status.available.tflops"
7390
// +kubebuilder:printcolumn:name="Available VRAM",type="string",JSONPath=".status.available.vram"
7491
// +kubebuilder:printcolumn:name="Device UUID",type="string",JSONPath=".status.uuid"
92+
// +kubebuilder:printcolumn:name="Used By",type="string",JSONPath=".status.usedBy"
93+
// +kubebuilder:printcolumn:name="Node",type="string",JSONPath=".status.nodeSelector"
7594
// GPU is the Schema for the gpus API.
7695
type GPU struct {
7796
metav1.TypeMeta `json:",inline"`

api/v1/schedulingconfigtemplate_types.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@ type SchedulingConfigTemplateStatus struct {
217217
// +kubebuilder:resource:scope=Cluster
218218
// +kubebuilder:printcolumn:name="Mode",type="string",JSONPath=".spec.placement.mode"
219219
// +kubebuilder:printcolumn:name="Allow Local GPU",type="string",JSONPath=".spec.placement.allowLocalGPU"
220+
// +kubebuilder:printcolumn:name="AutoFreeze",type="string",JSONPath=".spec.hypervisor.autoFreezeAndResume.autoFreeze.enable"
220221
// SchedulingConfigTemplate is the Schema for the schedulingconfigtemplates API.
221222
type SchedulingConfigTemplate struct {
222223
metav1.TypeMeta `json:",inline"`

api/v1/tensorfusionconnection_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,11 @@ type TensorFusionConnectionStatus struct {
4646

4747
// +kubebuilder:object:root=true
4848
// +kubebuilder:subresource:status
49+
// +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase"
50+
// +kubebuilder:printcolumn:name="Connection URL",type="string",JSONPath=".status.connectionURL"
51+
// +kubebuilder:printcolumn:name="Worker Name",type="string",JSONPath=".status.workerName"
52+
// +kubebuilder:printcolumn:name="Workload Name",type="string",JSONPath=".spec.workloadName"
53+
// +kubebuilder:printcolumn:name="Client Pod",type="string",JSONPath=".spec.clientPod"
4954

5055
// TensorFusionConnection is the Schema for the tensorfusionconnections API.
5156
type TensorFusionConnection struct {

api/v1/tensorfusionworkload_types.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,15 @@ type TensorFusionWorkloadStatus struct {
6969

7070
// +kubebuilder:object:root=true
7171
// +kubebuilder:subresource:status
72+
// +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase"
73+
// +kubebuilder:printcolumn:name="Worker Count",type="string",JSONPath=".status.workerCount"
74+
// +kubebuilder:printcolumn:name="Ready Workers",type="string",JSONPath=".status.readyWorkers"
75+
// +kubebuilder:printcolumn:name="Pod Template Hash",type="string",JSONPath=".status.podTemplateHash"
76+
// +kubebuilder:printcolumn:name="Pool Name",type="string",JSONPath=".spec.poolName"
77+
// +kubebuilder:printcolumn:name="QoS",type="string",JSONPath=".spec.qos"
78+
// +kubebuilder:printcolumn:name="Is Local GPU",type="string",JSONPath=".spec.isLocalGPU"
79+
// +kubebuilder:printcolumn:name="GPU Number",type="string",JSONPath=".spec.gpuCount"
80+
// +kubebuilder:printcolumn:name="Replicas",type="string",JSONPath=".spec.replicas"
7281

7382
// TensorFusionWorkload is the Schema for the tensorfusionworkloads API.
7483
type TensorFusionWorkload struct {

api/v1/workloadprofile_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,11 @@ type WorkloadProfileStatus struct {
8080

8181
// +kubebuilder:object:root=true
8282
// +kubebuilder:subresource:status
83+
// +kubebuilder:printcolumn:name="Pool Name",type="string",JSONPath=".spec.poolName"
84+
// +kubebuilder:printcolumn:name="QoS",type="string",JSONPath=".spec.qos"
85+
// +kubebuilder:printcolumn:name="Is Local GPU",type="string",JSONPath=".spec.isLocalGPU"
86+
// +kubebuilder:printcolumn:name="GPU Number",type="string",JSONPath=".spec.gpuCount"
87+
// +kubebuilder:printcolumn:name="Replicas",type="string",JSONPath=".spec.replicas"
8388

8489
// WorkloadProfile is the Schema for the workloadprofiles API.
8590
type WorkloadProfile struct {

charts/tensor-fusion/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.4.5
18+
version: 1.4.6
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to
2222
# follow Semantic Versioning. They should reflect the version the application is using.
2323
# It is recommended to use it with quotes.
24-
appVersion: "1.36.3"
24+
appVersion: "1.39.1"

charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,12 @@ spec:
3636
- jsonPath: .status.uuid
3737
name: Device UUID
3838
type: string
39+
- jsonPath: .status.usedBy
40+
name: Used By
41+
type: string
42+
- jsonPath: .status.nodeSelector
43+
name: Node
44+
type: string
3945
name: v1
4046
schema:
4147
openAPIV3Schema:
@@ -131,6 +137,17 @@ spec:
131137
- count
132138
type: object
133139
type: array
140+
usedBy:
141+
description: |-
142+
GPU is used by tensor-fusion or nvidia-device-plugin
143+
This is the key to be compatible with nvidia-device-plugin to avoid resource overlap
144+
Hypervisor will watch kubelet device plugin to report all GPUs already used by nvidia-device-plugin
145+
GPUs will be grouped by usedBy to be used by different Pods,
146+
tensor-fusion annotation or nvidia-device-plugin resource block
147+
enum:
148+
- tensor-fusion
149+
- nvidia-device-plugin
150+
type: string
134151
uuid:
135152
type: string
136153
required:

charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ spec:
2121
- jsonPath: .spec.placement.allowLocalGPU
2222
name: Allow Local GPU
2323
type: string
24+
- jsonPath: .spec.hypervisor.autoFreezeAndResume.autoFreeze.enable
25+
name: AutoFreeze
26+
type: string
2427
name: v1
2528
schema:
2629
openAPIV3Schema:

charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionconnections.yaml

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,23 @@ spec:
1414
singular: tensorfusionconnection
1515
scope: Namespaced
1616
versions:
17-
- name: v1
17+
- additionalPrinterColumns:
18+
- jsonPath: .status.phase
19+
name: Phase
20+
type: string
21+
- jsonPath: .status.connectionURL
22+
name: Connection URL
23+
type: string
24+
- jsonPath: .status.workerName
25+
name: Worker Name
26+
type: string
27+
- jsonPath: .spec.workloadName
28+
name: Workload Name
29+
type: string
30+
- jsonPath: .spec.clientPod
31+
name: Client Pod
32+
type: string
33+
name: v1
1834
schema:
1935
openAPIV3Schema:
2036
description: TensorFusionConnection is the Schema for the tensorfusionconnections

0 commit comments

Comments
 (0)