Skip to content

Commit fadaeae

Browse files
authored
fix: capacity type prefer spot instance (#58)
* fix: manual compaction * fix: capacity type selection
1 parent 897dbb7 commit fadaeae

17 files changed

+214
-28
lines changed

api/v1/gpunode_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,11 @@ type GPUNodeStatus struct {
7272
AvailableTFlops resource.Quantity `json:"availableTFlops"`
7373
AvailableVRAM resource.Quantity `json:"availableVRAM"`
7474

75+
// +optional
76+
VirtualAvailableTFlops *resource.Quantity `json:"virtualAvailableTFlops,omitempty"`
77+
// +optional
78+
VirtualAvailableVRAM *resource.Quantity `json:"virtualAvailableVRAM,omitempty"`
79+
7580
// +optional
7681
HypervisorStatus NodeHypervisorStatus `json:"hypervisorStatus,omitempty"`
7782

api/v1/gpupool_types.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,11 @@ type GPUPoolStatus struct {
375375
AvailableTFlops resource.Quantity `json:"availableTFlops"`
376376
AvailableVRAM resource.Quantity `json:"availableVRAM"`
377377

378+
// +optional
379+
VirtualAvailableTFlops *resource.Quantity `json:"virtualAvailableTFlops,omitempty"`
380+
// +optional
381+
VirtualAvailableVRAM *resource.Quantity `json:"virtualAvailableVRAM,omitempty"`
382+
378383
// When any component version or config is updated, the pool controller performs a rolling update.
379384
// The status is updated periodically (every 5s by default); progress ranges from 0 to 100.
380385
// When the progress reaches 100, the component version or config is fully updated.
@@ -397,6 +402,9 @@ type GPUPoolStatus struct {
397402
// If this field is not empty, the scheduler will not schedule new AI workloads and will stop the scaling-up check.
398403
// TODO not implemented yet
399404
BudgetExceeded string `json:"budgetExceeded,omitempty"`
405+
406+
// +optional
407+
LastCompactionTime *metav1.Time `json:"lastCompactionTime,omitempty"`
400408
}
401409

402410
// +kubebuilder:validation:Enum=Pending;Running;Updating;Destroying;Unknown

api/v1/tensorfusioncluster_funcs.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,9 @@ func (tfc *TensorFusionCluster) RefreshStatus(ownedPools []GPUPool) {
117117
tfc.Status.VirtualTFlops = resource.Quantity{}
118118
tfc.Status.VirtualVRAM = resource.Quantity{}
119119

120+
tfc.Status.VirtualAvailableTFlops = &resource.Quantity{}
121+
tfc.Status.VirtualAvailableVRAM = &resource.Quantity{}
122+
120123
for i, gpuPool := range ownedPools {
121124
if gpuPool.Status.Phase != constants.PhaseRunning {
122125
tfc.Status.NotReadyGPUPools = append(tfc.Status.NotReadyGPUPools, gpuPool.Name)
@@ -132,5 +135,12 @@ func (tfc *TensorFusionCluster) RefreshStatus(ownedPools []GPUPool) {
132135

133136
tfc.Status.VirtualTFlops.Add(gpuPool.Status.VirtualTFlops)
134137
tfc.Status.VirtualVRAM.Add(gpuPool.Status.VirtualVRAM)
138+
139+
if gpuPool.Status.VirtualAvailableTFlops != nil {
140+
tfc.Status.VirtualAvailableTFlops.Add(*gpuPool.Status.VirtualAvailableTFlops)
141+
}
142+
if gpuPool.Status.VirtualAvailableVRAM != nil {
143+
tfc.Status.VirtualAvailableVRAM.Add(*gpuPool.Status.VirtualAvailableVRAM)
144+
}
135145
}
136146
}

api/v1/tensorfusioncluster_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ type TensorFusionClusterStatus struct {
5858
AvailableTFlops resource.Quantity `json:"availableTFlops"`
5959
AvailableVRAM resource.Quantity `json:"availableVRAM"`
6060

61+
// +optional
62+
VirtualAvailableTFlops *resource.Quantity `json:"virtualAvailableTFlops,omitempty"`
63+
// +optional
64+
VirtualAvailableVRAM *resource.Quantity `json:"virtualAvailableVRAM,omitempty"`
65+
6166
// +optional
6267
ReadyGPUPools []string `json:"readyGPUPools"`
6368

api/v1/zz_generated.deepcopy.go

Lines changed: 34 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,18 @@ spec:
313313
- type: string
314314
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
315315
x-kubernetes-int-or-string: true
316+
virtualAvailableTFlops:
317+
anyOf:
318+
- type: integer
319+
- type: string
320+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
321+
x-kubernetes-int-or-string: true
322+
virtualAvailableVRAM:
323+
anyOf:
324+
- type: integer
325+
- type: string
326+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
327+
x-kubernetes-int-or-string: true
316328
virtualTFlops:
317329
anyOf:
318330
- type: integer

charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -866,6 +866,9 @@ spec:
866866
- type
867867
type: object
868868
type: array
869+
lastCompactionTime:
870+
format: date-time
871+
type: string
869872
notReadyNodes:
870873
format: int32
871874
type: integer
@@ -909,6 +912,18 @@ spec:
909912
type: string
910913
utilizedVRAMPercent:
911914
type: string
915+
virtualAvailableTFlops:
916+
anyOf:
917+
- type: integer
918+
- type: string
919+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
920+
x-kubernetes-int-or-string: true
921+
virtualAvailableVRAM:
922+
anyOf:
923+
- type: integer
924+
- type: string
925+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
926+
x-kubernetes-int-or-string: true
912927
virtualTFlops:
913928
anyOf:
914929
- type: integer

charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1021,6 +1021,18 @@ spec:
10211021
type: string
10221022
utilizedVRAMPercent:
10231023
type: string
1024+
virtualAvailableTFlops:
1025+
anyOf:
1026+
- type: integer
1027+
- type: string
1028+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
1029+
x-kubernetes-int-or-string: true
1030+
virtualAvailableVRAM:
1031+
anyOf:
1032+
- type: integer
1033+
- type: string
1034+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
1035+
x-kubernetes-int-or-string: true
10241036
virtualTFlops:
10251037
anyOf:
10261038
- type: integer

config/crd/bases/tensor-fusion.ai_gpunodes.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,18 @@ spec:
313313
- type: string
314314
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
315315
x-kubernetes-int-or-string: true
316+
virtualAvailableTFlops:
317+
anyOf:
318+
- type: integer
319+
- type: string
320+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
321+
x-kubernetes-int-or-string: true
322+
virtualAvailableVRAM:
323+
anyOf:
324+
- type: integer
325+
- type: string
326+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
327+
x-kubernetes-int-or-string: true
316328
virtualTFlops:
317329
anyOf:
318330
- type: integer

config/crd/bases/tensor-fusion.ai_gpupools.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -866,6 +866,9 @@ spec:
866866
- type
867867
type: object
868868
type: array
869+
lastCompactionTime:
870+
format: date-time
871+
type: string
869872
notReadyNodes:
870873
format: int32
871874
type: integer
@@ -909,6 +912,18 @@ spec:
909912
type: string
910913
utilizedVRAMPercent:
911914
type: string
915+
virtualAvailableTFlops:
916+
anyOf:
917+
- type: integer
918+
- type: string
919+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
920+
x-kubernetes-int-or-string: true
921+
virtualAvailableVRAM:
922+
anyOf:
923+
- type: integer
924+
- type: string
925+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
926+
x-kubernetes-int-or-string: true
912927
virtualTFlops:
913928
anyOf:
914929
- type: integer

0 commit comments

Comments
 (0)