@@ -29,6 +29,8 @@ type GPUPoolCompactionReconciler struct {
2929 Recorder record.EventRecorder
3030
3131 Allocator * gpuallocator.GpuAllocator
32+
33+ markDeletionNodes map [string ]struct {}
3234}
3335
3436var defaultCompactionDuration = 1 * time .Minute
@@ -71,9 +73,16 @@ func (r *GPUPoolCompactionReconciler) checkNodeCompaction(ctx context.Context, p
7173
7274 for _ , gpu := range gpuStore {
7375 if ! gpu .DeletionTimestamp .IsZero () || gpu .Labels [constants .GpuPoolKey ] != pool .Name ||
74- gpu .Status .UsedBy != tfv1 .UsedByTensorFusion {
76+ gpu .Status .UsedBy != tfv1 .UsedByTensorFusion || len ( gpu . Status . NodeSelector ) == 0 {
7577 continue
7678 }
79+
80+ k8sNodeName := gpu .Status .NodeSelector [constants .KubernetesHostNameLabel ]
81+ if _ , ok := r .markDeletionNodes [k8sNodeName ]; ok {
82+ log .V (4 ).Info ("skip node already marked for deletion when calculation capacity" , "node" , k8sNodeName )
83+ continue
84+ }
85+
7786 availableTFlops , _ := gpu .Status .Available .Tflops .AsInt64 ()
7887 poolAvailableTFlops += availableTFlops
7988 availableVRAM , _ := gpu .Status .Available .Vram .AsInt64 ()
@@ -153,6 +162,7 @@ func (r *GPUPoolCompactionReconciler) checkNodeCompaction(ctx context.Context, p
153162 poolAvailableVRAM -= nodeCapVRAM
154163 poolTotalTFlops -= nodeCapTFlops
155164 poolTotalVRAM -= nodeCapVRAM
165+ r .markDeletionNodes [k8sNodeName ] = struct {}{}
156166
157167 log .Info ("Empty node can be compacted - provision mode" , "node" , gpuNode .Name ,
158168 "availableTFlopsAfterCompact" , poolAvailableTFlops ,
@@ -185,6 +195,7 @@ func (r *GPUPoolCompactionReconciler) checkNodeCompaction(ctx context.Context, p
185195 poolAvailableVRAM -= nodeCapVRAM
186196 poolTotalTFlops -= nodeCapTFlops
187197 poolTotalVRAM -= nodeCapVRAM
198+ r .markDeletionNodes [k8sNodeName ] = struct {}{}
188199
189200 log .Info ("Empty node can be compacted - auto-select mode" , "node" , gpuNode .Name ,
190201 "availableTFlopsAfterCompact" , poolAvailableTFlops ,
@@ -227,7 +238,7 @@ func (r *GPUPoolCompactionReconciler) checkNodeCompaction(ctx context.Context, p
227238func (r * GPUPoolCompactionReconciler ) getCompactionDuration (ctx context.Context , config * tfv1.NodeManagerConfig ) time.Duration {
228239 log := log .FromContext (ctx )
229240 if config == nil || config .NodeCompaction == nil || config .NodeCompaction .Period == "" {
230- log .Info ("empty node compaction config, use default value" , "duration" , defaultCompactionDuration )
241+ log .V ( 4 ). Info ("empty node compaction config, use default value" , "duration" , defaultCompactionDuration )
231242 return defaultCompactionDuration
232243 }
233244 duration , err := time .ParseDuration (config .NodeCompaction .Period )
@@ -307,6 +318,7 @@ func (r *GPUPoolCompactionReconciler) Reconcile(ctx context.Context, req ctrl.Re
307318
308319// SetupWithManager sets up the controller with the Manager.
309320func (r * GPUPoolCompactionReconciler ) SetupWithManager (mgr ctrl.Manager ) error {
321+ r .markDeletionNodes = make (map [string ]struct {})
310322 return ctrl .NewControllerManagedBy (mgr ).
311323 Named ("gpupool-compaction" ).
312324 WatchesMetadata (& tfv1.GPUPool {}, & handler.EnqueueRequestForObject {}).
@@ -315,5 +327,5 @@ func (r *GPUPoolCompactionReconciler) SetupWithManager(mgr ctrl.Manager) error {
315327
// SetTestModeCompactionPeriod overwrites two package-level timing variables
// with much shorter values: defaultCompactionDuration (declared as 1 minute)
// becomes 700ms, and newNodeProtectionDuration is set to roughly one second.
//
// It takes no arguments, returns nothing, and does not restore the previous
// values; the change stays in effect for the rest of the process.
// NOTE(review): judging by the name this is meant for test setup only —
// confirm it is never called from production code paths.
316328func SetTestModeCompactionPeriod () {
317329 defaultCompactionDuration = 700 * time .Millisecond
318- newNodeProtectionDuration = 1200 * time .Millisecond
330+ newNodeProtectionDuration = 1000 * time .Millisecond
319331}
0 commit comments