@@ -30,13 +30,16 @@ import (
3030	"github.com/NexusGPU/tensor-fusion/internal/metrics" 
3131	utils "github.com/NexusGPU/tensor-fusion/internal/utils" 
3232	"golang.org/x/time/rate" 
33+ 	corev1 "k8s.io/api/core/v1" 
3334	"k8s.io/apimachinery/pkg/api/equality" 
3435	"k8s.io/apimachinery/pkg/api/errors" 
3536	"k8s.io/apimachinery/pkg/api/resource" 
3637	"k8s.io/apimachinery/pkg/runtime" 
3738	utilerrors "k8s.io/apimachinery/pkg/util/errors" 
3839	"k8s.io/client-go/tools/record" 
40+ 	"k8s.io/client-go/util/retry" 
3941	"k8s.io/client-go/util/workqueue" 
42+ 	schedulingcorev1 "k8s.io/component-helpers/scheduling/corev1" 
4043	ctrl "sigs.k8s.io/controller-runtime" 
4144	"sigs.k8s.io/controller-runtime/pkg/client" 
4245	"sigs.k8s.io/controller-runtime/pkg/controller" 
@@ -83,6 +86,9 @@ type GPUPoolReconciler struct {
8386// and requeue until current time after that, start provisioning loop 
8487var  provisioningInitializationMinTime  =  map [string ]time.Time {}
8588
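89+ // poolSelectorChangeMap records, per pool name, the hash of the node selector that was last propagated; a differing hash triggers an update of all matching nodes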
90+ var  poolSelectorChangeMap  =  map [string ]string {}
91+ 
8692// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpupools,verbs=get;list;watch;create;update;patch;delete 
8793// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpupools/status,verbs=get;update;patch 
8894// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpupools/finalizers,verbs=update 
@@ -116,6 +122,10 @@ func (r *GPUPoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
116122		return  ctrl.Result {}, nil 
117123	}
118124
125+ 	if  err  :=  r .reconcilePoolSelectorChange (ctx , pool ); err  !=  nil  {
126+ 		return  ctrl.Result {}, err 
127+ 	}
128+ 
119129	if  err  :=  r .reconcilePoolCurrentCapacityAndReadiness (ctx , pool ); err  !=  nil  {
120130		return  ctrl.Result {}, err 
121131	}
@@ -404,6 +414,59 @@ func (r *GPUPoolReconciler) reconcilePoolComponents(ctx context.Context, pool *t
404414	return  ctrlResult , utilerrors .NewAggregate (errs )
405415}
406416
417+ func  (r  * GPUPoolReconciler ) reconcilePoolSelectorChange (ctx  context.Context , pool  * tfv1.GPUPool ) error  {
418+ 	if  pool .Spec .NodeManagerConfig  !=  nil  &&  pool .Spec .NodeManagerConfig .NodeSelector  !=  nil  {
419+ 		hash  :=  utils .GetObjectHash (pool .Spec .NodeManagerConfig .NodeSelector )
420+ 		if  poolSelectorChangeMap [pool .Name ] ==  hash  {
421+ 			return  nil 
422+ 		}
423+ 
424+ 		// hash has changed, or first reconcile, should check all k8s nodes 
425+ 		nodes  :=  & corev1.NodeList {}
426+ 		selectors  :=  utils .GetInitialGPUNodeSelector ()
427+ 		if  err  :=  r .List (ctx , nodes , client.MatchingLabels {selectors [0 ]: selectors [1 ]}); err  !=  nil  {
428+ 			return  err 
429+ 		}
430+ 		for  _ , node  :=  range  nodes .Items  {
431+ 			// skip no label or deleting nodes 
432+ 			if  node .Labels  ==  nil  ||  ! node .DeletionTimestamp .IsZero () {
433+ 				continue 
434+ 			}
435+ 			matches , err  :=  schedulingcorev1 .MatchNodeSelectorTerms (& node , pool .Spec .NodeManagerConfig .NodeSelector )
436+ 			if  err  !=  nil  {
437+ 				return  err 
438+ 			}
439+ 			if  matches  {
440+ 				if  err  :=  UpdateK8SNodeSelectorHash (ctx , r .Client , & node , hash ); err  !=  nil  {
441+ 					return  err 
442+ 				}
443+ 			}
444+ 		}
445+ 		poolSelectorChangeMap [pool .Name ] =  hash 
446+ 		return  nil 
447+ 	}
448+ 	return  nil 
449+ }
450+ 
451+ func  UpdateK8SNodeSelectorHash (ctx  context.Context , k8sClient  client.Client , node  * corev1.Node , hash  string ) error  {
452+ 	// skip nodes that already injected the hash 
453+ 	if  node .Labels [constants .LabelNodeSelectorHash ] ==  hash  {
454+ 		return  nil 
455+ 	}
456+ 	// update label to trigger the GPUNode reconcile 
457+ 	if  err  :=  retry .RetryOnConflict (retry .DefaultBackoff , func () error  {
458+ 		latest  :=  & corev1.Node {}
459+ 		if  err  :=  k8sClient .Get (ctx , client.ObjectKey {Name : node .Name }, latest ); err  !=  nil  {
460+ 			return  err 
461+ 		}
462+ 		latest .Labels [constants .LabelNodeSelectorHash ] =  hash 
463+ 		return  k8sClient .Update (ctx , latest )
464+ 	}); err  !=  nil  {
465+ 		return  err 
466+ 	}
467+ 	return  nil 
468+ }
469+ 
407470func  (r  * GPUPoolReconciler ) cleanUpPool (ctx  context.Context , pool  * tfv1.GPUPool ) (bool , error ) {
408471	log  :=  log .FromContext (ctx )
409472	log .Info ("TensorFusionGPUPool is being deleted" , "name" , pool .Name )
0 commit comments