@@ -38,11 +38,9 @@ import (
3838 "k8s.io/client-go/util/retry"
3939 "k8s.io/utils/ptr"
4040 ctrl "sigs.k8s.io/controller-runtime"
41- "sigs.k8s.io/controller-runtime/pkg/builder"
4241 "sigs.k8s.io/controller-runtime/pkg/client"
4342 "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
4443 "sigs.k8s.io/controller-runtime/pkg/log"
45- "sigs.k8s.io/controller-runtime/pkg/predicate"
4644)
4745
4846// GPUNodeReconciler reconciles a GPUNode object
@@ -74,7 +72,6 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
7472 }
7573
7674 deleted , err := utils .HandleFinalizer (ctx , node , r .Client , func (ctx context.Context , node * tfv1.GPUNode ) (bool , error ) {
77-
7875 if node .Status .Phase != tfv1 .TensorFusionGPUNodePhaseDestroying {
7976 node .Status .Phase = tfv1 .TensorFusionGPUNodePhaseDestroying
8077 if err := r .Status ().Update (ctx , node ); err != nil {
@@ -136,13 +133,39 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
136133 break
137134 }
138135 }
136+ if poolName == "" {
137+ log .Error (nil , "failed to get pool name" , "node" , node .Name )
138+ return ctrl.Result {}, nil
139+ }
139140
140141 poolObj := & tfv1.GPUPool {}
141142 err = r .Client .Get (ctx , client.ObjectKey {Name : poolName }, poolObj )
142143 if err != nil {
143144 return ctrl.Result {}, fmt .Errorf ("failed to get tensor-fusion pool, can not create node discovery job, pool: %s" , poolName )
144145 }
145146
147+ if node .Spec .ManageMode != tfv1 .GPUNodeManageModeProvisioned {
148+ // Check if the Kubernetes node exists; if not, the GPUNode should delete itself.
149+ if node .Status .KubernetesNodeName != "" {
150+ // Try to get the Kubernetes node
151+ coreNode := & corev1.Node {}
152+ err := r .Get (ctx , client.ObjectKey {Name : node .Status .KubernetesNodeName }, coreNode )
153+ if err != nil {
154+ if errors .IsNotFound (err ) {
155+ // The Kubernetes node does not exist, delete the GPUNode
156+ log .Info ("Kubernetes node does not exist, deleting GPUNode" ,
157+ "kubernetesNodeName" , node .Status .KubernetesNodeName )
158+ if err := r .Delete (ctx , node ); err != nil {
159+ return ctrl.Result {}, fmt .Errorf ("failed to delete GPUNode after Kubernetes node was deleted: %w" , err )
160+ }
161+ // Return early since we've deleted the resource
162+ return ctrl.Result {}, nil
163+ }
164+ return ctrl.Result {}, fmt .Errorf ("failed to get Kubernetes node %s: %w" ,
165+ node .Status .KubernetesNodeName , err )
166+ }
167+ }
168+ }
146169 if err := r .reconcileCloudVendorNode (ctx , node , poolObj ); err != nil {
147170 return ctrl.Result {}, err
148171 }
@@ -151,10 +174,6 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
151174 if node .Status .KubernetesNodeName == "" {
152175 return ctrl.Result {RequeueAfter : 10 * time .Second }, nil
153176 }
154- if poolName == "" {
155- log .Error (nil , "failed to get pool name" , "node" , node .Name )
156- return ctrl.Result {}, nil
157- }
158177
159178 if err := r .reconcileNodeDiscoveryJob (ctx , node , poolObj ); err != nil {
160179 return ctrl.Result {}, err
@@ -415,7 +434,6 @@ func (r *GPUNodeReconciler) reconcileHypervisorPod(ctx context.Context, node *tf
415434}
416435
417436func (r * GPUNodeReconciler ) reconcileCloudVendorNode (ctx context.Context , node * tfv1.GPUNode , pool * tfv1.GPUPool ) error {
418-
419437 // Avoid creating duplicated cloud vendor nodes, if not working, keep pending status
420438 if node .Status .NodeInfo .InstanceID != "" {
421439 // node already created, check status
@@ -532,7 +550,7 @@ func (r *GPUNodeReconciler) CalculateVirtualCapacity(node *tfv1.GPUNode, pool *t
532550// SetupWithManager sets up the controller with the Manager.
533551func (r * GPUNodeReconciler ) SetupWithManager (mgr ctrl.Manager ) error {
534552 return ctrl .NewControllerManagedBy (mgr ).
535- For (& tfv1.GPUNode {}, builder . WithPredicates (predicate. GenerationChangedPredicate {}) ).
553+ For (& tfv1.GPUNode {}).
536554 Named ("gpunode" ).
537555 Owns (& corev1.Node {}).
538556 Owns (& batchv1.Job {}).
0 commit comments