Skip to content

Commit b9b8225

Browse files
authored
fix: stop reconciliation when gpunode or gpupool is deleted (#214)
* fix: stop reconciliation when gpunode or gpupool is deleted * fix: stop reconciliation when cluster is deleted and linter issue
1 parent 2e3b6c0 commit b9b8225

File tree

4 files changed

+18
-12
lines changed

4 files changed

+18
-12
lines changed

internal/controller/gpunode_controller.go

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ import (
2020
"context"
2121
"encoding/json"
2222
"fmt"
23-
"strings"
2423
"time"
2524

2625
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
@@ -124,18 +123,11 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
124123
if err != nil {
125124
return ctrl.Result{}, err
126125
}
127-
if shouldReturn {
126+
if shouldReturn || !node.DeletionTimestamp.IsZero() {
128127
return ctrl.Result{}, nil
129128
}
130129

131-
var poolName string
132-
for labelKey := range node.Labels {
133-
after, ok := strings.CutPrefix(labelKey, constants.GPUNodePoolIdentifierLabelPrefix)
134-
if ok {
135-
poolName = after
136-
break
137-
}
138-
}
130+
poolName := utils.ExtractPoolNameFromNodeLabel(node)
139131
if poolName == "" {
140132
log.Error(nil, "failed to get pool name", "node", node.Name)
141133
return ctrl.Result{}, nil

internal/controller/gpupool_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ func (r *GPUPoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
9898
if err != nil {
9999
return ctrl.Result{}, err
100100
}
101-
if shouldReturn {
101+
if shouldReturn || !pool.DeletionTimestamp.IsZero() {
102102
// requeue for next loop
103103
// we need manually requeue cause GenerationChangedPredicate
104104
return ctrl.Result{Requeue: true}, nil

internal/controller/tensorfusioncluster_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ func (r *TensorFusionClusterReconciler) Reconcile(ctx context.Context, req ctrl.
110110
if err != nil {
111111
return ctrl.Result{}, err
112112
}
113-
if shouldReturn {
113+
if shouldReturn || !tfc.DeletionTimestamp.IsZero() {
114114
// requeue for next loop
115115
// we need manually requeue cause GenerationChangedPredicate
116116
return ctrl.Result{Requeue: true}, nil

internal/utils/reconcile.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,14 @@ import (
99
"math"
1010
"math/rand/v2"
1111
"os"
12+
"strings"
1213
"sync"
1314
"time"
1415

1516
constants "github.com/NexusGPU/tensor-fusion/internal/constants"
1617
"k8s.io/apimachinery/pkg/types"
1718

19+
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
1820
corev1 "k8s.io/api/core/v1"
1921
"sigs.k8s.io/controller-runtime/pkg/client"
2022
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
@@ -184,3 +186,15 @@ func IsPodConditionTrue(conditions []corev1.PodCondition, conditionType corev1.P
184186
func IsPodTerminated(pod *corev1.Pod) bool {
185187
return pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodSucceeded
186188
}
189+
190+
func ExtractPoolNameFromNodeLabel(node *tfv1.GPUNode) string {
191+
var poolName string
192+
for labelKey := range node.Labels {
193+
after, ok := strings.CutPrefix(labelKey, constants.GPUNodePoolIdentifierLabelPrefix)
194+
if ok {
195+
poolName = after
196+
break
197+
}
198+
}
199+
return poolName
200+
}

0 commit comments

Comments
 (0)