Skip to content

Commit 55025d4

Browse files
Code2Life and claude authored
fix: enhance GPU allocation logic and QoS level calculation (#570)
* fix: enhance GPU allocation logic and QoS level calculation - Added nil checks for NodeManagerConfig and NodeSelector in getMatchedPoolName to prevent potential nil pointer dereferences. - Improved GPU allocation methods in GpuAllocator by ensuring thread safety with appropriate locking mechanisms. - Enhanced QoS level calculation to ensure that limits and requests are not zero before determining high QoS, aligning with Kubernetes behavior. * fix: reduce preemption log verbosity to V(4) to avoid log flooding During preemption, N candidate nodes × M victims triggers massive logging from validatePreemption, FilterWithPreempt, and queueingHint paths. Changed all preemption DBG/validation Info-level logs to V(4) to keep production logs clean while preserving debuggability with -v=4. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: update todos and fix process restart env not refreshing issue --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 56b93e7 commit 55025d4

File tree

15 files changed

+201
-117
lines changed

15 files changed

+201
-117
lines changed

api/v1/schedulingconfigtemplate_types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ type SchedulingConfigTemplateSpec struct {
3535
// avoid hot GPU devices and continuously balance the workload
3636
// implemented by mark GPU as hot and trigger evict for re-scheduling
3737
// The hot GPUs will get lower priority for scheduling
38-
// TODO: not implemented yet
38+
// Future: implement rebalancer
3939
// +optional
4040
ReBalancer *ReBalancerConfig `json:"reBalancer,omitempty"`
4141

charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ spec:
194194
avoid hot GPU devices and continuously balance the workload
195195
implemented by mark GPU as hot and trigger evict for re-scheduling
196196
The hot GPUs will get lower priority for scheduling
197+
Future: implement rebalancer
197198
properties:
198199
enable:
199200
type: boolean

config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ spec:
194194
avoid hot GPU devices and continuously balance the workload
195195
implemented by mark GPU as hot and trigger evict for re-scheduling
196196
The hot GPUs will get lower priority for scheduling
197+
Future: implement rebalancer
197198
properties:
198199
enable:
199200
type: boolean

internal/component/hypervisor.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ func (h *Hypervisor) GetResourcesInfo(r client.Client, ctx context.Context, pool
9898
}
9999
}
100100

101-
// TODO: sort by creation time desc, need to adjust test
101+
// Sort by creation time (ascending: oldest first for predictable batch order)
102102
sort.Sort(GPUNodeByCreationTimestamp(h.nodesToUpdate))
103103

104104
return total, total - len(h.nodesToUpdate), false, nil

internal/controller/gpunode_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -626,7 +626,7 @@ func (r *GPUNodeReconciler) checkDriverProbeJobStatus(job *batchv1.Job, log logr
626626
}
627627

628628
func (r *GPUNodeReconciler) resolveNodeVendor(_ctx context.Context, _node *tfv1.GPUNode) (string, error) {
629-
// TODO: Implement this
629+
// Future: detect non-Nvidia GPU vendors (e.g. AMD, Ascend) from node labels or device plugin
630630
return constants.AcceleratorVendorNvidia, nil
631631
}
632632

internal/controller/gpupool_compaction_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ var jobStarted sync.Map
4646

4747
// Strategy #1: check if any empty node can be deleted (must satisfy 'allocatedCapacity + warmUpCapacity <= currentCapacity - toBeDeletedCapacity') -- Done
4848

49-
// TODO: implement other strategies
49+
// Future: implement other compaction strategies (e.g. load-based, cost-based)
5050
// Strategy #2: check if whole Pool can be bin-packing into less nodes, check from low-priority to high-priority nodes one by one, if workloads could be moved to other nodes (using a simulated scheduler), evict it and mark cordoned, let scheduler to re-schedule
5151

5252
// Strategy #3: check if any node can be reduced to 1/2 size. for remaining nodes, check if allocated size < 1/2 * total size, if so, check if can buy smaller instance, note that the compaction MUST be GPU level, not node level

internal/controller/gpupool_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ func (r *GPUPoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
155155
if err != nil {
156156
return ctrl.Result{}, err
157157
}
158-
// Set phase to updating and let GPUNode event trigger the check and update capacity loop, util all nodes are ready
158+
// Set phase to updating and let GPUNode event trigger the check and update capacity loop, until all nodes are ready
159159
if len(newCreatedNodes) > 0 {
160160
pendingGPUNodeStateLock.Lock()
161161
for claimName := range newCreatedNodes {

internal/controller/gpupool_node_provision.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ func (r *GPUPoolReconciler) reconcilePoolCapacityWithProvisioner(ctx context.Con
152152

153153
var errList []error
154154

155-
// lock the pool before next node scaling up loop, add assumed scaling resources util all pending nodeClaims are running
155+
// lock the pool before next node scaling up loop, add assumed scaling resources until all pending nodeClaims are running
156156
newCreatedNodes := map[string]tfv1.Resource{}
157157
for _, nodeClaim := range gpuNodeParams {
158158
go func(nodeClaim tfv1.GPUNodeClaimSpec) {

internal/controller/node_controller.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.
7070
return ctrl.Result{}, nil
7171
}
7272

73-
// Remove TensorFusion taint if exists (TODO: Remove after version 1.50)
73+
// Remove TensorFusion taint if exists (deprecated: backward compatibility for legacy deployments)
7474
// Skip taint removal if node is being deleted or evicted
7575
if node.DeletionTimestamp.IsZero() {
7676
taintRemoved, err := r.removeTensorFusionTaint(ctx, node)
@@ -263,7 +263,7 @@ func (r *NodeReconciler) SetupWithManager(mgr ctrl.Manager) error {
263263
Complete(r)
264264
}
265265

266-
// Remove TensorFusion taint if exists (TODO: Remove after version 1.50)
266+
// Remove TensorFusion taint if exists (deprecated: backward compatibility for legacy deployments)
267267
func (r *NodeReconciler) removeTensorFusionTaint(ctx context.Context, node *corev1.Node) (bool, error) {
268268
taintKey := constants.NodeUsedByTaintKey
269269
taintValue := constants.TensorFusionSystemName

internal/controller/tensorfusioncluster_controller.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ func (r *TensorFusionClusterReconciler) Reconcile(ctx context.Context, req ctrl.
155155
return ctrl.Result{RequeueAfter: constants.PendingRequeueDuration}, nil
156156
}
157157

158-
// when updating, check util they are ready
158+
// when updating, check until they are ready
159159
// check status, if not ready, requeue after backoff delay, if all components are ready, set as ready
160160
if ready, conditions, err := r.checkTFClusterComponentsReady(ctx, tfc); err != nil {
161161
return ctrl.Result{}, err
@@ -198,7 +198,7 @@ func (r *TensorFusionClusterReconciler) listOwnedGPUPools(ctx context.Context, t
198198
}
199199

200200
func (r *TensorFusionClusterReconciler) reconcileTimeSeriesDatabase(_ context.Context, _ *tfv1.TensorFusionCluster) (bool, error) {
201-
// TODO: Not implemented yet
201+
// Future: implement time series database reconciliation
202202
return false, nil
203203
}
204204

0 commit comments

Comments (0)