Skip to content

Commit 55025d4

Browse files
Code2Life and claude authored
fix: enhance GPU allocation logic and QoS level calculation (#570)
* fix: enhance GPU allocation logic and QoS level calculation - Added nil checks for NodeManagerConfig and NodeSelector in getMatchedPoolName to prevent potential nil pointer dereferences. - Improved GPU allocation methods in GpuAllocator by ensuring thread safety with appropriate locking mechanisms. - Enhanced QoS level calculation to ensure that limits and requests are not zero before determining high QoS, aligning with Kubernetes behavior. * fix: reduce preemption log verbosity to V(4) to avoid log flooding During preemption, N candidate nodes × M victims triggers massive logging from validatePreemption, FilterWithPreempt, and queueingHint paths. Changed all preemption DBG/validation Info-level logs to V(4) to keep production logs clean while preserving debuggability with -v=4. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: update todos and fix process restart env not refreshing issue --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 56b93e7 commit 55025d4

File tree

15 files changed

+201
-117
lines changed

15 files changed

+201
-117
lines changed

api/v1/schedulingconfigtemplate_types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ type SchedulingConfigTemplateSpec struct {
3535
// avoid hot GPU devices and continuously balance the workload
3636
// implemented by mark GPU as hot and trigger evict for re-scheduling
3737
// The hot GPUs will get lower priority for scheduling
38-
// TODO: not implemented yet
38+
// Future: implement rebalancer
3939
// +optional
4040
ReBalancer *ReBalancerConfig `json:"reBalancer,omitempty"`
4141

charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ spec:
194194
avoid hot GPU devices and continuously balance the workload
195195
implemented by mark GPU as hot and trigger evict for re-scheduling
196196
The hot GPUs will get lower priority for scheduling
197+
Future: implement rebalancer
197198
properties:
198199
enable:
199200
type: boolean

config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ spec:
194194
avoid hot GPU devices and continuously balance the workload
195195
implemented by mark GPU as hot and trigger evict for re-scheduling
196196
The hot GPUs will get lower priority for scheduling
197+
Future: implement rebalancer
197198
properties:
198199
enable:
199200
type: boolean

internal/component/hypervisor.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ func (h *Hypervisor) GetResourcesInfo(r client.Client, ctx context.Context, pool
9898
}
9999
}
100100

101-
// TODO: sort by creation time desc, need to adjust test
101+
// Sort by creation time (ascending: oldest first for predictable batch order)
102102
sort.Sort(GPUNodeByCreationTimestamp(h.nodesToUpdate))
103103

104104
return total, total - len(h.nodesToUpdate), false, nil

internal/controller/gpunode_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -626,7 +626,7 @@ func (r *GPUNodeReconciler) checkDriverProbeJobStatus(job *batchv1.Job, log logr
626626
}
627627

628628
func (r *GPUNodeReconciler) resolveNodeVendor(_ctx context.Context, _node *tfv1.GPUNode) (string, error) {
629-
// TODO: Implement this
629+
// Future: detect non-Nvidia GPU vendors (e.g. AMD, Ascend) from node labels or device plugin
630630
return constants.AcceleratorVendorNvidia, nil
631631
}
632632

internal/controller/gpupool_compaction_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ var jobStarted sync.Map
4646

4747
// Strategy #1: check if any empty node can be deleted (must satisfy 'allocatedCapacity + warmUpCapacity <= currentCapacity - toBeDeletedCapacity') -- Done
4848

49-
// TODO: implement other strategies
49+
// Future: implement other compaction strategies (e.g. load-based, cost-based)
5050
// Strategy #2: check if whole Pool can be bin-packing into less nodes, check from low-priority to high-priority nodes one by one, if workloads could be moved to other nodes (using a simulated scheduler), evict it and mark cordoned, let scheduler to re-schedule
5151

5252
// Strategy #3: check if any node can be reduced to 1/2 size. for remaining nodes, check if allocated size < 1/2 * total size, if so, check if can buy smaller instance, note that the compaction MUST be GPU level, not node level

internal/controller/gpupool_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ func (r *GPUPoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
155155
if err != nil {
156156
return ctrl.Result{}, err
157157
}
158-
// Set phase to updating and let GPUNode event trigger the check and update capacity loop, util all nodes are ready
158+
// Set phase to updating and let GPUNode event trigger the check and update capacity loop, until all nodes are ready
159159
if len(newCreatedNodes) > 0 {
160160
pendingGPUNodeStateLock.Lock()
161161
for claimName := range newCreatedNodes {

internal/controller/gpupool_node_provision.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ func (r *GPUPoolReconciler) reconcilePoolCapacityWithProvisioner(ctx context.Con
152152

153153
var errList []error
154154

155-
// lock the pool before next node scaling up loop, add assumed scaling resources util all pending nodeClaims are running
155+
// lock the pool before next node scaling up loop, add assumed scaling resources until all pending nodeClaims are running
156156
newCreatedNodes := map[string]tfv1.Resource{}
157157
for _, nodeClaim := range gpuNodeParams {
158158
go func(nodeClaim tfv1.GPUNodeClaimSpec) {

internal/controller/node_controller.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.
7070
return ctrl.Result{}, nil
7171
}
7272

73-
// Remove TensorFusion taint if exists (TODO: Remove after version 1.50)
73+
// Remove TensorFusion taint if exists (deprecated: backward compatibility for legacy deployments)
7474
// Skip taint removal if node is being deleted or evicted
7575
if node.DeletionTimestamp.IsZero() {
7676
taintRemoved, err := r.removeTensorFusionTaint(ctx, node)
@@ -263,7 +263,7 @@ func (r *NodeReconciler) SetupWithManager(mgr ctrl.Manager) error {
263263
Complete(r)
264264
}
265265

266-
// Remove TensorFusion taint if exists (TODO: Remove after version 1.50)
266+
// Remove TensorFusion taint if exists (deprecated: backward compatibility for legacy deployments)
267267
func (r *NodeReconciler) removeTensorFusionTaint(ctx context.Context, node *corev1.Node) (bool, error) {
268268
taintKey := constants.NodeUsedByTaintKey
269269
taintValue := constants.TensorFusionSystemName

internal/controller/tensorfusioncluster_controller.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ func (r *TensorFusionClusterReconciler) Reconcile(ctx context.Context, req ctrl.
155155
return ctrl.Result{RequeueAfter: constants.PendingRequeueDuration}, nil
156156
}
157157

158-
// when updating, check util they are ready
158+
// when updating, check until they are ready
159159
// check status, if not ready, requeue after backoff delay, if all components are ready, set as ready
160160
if ready, conditions, err := r.checkTFClusterComponentsReady(ctx, tfc); err != nil {
161161
return ctrl.Result{}, err
@@ -198,7 +198,7 @@ func (r *TensorFusionClusterReconciler) listOwnedGPUPools(ctx context.Context, t
198198
}
199199

200200
func (r *TensorFusionClusterReconciler) reconcileTimeSeriesDatabase(_ context.Context, _ *tfv1.TensorFusionCluster) (bool, error) {
201-
// TODO: Not implemented yet
201+
// Future: implement time series database reconciliation
202202
return false, nil
203203
}
204204

0 commit comments

Comments (0)