
Commit 0f532af

Code2Life and claude committed
fix: reduce preemption log verbosity to V(4) to avoid log flooding

During preemption, N candidate nodes × M victims trigger massive logging from the validatePreemption, FilterWithPreempt, and queueingHint paths. Changed all preemption DBG/validation Info-level logs to V(4) to keep production logs clean while preserving debuggability with -v=4.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent f4d2279 commit 0f532af
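
The commit relies on logr/klog verbosity gating: a record logged through V(4) is dropped unless the process runs with -v=4 or higher, so the debug detail survives for troubleshooting without flooding default production logs. A minimal standalone sketch of that gating, assuming k8s.io/klog/v2 as the backend (illustrative only, not code from this repository):

```go
package main

import (
	"flag"

	"k8s.io/klog/v2"
)

func main() {
	// klog registers the -v flag that controls verbosity.
	klog.InitFlags(nil)
	flag.Parse()

	logger := klog.Background()

	// Always emitted, regardless of -v.
	logger.Info("preemption started", "candidateNodes", 3, "victims", 5)

	// Emitted only when the process runs with -v=4 or higher,
	// mirroring the V(4) gating this commit applies.
	logger.V(4).Info("[PREEMPT-DBG] simulated release", "gpu", "gpu-0")
}
```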

File tree

2 files changed: +8 −8 lines changed

internal/gpuallocator/gpuallocator.go

Lines changed: 4 additions & 4 deletions

@@ -240,7 +240,7 @@ func (s *GpuAllocator) FilterWithPreempt(
 	// Use a temporary map to accumulate releases for the same GPU (multiple victims on same GPU)
 	gpuReleasedMap := make(map[string]*tfv1.GPU)
 
-	log.FromContext(s.ctx).Info("[PREEMPT-DBG] FilterWithPreempt: starting to simulate release",
+	log.FromContext(s.ctx).V(4).Info("[PREEMPT-DBG] FilterWithPreempt: starting to simulate release",
 		"preemptAllocRequestsCount", len(preemptAllocRequests))
 
 	for i, preemptAllocRequest := range preemptAllocRequests {
@@ -285,7 +285,7 @@ func (s *GpuAllocator) FilterWithPreempt(
 		gpuCopy.Status.Available.Vram.Add(preemptAllocRequest.Request.Vram)
 
 		// Log GPU state after simulated release
-		log.FromContext(s.ctx).Info("[PREEMPT-DBG] Simulated release on GPU",
+		log.FromContext(s.ctx).V(4).Info("[PREEMPT-DBG] Simulated release on GPU",
 			"gpu", gpuName,
 			"node", gpuCopy.Status.NodeSelector[constants.KubernetesHostNameLabel],
 			"victimWorkload", preemptAllocRequest.WorkloadNameNamespace.Name,
@@ -369,7 +369,7 @@ func (s *GpuAllocator) FilterWithPreempt(
 		return nil, nil, fmt.Errorf("apply filters: %w", err)
 	}
 
-	log.FromContext(s.ctx).Info("[PREEMPT-DBG] FilterWithPreempt: filter results",
+	log.FromContext(s.ctx).V(4).Info("[PREEMPT-DBG] FilterWithPreempt: filter results",
 		"filteredGPUsCount", len(filteredGPUs),
 		"toFilterGPUsCount", len(toFilterGPUs),
 		"filterDetailsCount", len(filterDetails))
@@ -1557,7 +1557,7 @@ func (s *GpuAllocator) CheckQuotaAndFilterSingleNodePreempt(
 		return err
 	}
 	if len(filteredGPUs) < int(allocReq.Count) {
-		log.FromContext(s.ctx).Info("[PREEMPT] not enough GPUs after filter during preempt",
+		log.FromContext(s.ctx).V(4).Info("[PREEMPT] not enough GPUs after filter during preempt",
 			"node", nodeName,
 			"requiredGPUs", allocReq.Count,
 			"filteredGPUsCount", len(filteredGPUs),

internal/scheduler/gpuresources/gpuresources.go

Lines changed: 4 additions & 4 deletions

@@ -465,7 +465,7 @@ func (s *GPUFit) checkNominatedPodsGPUReservation(pod *v1.Pod, nodeName string,
 	// Check if remaining resources are sufficient
 	if remainingTflops.Cmp(currentTflopsTotal) < 0 ||
 		remainingVram.Cmp(currentVramTotal) < 0 {
-		s.logger.Info("Insufficient GPU resources after reserving for nominated pods",
+		s.logger.V(4).Info("Insufficient GPU resources after reserving for nominated pods",
 			"currentPod", pod.Name,
 			"node", nodeName,
 			"currentPriority", currentPodPriority,
@@ -783,7 +783,7 @@ func (s *GPUFit) queueingHint(logger klog.Logger, pod *v1.Pod, oldObj, newObj in
 	if pod.Status.NominatedNodeName != "" && newGPU != nil {
 		gpuNodeName := newGPU.Status.NodeSelector[constants.KubernetesHostNameLabel]
 		if gpuNodeName == pod.Status.NominatedNodeName {
-			logger.Info("GPU CR updated on nominated node, immediately requeue preempting pod",
+			logger.V(4).Info("GPU CR updated on nominated node, immediately requeue preempting pod",
 				"pod", klog.KObj(pod),
 				"nominatedNode", pod.Status.NominatedNodeName,
 				"gpu", newGPU.Name)
@@ -903,7 +903,7 @@ func (s *GPUFit) validatePreemption(state fwk.CycleState, pod *v1.Pod, nodeInfo
 	// - Checks quota constraints
 	err = s.allocator.CheckQuotaAndFilterSingleNodePreempt(nodeName, allocReq, victims)
 	if err != nil {
-		s.logger.Info("GPU preemption validation failed",
+		s.logger.V(4).Info("GPU preemption validation failed",
 			"pod", pod.Name,
 			"namespace", pod.Namespace,
 			"node", nodeName,
@@ -916,7 +916,7 @@ func (s *GPUFit) validatePreemption(state fwk.CycleState, pod *v1.Pod, nodeInfo
 
 	// Preemption validated successfully
 	victimList := victims.UnsortedList()
-	s.logger.Info("GPU preemption validated successfully",
+	s.logger.V(4).Info("GPU preemption validated successfully",
 		"pod", pod.Name,
 		"namespace", pod.Namespace,
 		"node", nodeName,
