Skip to content

Commit eefc12b

Browse files
refactor: enhance logging and error handling in scheduler functions
Signed-off-by: haitwang-cloud <[email protected]>
1 parent e377967 commit eefc12b

File tree

1 file changed

+72
-35
lines changed

1 file changed

+72
-35
lines changed

pkg/scheduler/scheduler.go

Lines changed: 72 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ func NewScheduler() *Scheduler {
7070
}
7171
s.nodeManager = newNodeManager()
7272
s.podManager = newPodManager()
73-
klog.V(2).InfoS("Scheduler initialized successfully")
73+
klog.InfoS("Scheduler initialized successfully")
7474
return s
7575
}
7676

@@ -87,7 +87,7 @@ func (s *Scheduler) onAddPod(obj any) {
8787
klog.ErrorS(fmt.Errorf("invalid pod object"), "Failed to process pod addition")
8888
return
8989
}
90-
klog.V(5).InfoS("Pod added", "pod", pod.Name, "namespace", pod.Namespace)
90+
klog.V(4).InfoS("Pod added", "pod", pod.Name, "namespace", pod.Namespace)
9191
nodeID, ok := pod.Annotations[util.AssignedNodeAnnotations]
9292
if !ok {
9393
return
@@ -107,7 +107,7 @@ func (s *Scheduler) onUpdatePod(_, newObj any) {
107107
func (s *Scheduler) onDelPod(obj any) {
108108
pod, ok := obj.(*corev1.Pod)
109109
if !ok {
110-
klog.Errorf("unknown add object type")
110+
klog.ErrorS(fmt.Errorf("unknown object type"), "Failed to process pod deletion")
111111
return
112112
}
113113
_, ok = pod.Annotations[util.AssignedNodeAnnotations]
@@ -156,9 +156,9 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
156156
for {
157157
select {
158158
case <-s.nodeNotify:
159-
klog.V(5).InfoS("Received node notification")
159+
klog.V(4).InfoS("Received node notification")
160160
case <-ticker.C:
161-
klog.InfoS("Ticker triggered")
161+
klog.V(5).InfoS("Ticker triggered")
162162
case <-s.stopCh:
163163
klog.InfoS("Received stop signal, exiting RegisterFromNodeAnnotations")
164164
return
@@ -168,26 +168,36 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
168168
klog.ErrorS(err, "Failed to list nodes with selector", "selector", labelSelector.String())
169169
continue
170170
}
171-
klog.V(5).InfoS("Listed nodes", "nodeCount", len(rawNodes))
171+
172+
klog.V(4).InfoS("Listed nodes", "nodeCount", len(rawNodes))
172173
var nodeNames []string
173174
for _, val := range rawNodes {
174175
nodeNames = append(nodeNames, val.Name)
175-
klog.V(5).InfoS("Processing node", "nodeName", val.Name)
176+
177+
klog.V(4).InfoS("Processing node", "nodeName", val.Name)
176178

177179
for devhandsk, devInstance := range device.GetDevices() {
178-
klog.V(5).InfoS("Checking device health", "nodeName", val.Name, "deviceVendor", devhandsk)
180+
181+
klog.V(4).InfoS("Checking device health", "nodeName", val.Name, "deviceVendor", devhandsk)
179182

180183
nodedevices, err := devInstance.GetNodeDevices(*val)
181184
if err != nil {
182-
klog.V(5).InfoS("Failed to get node devices", "nodeName", val.Name, "deviceVendor", devhandsk)
183-
continue
185+
klog.V(3).InfoS("Failed to update node device status", "nodeName", val.Name, "deviceVendor", devhandsk, "error", err)
184186
}
185187

186188
health, needUpdate := devInstance.CheckHealth(devhandsk, val)
187-
klog.V(5).InfoS("Device health check result", "nodeName", val.Name, "deviceVendor", devhandsk, "health", health, "needUpdate", needUpdate)
189+
190+
klog.V(4).InfoS("Device health check result",
191+
"nodeName", val.Name,
192+
"deviceVendor", devhandsk,
193+
"health", health,
194+
"needUpdate", needUpdate)
188195

189196
if !health {
190-
klog.Warning("Device is unhealthy, cleaning up node", "nodeName", val.Name, "deviceVendor", devhandsk)
197+
198+
klog.Warning("Device is unhealthy, cleaning up node",
199+
"nodeName", val.Name,
200+
"deviceVendor", devhandsk)
191201
err := devInstance.NodeCleanUp(val.Name)
192202
if err != nil {
193203
klog.ErrorS(err, "Node cleanup failed", "nodeName", val.Name, "deviceVendor", devhandsk)
@@ -197,40 +207,48 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
197207
continue
198208
}
199209
if !needUpdate {
200-
klog.V(5).InfoS("No update needed for device", "nodeName", val.Name, "deviceVendor", devhandsk)
210+
klog.V(4).InfoS("No update needed for device", "nodeName", val.Name, "deviceVendor", devhandsk)
201211
continue
202212
}
203213
_, ok := util.HandshakeAnnos[devhandsk]
204214
if ok {
205215
tmppat := make(map[string]string)
206216
tmppat[util.HandshakeAnnos[devhandsk]] = "Requesting_" + time.Now().Format(time.DateTime)
207-
klog.InfoS("New timestamp for annotation", "nodeName", val.Name, "annotationKey", util.HandshakeAnnos[devhandsk], "annotationValue", tmppat[util.HandshakeAnnos[devhandsk]])
217+
klog.V(3).InfoS("New timestamp for annotation",
218+
"nodeName", val.Name,
219+
"annotationKey", util.HandshakeAnnos[devhandsk],
220+
"annotationValue", tmppat[util.HandshakeAnnos[devhandsk]])
208221
n, err := util.GetNode(val.Name)
209222
if err != nil {
210223
klog.ErrorS(err, "Failed to get node", "nodeName", val.Name)
211224
continue
212225
}
213-
klog.V(5).InfoS("Patching node annotations", "nodeName", val.Name, "annotations", tmppat)
226+
klog.V(4).InfoS("Patching node annotations", "nodeName", val.Name, "annotations", tmppat)
214227
if err := util.PatchNodeAnnotations(n, tmppat); err != nil {
215228
klog.ErrorS(err, "Failed to patch node annotations", "nodeName", val.Name)
216229
}
217230
}
218231
nodeInfo := &util.NodeInfo{}
219232
nodeInfo.ID = val.Name
220233
nodeInfo.Node = val
221-
klog.V(5).InfoS("Fetching node devices", "nodeName", val.Name, "deviceVendor", devhandsk)
234+
klog.V(4).InfoS("Fetching node devices", "nodeName", val.Name, "deviceVendor", devhandsk)
222235
nodeInfo.Devices = make([]util.DeviceInfo, 0)
223236
for _, deviceinfo := range nodedevices {
224237
nodeInfo.Devices = append(nodeInfo.Devices, *deviceinfo)
225238
}
226239
s.addNode(val.Name, nodeInfo)
227240
if s.nodes[val.Name] != nil && len(nodeInfo.Devices) > 0 {
228-
if printedLog[val.Name] {
229-
klog.V(5).InfoS("Node device updated", "nodeName", val.Name, "deviceVendor", devhandsk, "nodeInfo", nodeInfo, "totalDevices", s.nodes[val.Name].Devices)
230-
} else {
231-
klog.InfoS("Node device added", "nodeName", val.Name, "deviceVendor", devhandsk, "nodeInfo", nodeInfo, "totalDevices", s.nodes[val.Name].Devices)
241+
action := "updated"
242+
if !printedLog[val.Name] {
243+
action = "added"
232244
printedLog[val.Name] = true
233245
}
246+
247+
klog.InfoS("Node device inventory changed",
248+
"nodeName", val.Name,
249+
"deviceVendor", devhandsk,
250+
"deviceCount", len(nodeInfo.Devices),
251+
"action", action)
234252
}
235253
}
236254
}
@@ -319,7 +337,10 @@ func (s *Scheduler) getNodesUsage(nodes *[]string, task *corev1.Pod) (*map[strin
319337
d.Device.Usedcores += udevice.Usedcores
320338
if strings.Contains(udevice.UUID, "[") {
321339
if strings.Compare(d.Device.Mode, "hami-core") == 0 {
322-
klog.Errorf("found a mig task running on a hami-core GPU\n")
340+
klog.ErrorS(fmt.Errorf("invalid configuration"), "MIG task assigned to non-MIG GPU",
341+
"deviceID", d.Device.ID,
342+
"deviceMode", d.Device.Mode,
343+
"taskUUID", udevice.UUID)
323344
d.Device.Health = false
324345
continue
325346
}
@@ -328,21 +349,30 @@ func (s *Scheduler) getNodesUsage(nodes *[]string, task *corev1.Pod) (*map[strin
328349
util.PlatternMIG(&d.Device.MigUsage, d.Device.MigTemplate, tmpIdx)
329350
}
330351
d.Device.MigUsage.UsageList[Instance].InUse = true
331-
klog.V(5).Infoln("add mig usage", d.Device.MigUsage, "template=", d.Device.MigTemplate, "uuid=", d.Device.ID)
352+
klog.V(5).InfoS("MIG device allocated",
353+
"deviceID", d.Device.ID,
354+
"instanceID", Instance,
355+
"template", d.Device.MigTemplate)
332356
}
333357
}
334358
}
335359
}
336360
}
337361
}
338-
klog.V(5).Infof("usage: pod %v assigned %v %v", p.Name, p.NodeID, p.Devices)
362+
klog.V(4).InfoS("Pod resource assignment",
363+
"podName", p.Name,
364+
"nodeName", p.NodeID,
365+
"deviceCount", len(p.Devices))
339366
}
340367
s.overviewstatus = overallnodeMap
341368
for _, nodeID := range *nodes {
342369
node, err := s.GetNode(nodeID)
343370
if err != nil {
344-
// The identified node does not have a gpu device, so the log here has no practical meaning,increase log priority.
345-
klog.V(5).InfoS("node unregistered", "node", nodeID, "error", err)
371+
// 注意:使用V(3)而不是InfoS,这样只有在debug模式下才输出
372+
// 这些节点可能是正常的非GPU节点,不应该污染正常日志
373+
klog.V(3).InfoS("Node unregistered or no GPU devices found",
374+
"nodeName", nodeID,
375+
"error", err)
346376
failedNodes[nodeID] = "node unregistered"
347377
continue
348378
}
@@ -396,7 +426,7 @@ func (s *Scheduler) Bind(args extenderv1.ExtenderBindingArgs) (*extenderv1.Exten
396426
klog.ErrorS(err, "Failed to get pod", "pod", args.PodName, "namespace", args.PodNamespace)
397427
return &extenderv1.ExtenderBindingResult{Error: err.Error()}, err
398428
}
399-
klog.InfoS("Trying to get the target node for pod", "pod", args.PodName, "namespace", args.PodNamespace, "node", args.Node)
429+
klog.V(3).InfoS("Retrieving target node for binding", "pod", args.PodName, "namespace", args.PodNamespace, "node", args.Node)
400430
node, err := s.kubeClient.CoreV1().Nodes().Get(context.Background(), args.Node, metav1.GetOptions{})
401431
if err != nil {
402432
klog.ErrorS(err, "Failed to get node", "node", args.Node)
@@ -413,14 +443,14 @@ func (s *Scheduler) Bind(args extenderv1.ExtenderBindingArgs) (*extenderv1.Exten
413443
for _, val := range device.GetDevices() {
414444
err = val.LockNode(node, current)
415445
if err != nil {
416-
klog.ErrorS(err, "Failed to lock node", "node", args.Node, "device", val)
446+
klog.ErrorS(err, "Failed to lock node", "node", args.Node, "error", err)
417447
goto ReleaseNodeLocks
418448
}
419449
}
420450

421451
err = util.PatchPodAnnotations(current, tmppatch)
422452
if err != nil {
423-
klog.ErrorS(err, "Failed to patch pod annotations", "pod", klog.KObj(current))
453+
klog.ErrorS(err, "Failed to patch pod annotations", "pod", klog.KObj(current), "annotations", tmppatch)
424454
return &extenderv1.ExtenderBindingResult{Error: err.Error()}, err
425455
}
426456

@@ -453,8 +483,10 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi
453483
}
454484
}
455485
if total == 0 {
456-
klog.V(1).InfoS("Pod does not request any resources",
457-
"pod", args.Pod.Name)
486+
klog.InfoS("Pod does not request any resources",
487+
"podName", args.Pod.Name,
488+
"podNamespace", args.Pod.Namespace,
489+
"podUID", args.Pod.UID)
458490
s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", fmt.Errorf("does not request any resource"))
459491
return &extenderv1.ExtenderFilterResult{
460492
NodeNames: args.NodeNames,
@@ -470,8 +502,9 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi
470502
return nil, err
471503
}
472504
if len(failedNodes) != 0 {
473-
klog.V(5).InfoS("Nodes failed during usage retrieval",
474-
"nodes", failedNodes)
505+
klog.InfoS("Failed to retrieve usage for some nodes",
506+
"failedNodeCount", len(failedNodes),
507+
"failedNodes", failedNodes)
475508
}
476509
nodeScores, err := s.calcScore(nodeUsage, nums, annos, args.Pod, failedNodes)
477510
if err != nil {
@@ -480,14 +513,18 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi
480513
return nil, err
481514
}
482515
if len((*nodeScores).NodeList) == 0 {
483-
klog.V(4).InfoS("No available nodes meet the required scores",
484-
"pod", args.Pod.Name)
516+
klog.InfoS("No available nodes meet the required scores",
517+
"podName", args.Pod.Name,
518+
"podNamespace", args.Pod.Namespace,
519+
"requestedResources", k8sutil.Resourcereqs(args.Pod),
520+
"totalNodesChecked", len(*args.NodeNames),
521+
"failedNodesCount", len(failedNodes))
485522
s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", fmt.Errorf("no available node, %d nodes do not meet", len(*args.NodeNames)))
486523
return &extenderv1.ExtenderFilterResult{
487524
FailedNodes: failedNodes,
488525
}, nil
489526
}
490-
klog.V(4).Infoln("nodeScores_len=", len((*nodeScores).NodeList))
527+
klog.InfoS("Calculated node scores", "pod", args.Pod.Name, "nodeScoresLen", len(nodeScores.NodeList))
491528
sort.Sort(nodeScores)
492529
m := (*nodeScores).NodeList[len((*nodeScores).NodeList)-1]
493530
klog.InfoS("Scheduling pod to node",

0 commit comments

Comments
 (0)