@@ -70,7 +70,7 @@ func NewScheduler() *Scheduler {
7070 }
7171 s .nodeManager = newNodeManager ()
7272 s .podManager = newPodManager ()
73- klog .V ( 2 ). InfoS ("Scheduler initialized successfully" )
73+ klog .InfoS ("Scheduler initialized successfully" )
7474 return s
7575}
7676
@@ -87,7 +87,7 @@ func (s *Scheduler) onAddPod(obj any) {
8787 klog .ErrorS (fmt .Errorf ("invalid pod object" ), "Failed to process pod addition" )
8888 return
8989 }
90- klog .V (5 ).InfoS ("Pod added" , "pod" , pod .Name , "namespace" , pod .Namespace )
90+ klog .V (4 ).InfoS ("Pod added" , "pod" , pod .Name , "namespace" , pod .Namespace )
9191 nodeID , ok := pod .Annotations [util .AssignedNodeAnnotations ]
9292 if ! ok {
9393 return
@@ -107,7 +107,7 @@ func (s *Scheduler) onUpdatePod(_, newObj any) {
107107func (s * Scheduler ) onDelPod (obj any ) {
108108 pod , ok := obj .(* corev1.Pod )
109109 if ! ok {
110- klog .Errorf ("unknown add object type" )
110+ klog .ErrorS ( fmt . Errorf ("unknown object type" ), "Failed to process pod deletion " )
111111 return
112112 }
113113 _ , ok = pod .Annotations [util .AssignedNodeAnnotations ]
@@ -156,9 +156,9 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
156156 for {
157157 select {
158158 case <- s .nodeNotify :
159- klog .V (5 ).InfoS ("Received node notification" )
159+ klog .V (4 ).InfoS ("Received node notification" )
160160 case <- ticker .C :
161- klog .InfoS ("Ticker triggered" )
161+ klog .V ( 5 ). InfoS ("Ticker triggered" )
162162 case <- s .stopCh :
163163 klog .InfoS ("Received stop signal, exiting RegisterFromNodeAnnotations" )
164164 return
@@ -168,26 +168,36 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
168168 klog .ErrorS (err , "Failed to list nodes with selector" , "selector" , labelSelector .String ())
169169 continue
170170 }
171- klog .V (5 ).InfoS ("Listed nodes" , "nodeCount" , len (rawNodes ))
171+
172+ klog .V (4 ).InfoS ("Listed nodes" , "nodeCount" , len (rawNodes ))
172173 var nodeNames []string
173174 for _ , val := range rawNodes {
174175 nodeNames = append (nodeNames , val .Name )
175- klog .V (5 ).InfoS ("Processing node" , "nodeName" , val .Name )
176+
177+ klog .V (4 ).InfoS ("Processing node" , "nodeName" , val .Name )
176178
177179 for devhandsk , devInstance := range device .GetDevices () {
178- klog .V (5 ).InfoS ("Checking device health" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
180+
181+ klog .V (4 ).InfoS ("Checking device health" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
179182
180183 nodedevices , err := devInstance .GetNodeDevices (* val )
181184 if err != nil {
182- klog .V (5 ).InfoS ("Failed to get node devices" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
183- continue
185+ klog .V (3 ).InfoS ("Failed to update node device status" , "nodeName" , val .Name , "deviceVendor" , devhandsk , "error" , err )
184186 }
185187
186188 health , needUpdate := devInstance .CheckHealth (devhandsk , val )
187- klog .V (5 ).InfoS ("Device health check result" , "nodeName" , val .Name , "deviceVendor" , devhandsk , "health" , health , "needUpdate" , needUpdate )
189+
190+ klog .V (4 ).InfoS ("Device health check result" ,
191+ "nodeName" , val .Name ,
192+ "deviceVendor" , devhandsk ,
193+ "health" , health ,
194+ "needUpdate" , needUpdate )
188195
189196 if ! health {
190- klog .Warning ("Device is unhealthy, cleaning up node" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
197+
198+ klog .Warning ("Device is unhealthy, cleaning up node" ,
199+ "nodeName" , val .Name ,
200+ "deviceVendor" , devhandsk )
191201 err := devInstance .NodeCleanUp (val .Name )
192202 if err != nil {
193203 klog .ErrorS (err , "Node cleanup failed" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
@@ -197,40 +207,48 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
197207 continue
198208 }
199209 if ! needUpdate {
200- klog .V (5 ).InfoS ("No update needed for device" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
210+ klog .V (4 ).InfoS ("No update needed for device" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
201211 continue
202212 }
203213 _ , ok := util .HandshakeAnnos [devhandsk ]
204214 if ok {
205215 tmppat := make (map [string ]string )
206216 tmppat [util.HandshakeAnnos [devhandsk ]] = "Requesting_" + time .Now ().Format (time .DateTime )
207- klog .InfoS ("New timestamp for annotation" , "nodeName" , val .Name , "annotationKey" , util .HandshakeAnnos [devhandsk ], "annotationValue" , tmppat [util.HandshakeAnnos [devhandsk ]])
217+ klog .V (3 ).InfoS ("New timestamp for annotation" ,
218+ "nodeName" , val .Name ,
219+ "annotationKey" , util .HandshakeAnnos [devhandsk ],
220+ "annotationValue" , tmppat [util.HandshakeAnnos [devhandsk ]])
208221 n , err := util .GetNode (val .Name )
209222 if err != nil {
210223 klog .ErrorS (err , "Failed to get node" , "nodeName" , val .Name )
211224 continue
212225 }
213- klog .V (5 ).InfoS ("Patching node annotations" , "nodeName" , val .Name , "annotations" , tmppat )
226+ klog .V (4 ).InfoS ("Patching node annotations" , "nodeName" , val .Name , "annotations" , tmppat )
214227 if err := util .PatchNodeAnnotations (n , tmppat ); err != nil {
215228 klog .ErrorS (err , "Failed to patch node annotations" , "nodeName" , val .Name )
216229 }
217230 }
218231 nodeInfo := & util.NodeInfo {}
219232 nodeInfo .ID = val .Name
220233 nodeInfo .Node = val
221- klog .V (5 ).InfoS ("Fetching node devices" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
234+ klog .V (4 ).InfoS ("Fetching node devices" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
222235 nodeInfo .Devices = make ([]util.DeviceInfo , 0 )
223236 for _ , deviceinfo := range nodedevices {
224237 nodeInfo .Devices = append (nodeInfo .Devices , * deviceinfo )
225238 }
226239 s .addNode (val .Name , nodeInfo )
227240 if s .nodes [val .Name ] != nil && len (nodeInfo .Devices ) > 0 {
228- if printedLog [val .Name ] {
229- klog .V (5 ).InfoS ("Node device updated" , "nodeName" , val .Name , "deviceVendor" , devhandsk , "nodeInfo" , nodeInfo , "totalDevices" , s .nodes [val .Name ].Devices )
230- } else {
231- klog .InfoS ("Node device added" , "nodeName" , val .Name , "deviceVendor" , devhandsk , "nodeInfo" , nodeInfo , "totalDevices" , s .nodes [val .Name ].Devices )
241+ action := "updated"
242+ if ! printedLog [val .Name ] {
243+ action = "added"
232244 printedLog [val .Name ] = true
233245 }
246+
247+ klog .InfoS ("Node device inventory changed" ,
248+ "nodeName" , val .Name ,
249+ "deviceVendor" , devhandsk ,
250+ "deviceCount" , len (nodeInfo .Devices ),
251+ "action" , action )
234252 }
235253 }
236254 }
@@ -319,7 +337,10 @@ func (s *Scheduler) getNodesUsage(nodes *[]string, task *corev1.Pod) (*map[strin
319337 d .Device .Usedcores += udevice .Usedcores
320338 if strings .Contains (udevice .UUID , "[" ) {
321339 if strings .Compare (d .Device .Mode , "hami-core" ) == 0 {
322- klog .Errorf ("found a mig task running on a hami-core GPU\n " )
340+ klog .ErrorS (fmt .Errorf ("invalid configuration" ), "MIG task assigned to non-MIG GPU" ,
341+ "deviceID" , d .Device .ID ,
342+ "deviceMode" , d .Device .Mode ,
343+ "taskUUID" , udevice .UUID )
323344 d .Device .Health = false
324345 continue
325346 }
@@ -328,21 +349,30 @@ func (s *Scheduler) getNodesUsage(nodes *[]string, task *corev1.Pod) (*map[strin
328349 util .PlatternMIG (& d .Device .MigUsage , d .Device .MigTemplate , tmpIdx )
329350 }
330351 d .Device .MigUsage .UsageList [Instance ].InUse = true
331- klog .V (5 ).Infoln ("add mig usage" , d .Device .MigUsage , "template=" , d .Device .MigTemplate , "uuid=" , d .Device .ID )
352+ klog .V (5 ).InfoS ("MIG device allocated" ,
353+ "deviceID" , d .Device .ID ,
354+ "instanceID" , Instance ,
355+ "template" , d .Device .MigTemplate )
332356 }
333357 }
334358 }
335359 }
336360 }
337361 }
338- klog .V (5 ).Infof ("usage: pod %v assigned %v %v" , p .Name , p .NodeID , p .Devices )
362+ klog .V (4 ).InfoS ("Pod resource assignment" ,
363+ "podName" , p .Name ,
364+ "nodeName" , p .NodeID ,
365+ "deviceCount" , len (p .Devices ))
339366 }
340367 s .overviewstatus = overallnodeMap
341368 for _ , nodeID := range * nodes {
342369 node , err := s .GetNode (nodeID )
343370 if err != nil {
344- // The identified node does not have a gpu device, so the log here has no practical meaning,increase log priority.
345- klog .V (5 ).InfoS ("node unregistered" , "node" , nodeID , "error" , err )
371+ // 注意:使用V(3)而不是InfoS,这样只有在debug模式下才输出
372+ // 这些节点可能是正常的非GPU节点,不应该污染正常日志
373+ klog .V (3 ).InfoS ("Node unregistered or no GPU devices found" ,
374+ "nodeName" , nodeID ,
375+ "error" , err )
346376 failedNodes [nodeID ] = "node unregistered"
347377 continue
348378 }
@@ -396,7 +426,7 @@ func (s *Scheduler) Bind(args extenderv1.ExtenderBindingArgs) (*extenderv1.Exten
396426 klog .ErrorS (err , "Failed to get pod" , "pod" , args .PodName , "namespace" , args .PodNamespace )
397427 return & extenderv1.ExtenderBindingResult {Error : err .Error ()}, err
398428 }
399- klog .InfoS ("Trying to get the target node for pod " , "pod" , args .PodName , "namespace" , args .PodNamespace , "node" , args .Node )
429+ klog .V ( 3 ). InfoS ("Retrieving target node for binding " , "pod" , args .PodName , "namespace" , args .PodNamespace , "node" , args .Node )
400430 node , err := s .kubeClient .CoreV1 ().Nodes ().Get (context .Background (), args .Node , metav1.GetOptions {})
401431 if err != nil {
402432 klog .ErrorS (err , "Failed to get node" , "node" , args .Node )
@@ -413,14 +443,14 @@ func (s *Scheduler) Bind(args extenderv1.ExtenderBindingArgs) (*extenderv1.Exten
413443 for _ , val := range device .GetDevices () {
414444 err = val .LockNode (node , current )
415445 if err != nil {
416- klog .ErrorS (err , "Failed to lock node" , "node" , args .Node , "device " , val )
446+ klog .ErrorS (err , "Failed to lock node" , "node" , args .Node , "error " , err )
417447 goto ReleaseNodeLocks
418448 }
419449 }
420450
421451 err = util .PatchPodAnnotations (current , tmppatch )
422452 if err != nil {
423- klog .ErrorS (err , "Failed to patch pod annotations" , "pod" , klog .KObj (current ))
453+ klog .ErrorS (err , "Failed to patch pod annotations" , "pod" , klog .KObj (current ), "annotations" , tmppatch )
424454 return & extenderv1.ExtenderBindingResult {Error : err .Error ()}, err
425455 }
426456
@@ -453,8 +483,10 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi
453483 }
454484 }
455485 if total == 0 {
456- klog .V (1 ).InfoS ("Pod does not request any resources" ,
457- "pod" , args .Pod .Name )
486+ klog .InfoS ("Pod does not request any resources" ,
487+ "podName" , args .Pod .Name ,
488+ "podNamespace" , args .Pod .Namespace ,
489+ "podUID" , args .Pod .UID )
458490 s .recordScheduleFilterResultEvent (args .Pod , EventReasonFilteringFailed , "" , fmt .Errorf ("does not request any resource" ))
459491 return & extenderv1.ExtenderFilterResult {
460492 NodeNames : args .NodeNames ,
@@ -470,8 +502,9 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi
470502 return nil , err
471503 }
472504 if len (failedNodes ) != 0 {
473- klog .V (5 ).InfoS ("Nodes failed during usage retrieval" ,
474- "nodes" , failedNodes )
505+ klog .InfoS ("Failed to retrieve usage for some nodes" ,
506+ "failedNodeCount" , len (failedNodes ),
507+ "failedNodes" , failedNodes )
475508 }
476509 nodeScores , err := s .calcScore (nodeUsage , nums , annos , args .Pod , failedNodes )
477510 if err != nil {
@@ -480,14 +513,18 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi
480513 return nil , err
481514 }
482515 if len ((* nodeScores ).NodeList ) == 0 {
483- klog .V (4 ).InfoS ("No available nodes meet the required scores" ,
484- "pod" , args .Pod .Name )
516+ klog .InfoS ("No available nodes meet the required scores" ,
517+ "podName" , args .Pod .Name ,
518+ "podNamespace" , args .Pod .Namespace ,
519+ "requestedResources" , k8sutil .Resourcereqs (args .Pod ),
520+ "totalNodesChecked" , len (* args .NodeNames ),
521+ "failedNodesCount" , len (failedNodes ))
485522 s .recordScheduleFilterResultEvent (args .Pod , EventReasonFilteringFailed , "" , fmt .Errorf ("no available node, %d nodes do not meet" , len (* args .NodeNames )))
486523 return & extenderv1.ExtenderFilterResult {
487524 FailedNodes : failedNodes ,
488525 }, nil
489526 }
490- klog .V ( 4 ). Infoln ( "nodeScores_len= " , len (( * nodeScores ) .NodeList ))
527+ klog .InfoS ( "Calculated node scores" , "pod" , args . Pod . Name , "nodeScoresLen " , len (nodeScores .NodeList ))
491528 sort .Sort (nodeScores )
492529 m := (* nodeScores ).NodeList [len ((* nodeScores ).NodeList )- 1 ]
493530 klog .InfoS ("Scheduling pod to node" ,
0 commit comments