@@ -70,7 +70,7 @@ func NewScheduler() *Scheduler {
7070 }
7171 s .nodeManager = newNodeManager ()
7272 s .podManager = newPodManager ()
73- klog .V ( 2 ). InfoS ("Scheduler initialized successfully" )
73+ klog .InfoS ("Scheduler initialized successfully" )
7474 return s
7575}
7676
@@ -87,7 +87,7 @@ func (s *Scheduler) onAddPod(obj any) {
8787 klog .ErrorS (fmt .Errorf ("invalid pod object" ), "Failed to process pod addition" )
8888 return
8989 }
90- klog .V (5 ).InfoS ("Pod added" , "pod" , pod .Name , "namespace" , pod .Namespace )
90+ klog .V (4 ).InfoS ("Pod added" , "pod" , pod .Name , "namespace" , pod .Namespace )
9191 nodeID , ok := pod .Annotations [util .AssignedNodeAnnotations ]
9292 if ! ok {
9393 return
@@ -107,7 +107,7 @@ func (s *Scheduler) onUpdatePod(_, newObj any) {
107107func (s * Scheduler ) onDelPod (obj any ) {
108108 pod , ok := obj .(* corev1.Pod )
109109 if ! ok {
110- klog .Errorf ("unknown add object type" )
110+ klog .ErrorS ( fmt . Errorf ("unknown object type" ), "Failed to process pod deletion " )
111111 return
112112 }
113113 _ , ok = pod .Annotations [util .AssignedNodeAnnotations ]
@@ -156,9 +156,9 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
156156 for {
157157 select {
158158 case <- s .nodeNotify :
159- klog .V (5 ).InfoS ("Received node notification" )
159+ klog .V (4 ).InfoS ("Received node notification" )
160160 case <- ticker .C :
161- klog .InfoS ("Ticker triggered" )
161+ klog .V ( 5 ). InfoS ("Ticker triggered" )
162162 case <- s .stopCh :
163163 klog .InfoS ("Received stop signal, exiting RegisterFromNodeAnnotations" )
164164 return
@@ -168,26 +168,36 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
168168 klog .ErrorS (err , "Failed to list nodes with selector" , "selector" , labelSelector .String ())
169169 continue
170170 }
171- klog .V (5 ).InfoS ("Listed nodes" , "nodeCount" , len (rawNodes ))
171+
172+ klog .V (4 ).InfoS ("Listed nodes" , "nodeCount" , len (rawNodes ))
172173 var nodeNames []string
173174 for _ , val := range rawNodes {
174175 nodeNames = append (nodeNames , val .Name )
175- klog .V (5 ).InfoS ("Processing node" , "nodeName" , val .Name )
176+
177+ klog .V (4 ).InfoS ("Processing node" , "nodeName" , val .Name )
176178
177179 for devhandsk , devInstance := range device .GetDevices () {
178- klog .V (5 ).InfoS ("Checking device health" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
180+
181+ klog .V (4 ).InfoS ("Checking device health" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
179182
180183 nodedevices , err := devInstance .GetNodeDevices (* val )
181184 if err != nil {
182- klog .V (5 ).InfoS ("Failed to get node devices" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
183- continue
185+ klog .V (3 ).InfoS ("Failed to update node device status" , "nodeName" , val .Name , "deviceVendor" , devhandsk , "error" , err )
184186 }
185187
186188 health , needUpdate := devInstance .CheckHealth (devhandsk , val )
187- klog .V (5 ).InfoS ("Device health check result" , "nodeName" , val .Name , "deviceVendor" , devhandsk , "health" , health , "needUpdate" , needUpdate )
189+
190+ klog .V (4 ).InfoS ("Device health check result" ,
191+ "nodeName" , val .Name ,
192+ "deviceVendor" , devhandsk ,
193+ "health" , health ,
194+ "needUpdate" , needUpdate )
188195
189196 if ! health {
190- klog .Warning ("Device is unhealthy, cleaning up node" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
197+
198+ klog .Warning ("Device is unhealthy, cleaning up node" ,
199+ "nodeName" , val .Name ,
200+ "deviceVendor" , devhandsk )
191201 err := devInstance .NodeCleanUp (val .Name )
192202 if err != nil {
193203 klog .ErrorS (err , "Node cleanup failed" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
@@ -197,40 +207,47 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
197207 continue
198208 }
199209 if ! needUpdate {
200- klog .V (5 ).InfoS ("No update needed for device" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
210+ klog .V (4 ).InfoS ("No update needed for device" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
201211 continue
202212 }
203213 _ , ok := util .HandshakeAnnos [devhandsk ]
204214 if ok {
205215 tmppat := make (map [string ]string )
206216 tmppat [util.HandshakeAnnos [devhandsk ]] = "Requesting_" + time .Now ().Format (time .DateTime )
207- klog .InfoS ("New timestamp for annotation" , "nodeName" , val .Name , "annotationKey" , util .HandshakeAnnos [devhandsk ], "annotationValue" , tmppat [util.HandshakeAnnos [devhandsk ]])
217+ klog .V (3 ).InfoS ("New timestamp for annotation" ,
218+ "nodeName" , val .Name ,
219+ "annotationKey" , util .HandshakeAnnos [devhandsk ],
220+ "annotationValue" , tmppat [util.HandshakeAnnos [devhandsk ]])
208221 n , err := util .GetNode (val .Name )
209222 if err != nil {
210223 klog .ErrorS (err , "Failed to get node" , "nodeName" , val .Name )
211224 continue
212225 }
213- klog .V (5 ).InfoS ("Patching node annotations" , "nodeName" , val .Name , "annotations" , tmppat )
226+ klog .V (4 ).InfoS ("Patching node annotations" , "nodeName" , val .Name , "annotations" , tmppat )
214227 if err := util .PatchNodeAnnotations (n , tmppat ); err != nil {
215228 klog .ErrorS (err , "Failed to patch node annotations" , "nodeName" , val .Name )
216229 }
217230 }
218231 nodeInfo := & util.NodeInfo {}
219232 nodeInfo .ID = val .Name
220233 nodeInfo .Node = val
221- klog .V (5 ).InfoS ("Fetching node devices" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
234+ klog .V (4 ).InfoS ("Fetching node devices" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
222235 nodeInfo .Devices = make ([]util.DeviceInfo , 0 )
223236 for _ , deviceinfo := range nodedevices {
224237 nodeInfo .Devices = append (nodeInfo .Devices , * deviceinfo )
225238 }
226239 s .addNode (val .Name , nodeInfo )
227240 if s .nodes [val .Name ] != nil && len (nodeInfo .Devices ) > 0 {
228241 if printedLog [val .Name ] {
229- klog .V ( 5 ). InfoS ("Node device updated" , "nodeName" , val .Name , "deviceVendor" , devhandsk , "nodeInfo" , nodeInfo , "totalDevices" , s .nodes [val .Name ].Devices )
242+ klog .InfoS ("Node device updated" , "nodeName" , val .Name , "deviceVendor" , devhandsk , "nodeInfo" , nodeInfo , "totalDevices" , s .nodes [val .Name ].Devices )
230243 } else {
231244 klog .InfoS ("Node device added" , "nodeName" , val .Name , "deviceVendor" , devhandsk , "nodeInfo" , nodeInfo , "totalDevices" , s .nodes [val .Name ].Devices )
232245 printedLog [val .Name ] = true
233246 }
247+ klog .InfoS ("Node device inventory changed" ,
248+ "nodeName" , val .Name ,
249+ "deviceVendor" , devhandsk ,
250+ "deviceCount" , len (nodeInfo .Devices ))
234251 }
235252 }
236253 }
@@ -319,7 +336,10 @@ func (s *Scheduler) getNodesUsage(nodes *[]string, task *corev1.Pod) (*map[strin
319336 d .Device .Usedcores += udevice .Usedcores
320337 if strings .Contains (udevice .UUID , "[" ) {
321338 if strings .Compare (d .Device .Mode , "hami-core" ) == 0 {
322- klog .Errorf ("found a mig task running on a hami-core GPU\n " )
339+ klog .ErrorS (fmt .Errorf ("invalid configuration" ), "MIG task assigned to non-MIG GPU" ,
340+ "deviceID" , d .Device .ID ,
341+ "deviceMode" , d .Device .Mode ,
342+ "taskUUID" , udevice .UUID )
323343 d .Device .Health = false
324344 continue
325345 }
@@ -328,21 +348,28 @@ func (s *Scheduler) getNodesUsage(nodes *[]string, task *corev1.Pod) (*map[strin
328348 util .PlatternMIG (& d .Device .MigUsage , d .Device .MigTemplate , tmpIdx )
329349 }
330350 d .Device .MigUsage .UsageList [Instance ].InUse = true
331- klog .V (5 ).Infoln ("add mig usage" , d .Device .MigUsage , "template=" , d .Device .MigTemplate , "uuid=" , d .Device .ID )
351+ klog .V (4 ).InfoS ("MIG device allocated" ,
352+ "deviceID" , d .Device .ID ,
353+ "instanceID" , Instance ,
354+ "template" , d .Device .MigTemplate )
332355 }
333356 }
334357 }
335358 }
336359 }
337360 }
338- klog .V (5 ).Infof ("usage: pod %v assigned %v %v" , p .Name , p .NodeID , p .Devices )
361+ klog .V (4 ).InfoS ("Pod resource assignment" ,
362+ "podName" , p .Name ,
363+ "nodeName" , p .NodeID ,
364+ "deviceCount" , len (p .Devices ))
339365 }
340366 s .overviewstatus = overallnodeMap
341367 for _ , nodeID := range * nodes {
342368 node , err := s .GetNode (nodeID )
343369 if err != nil {
344- // The identified node does not have a gpu device, so the log here has no practical meaning,increase log priority.
345- klog .V (5 ).InfoS ("node unregistered" , "node" , nodeID , "error" , err )
370+ klog .V (3 ).InfoS ("Node unregistered or no GPU devices found" ,
371+ "nodeName" , nodeID ,
372+ "error" , err )
346373 failedNodes [nodeID ] = "node unregistered"
347374 continue
348375 }
@@ -396,7 +423,7 @@ func (s *Scheduler) Bind(args extenderv1.ExtenderBindingArgs) (*extenderv1.Exten
396423 klog .ErrorS (err , "Failed to get pod" , "pod" , args .PodName , "namespace" , args .PodNamespace )
397424 return & extenderv1.ExtenderBindingResult {Error : err .Error ()}, err
398425 }
399- klog .InfoS ("Trying to get the target node for pod " , "pod" , args .PodName , "namespace" , args .PodNamespace , "node" , args .Node )
426+ klog .V ( 3 ). InfoS ("Retrieving target node for binding " , "pod" , args .PodName , "namespace" , args .PodNamespace , "node" , args .Node )
400427 node , err := s .kubeClient .CoreV1 ().Nodes ().Get (context .Background (), args .Node , metav1.GetOptions {})
401428 if err != nil {
402429 klog .ErrorS (err , "Failed to get node" , "node" , args .Node )
@@ -413,14 +440,14 @@ func (s *Scheduler) Bind(args extenderv1.ExtenderBindingArgs) (*extenderv1.Exten
413440 for _ , val := range device .GetDevices () {
414441 err = val .LockNode (node , current )
415442 if err != nil {
416- klog .ErrorS (err , "Failed to lock node" , "node" , args .Node , "device " , val )
443+ klog .ErrorS (err , "Failed to lock node" , "node" , args .Node , "error " , err )
417444 goto ReleaseNodeLocks
418445 }
419446 }
420447
421448 err = util .PatchPodAnnotations (current , tmppatch )
422449 if err != nil {
423- klog .ErrorS (err , "Failed to patch pod annotations" , "pod" , klog .KObj (current ))
450+ klog .ErrorS (err , "Failed to patch pod annotations" , "pod" , klog .KObj (current ), "annotations" , tmppatch )
424451 return & extenderv1.ExtenderBindingResult {Error : err .Error ()}, err
425452 }
426453
@@ -453,8 +480,10 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi
453480 }
454481 }
455482 if total == 0 {
456- klog .V (1 ).InfoS ("Pod does not request any resources" ,
457- "pod" , args .Pod .Name )
483+ klog .InfoS ("Pod does not request any resources" ,
484+ "podName" , args .Pod .Name ,
485+ "podNamespace" , args .Pod .Namespace ,
486+ "podUID" , args .Pod .UID )
458487 s .recordScheduleFilterResultEvent (args .Pod , EventReasonFilteringFailed , "" , fmt .Errorf ("does not request any resource" ))
459488 return & extenderv1.ExtenderFilterResult {
460489 NodeNames : args .NodeNames ,
@@ -470,8 +499,9 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi
470499 return nil , err
471500 }
472501 if len (failedNodes ) != 0 {
473- klog .V (5 ).InfoS ("Nodes failed during usage retrieval" ,
474- "nodes" , failedNodes )
502+ klog .InfoS ("Failed to retrieve usage for some nodes" ,
503+ "failedNodeCount" , len (failedNodes ),
504+ "failedNodes" , failedNodes )
475505 }
476506 nodeScores , err := s .calcScore (nodeUsage , nums , annos , args .Pod , failedNodes )
477507 if err != nil {
@@ -480,14 +510,18 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi
480510 return nil , err
481511 }
482512 if len ((* nodeScores ).NodeList ) == 0 {
483- klog .V (4 ).InfoS ("No available nodes meet the required scores" ,
484- "pod" , args .Pod .Name )
513+ klog .InfoS ("No available nodes meet the required scores" ,
514+ "podName" , args .Pod .Name ,
515+ "podNamespace" , args .Pod .Namespace ,
516+ "requestedResources" , k8sutil .Resourcereqs (args .Pod ),
517+ "totalNodesChecked" , len (* args .NodeNames ),
518+ "failedNodesCount" , len (failedNodes ))
485519 s .recordScheduleFilterResultEvent (args .Pod , EventReasonFilteringFailed , "" , fmt .Errorf ("no available node, %d nodes do not meet" , len (* args .NodeNames )))
486520 return & extenderv1.ExtenderFilterResult {
487521 FailedNodes : failedNodes ,
488522 }, nil
489523 }
490- klog .V ( 4 ). Infoln ( "nodeScores_len= " , len (( * nodeScores ) .NodeList ))
524+ klog .InfoS ( "Calculated node scores" , "pod" , args . Pod . Name , "nodeScoresLen " , len (nodeScores .NodeList ))
491525 sort .Sort (nodeScores )
492526 m := (* nodeScores ).NodeList [len ((* nodeScores ).NodeList )- 1 ]
493527 klog .InfoS ("Scheduling pod to node" ,
0 commit comments