@@ -266,17 +266,20 @@ async function main () {
266266 const client = new Client ( )
267267
268268 let clusterGPUs = 0 // cluster capacity
269- const noScheduleGPUs = 0 // no-schedule GPUs
270- const noExecuteGPUs = 0 // no-execute GPUs
269+ let noScheduleGPUs = 0 // no-schedule GPUs
270+ let noExecuteGPUs = 0 // no-execute GPUs
271271 let usedGPUs = 0 // GPU usage by admitted workloads
272272 let borrowedGPUs = 0 // GPU borrowed from the cohort
273273 let quotaGPUs = 0 // nominal GPU quota (excluding slack queue)
274- let slackGPUs = 0 // lending limit on slack queue
274+ let limitGPUs = 0 // lending limit on slack queue
275+ let slackGPUs = 0 // nominal GPU quota on slack queue
275276
276277 const config = await client . readOperatorConfig ( )
277278 const taints = config . autopilot ?. resourceTaints ?. [ 'nvidia.com/gpu' ]
278279 const slackQueueName = config . slackQueueName
279280
281+ let newline = false
282+
280283 // compute GPU counts
281284 const nodes = await client . nodes ( )
282285 for ( const node of nodes ) {
@@ -288,27 +291,39 @@ async function main () {
288291 for ( const taint of taints ?? [ ] ) {
289292 if ( node . metadata . labels ?. [ taint . key ] === taint . value ) {
290293 if ( taint . effect === 'NoExecute' ) {
294+ console . log ( `WARNING: node "${ node . metadata . name } " has label "${ taint . key } "="${ taint . value } " with effect "${ taint . effect } "` )
295+ newline = true
291296 node . noExecute = true
292297 } else if ( taint . effect === 'NoSchedule' ) {
298+ console . log ( `WARNING: node "${ node . metadata . name } " has label "${ taint . key } "="${ taint . value } " with effect "${ taint . effect } "` )
299+ newline = true
293300 node . noSchedule = true
294301 }
295302 }
296303 }
297304 for ( const taint of node . spec . taints ?? [ ] ) {
298305 if ( taint . effect === 'NoExecute' ) {
306+ console . log ( `WARNING: node "${ node . metadata . name } " has taint "${ taint . key } " with effect "${ taint . effect } "` )
307+ newline = true
299308 node . noExecute = true
300309 } else if ( taint . effect === 'NoSchedule' ) {
310+ console . log ( `WARNING: node "${ node . metadata . name } " has taint "${ taint . key } " with effect "${ taint . effect } "` )
311+ newline = true
301312 node . noSchedule = true
302313 }
303314 }
304315 if ( node . noExecute ) {
305- node . noExecuteGPUs += gpus
316+ noExecuteGPUs += gpus
306317 } else if ( node . noSchedule ) { // no double counting
307- node . noScheduleGPUs += gpus
318+ noScheduleGPUs += gpus
308319 }
309320 }
310321 }
311322
323+ if ( newline ) {
324+ console . log ( )
325+ }
326+
312327 // collect cluster queue metrics
313328 const clusterQueues = await client . clusterQueues ( )
314329 const queues = { }
@@ -348,7 +363,8 @@ async function main () {
348363 usedGPUs += queue . usage
349364 borrowedGPUs += queue . borrowed
350365 if ( clusterQueue . metadata . name === slackQueueName ) {
351- slackGPUs = queue . lendingLimit
366+ slackGPUs = queue . quota
367+ limitGPUs = queue . lendingLimit
352368 // do not include slack queue in table
353369 } else {
354370 quotaGPUs += queue . quota
@@ -368,8 +384,9 @@ async function main () {
368384 console . log ( `Schedulable GPU count: = ${ pad ( clusterGPUs - noExecuteGPUs - noScheduleGPUs , width ) } ` )
369385 console . log ( )
370386 console . log ( `Nominal GPU quota: ${ pad ( quotaGPUs , width ) } ` )
371- console . log ( `Slack GPU quota: + ${ pad ( slackGPUs , width ) } ` )
372- console . log ( `Total GPU quota: = ${ pad ( quotaGPUs + slackGPUs , width ) } ` )
387+ console . log ( `Maximum slack GPU quota: + ${ pad ( slackGPUs , width ) } ` )
388+ console . log ( `Slack GPU quota adjustment: - ${ pad ( slackGPUs - limitGPUs , width ) } ` )
389+ console . log ( `Current GPU quota: = ${ pad ( quotaGPUs + limitGPUs , width ) } ` )
373390 console . log ( )
374391 console . log ( `GPU usage by admitted workloads: ${ pad ( usedGPUs , width ) } ` )
375392 console . log ( `Borrowed GPU count: ${ pad ( borrowedGPUs , width ) } ` )
@@ -379,8 +396,12 @@ async function main () {
379396 console . log ( 'WARNING: nominal GPU quota is greater than schedulable GPU count' )
380397 }
381398
399+ if ( quotaGPUs + slackGPUs < clusterGPUs ) {
400+ console . log ( 'WARNING: maximum GPU quota is lower than total GPU count' )
401+ }
402+
382403 if ( quotaGPUs + slackGPUs > clusterGPUs ) {
383- console . log ( 'WARNING: total GPU quota is greater than total GPU count' )
404+ console . log ( 'WARNING: maximum GPU quota is greater than total GPU count' )
384405 }
385406
386407 // check all accessible namespaces
0 commit comments