Skip to content

Commit 9c60a2d

Browse files
authored
Improve reporting of lending limit (#81)
1 parent bfdd2aa commit 9c60a2d

File tree

1 file changed

+30
-9
lines changed

1 file changed

+30
-9
lines changed

tools/cluster-checker/checker.js

Lines changed: 30 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -266,17 +266,20 @@ async function main () {
266266
const client = new Client()
267267

268268
let clusterGPUs = 0 // cluster capacity
269-
const noScheduleGPUs = 0 // no-schedule GPUs
270-
const noExecuteGPUs = 0 // no-execute GPUs
269+
let noScheduleGPUs = 0 // no-schedule GPUs
270+
let noExecuteGPUs = 0 // no-execute GPUs
271271
let usedGPUs = 0 // GPU usage by admitted workloads
272272
let borrowedGPUs = 0 // GPU borrowed from the cohort
273273
let quotaGPUs = 0 // nominal GPU quota (excluding slack queue)
274-
let slackGPUs = 0 // lending limit on slack queue
274+
let limitGPUs = 0 // lending limit on slack queue
275+
let slackGPUs = 0 // nominal GPU quota on slack queue
275276

276277
const config = await client.readOperatorConfig()
277278
const taints = config.autopilot?.resourceTaints?.['nvidia.com/gpu']
278279
const slackQueueName = config.slackQueueName
279280

281+
let newline = false
282+
280283
// compute GPU counts
281284
const nodes = await client.nodes()
282285
for (const node of nodes) {
@@ -288,27 +291,39 @@ async function main () {
288291
for (const taint of taints ?? []) {
289292
if (node.metadata.labels?.[taint.key] === taint.value) {
290293
if (taint.effect === 'NoExecute') {
294+
console.log(`WARNING: node "${node.metadata.name}" has label "${taint.key}"="${taint.value}" with effect "${taint.effect}"`)
295+
newline = true
291296
node.noExecute = true
292297
} else if (taint.effect === 'NoSchedule') {
298+
console.log(`WARNING: node "${node.metadata.name}" has label "${taint.key}"="${taint.value}" with effect "${taint.effect}"`)
299+
newline = true
293300
node.noSchedule = true
294301
}
295302
}
296303
}
297304
for (const taint of node.spec.taints ?? []) {
298305
if (taint.effect === 'NoExecute') {
306+
console.log(`WARNING: node "${node.metadata.name}" has taint "${taint.key}" with effect "${taint.effect}"`)
307+
newline = true
299308
node.noExecute = true
300309
} else if (taint.effect === 'NoSchedule') {
310+
console.log(`WARNING: node "${node.metadata.name}" has taint "${taint.key}" with effect "${taint.effect}"`)
311+
newline = true
301312
node.noSchedule = true
302313
}
303314
}
304315
if (node.noExecute) {
305-
node.noExecuteGPUs += gpus
316+
noExecuteGPUs += gpus
306317
} else if (node.noSchedule) { // no double counting
307-
node.noScheduleGPUs += gpus
318+
noScheduleGPUs += gpus
308319
}
309320
}
310321
}
311322

323+
if (newline) {
324+
console.log()
325+
}
326+
312327
// collect cluster queue metrics
313328
const clusterQueues = await client.clusterQueues()
314329
const queues = {}
@@ -348,7 +363,8 @@ async function main () {
348363
usedGPUs += queue.usage
349364
borrowedGPUs += queue.borrowed
350365
if (clusterQueue.metadata.name === slackQueueName) {
351-
slackGPUs = queue.lendingLimit
366+
slackGPUs = queue.quota
367+
limitGPUs = queue.lendingLimit
352368
// do not include slack queue in table
353369
} else {
354370
quotaGPUs += queue.quota
@@ -368,8 +384,9 @@ async function main () {
368384
console.log(`Schedulable GPU count: = ${pad(clusterGPUs - noExecuteGPUs - noScheduleGPUs, width)}`)
369385
console.log()
370386
console.log(`Nominal GPU quota: ${pad(quotaGPUs, width)}`)
371-
console.log(`Slack GPU quota: + ${pad(slackGPUs, width)}`)
372-
console.log(`Total GPU quota: = ${pad(quotaGPUs + slackGPUs, width)}`)
387+
console.log(`Maximum slack GPU quota: + ${pad(slackGPUs, width)}`)
388+
console.log(`Slack GPU quota adjustment: - ${pad(slackGPUs - limitGPUs, width)}`)
389+
console.log(`Current GPU quota: = ${pad(quotaGPUs + limitGPUs, width)}`)
373390
console.log()
374391
console.log(`GPU usage by admitted workloads: ${pad(usedGPUs, width)}`)
375392
console.log(`Borrowed GPU count: ${pad(borrowedGPUs, width)}`)
@@ -379,8 +396,12 @@ async function main () {
379396
console.log('WARNING: nominal GPU quota is greater than schedulable GPU count')
380397
}
381398

399+
if (quotaGPUs + slackGPUs < clusterGPUs) {
400+
console.log('WARNING: maximum GPU quota is lower than total GPU count')
401+
}
402+
382403
if (quotaGPUs + slackGPUs > clusterGPUs) {
383-
console.log('WARNING: total GPU quota is greater than total GPU count')
404+
console.log('WARNING: maximum GPU quota is greater than total GPU count')
384405
}
385406

386407
// check all accessible namespaces

0 commit comments

Comments
 (0)