Improve reporting of lending limit

tardieu · tardieu · commit b271c4e5454e · 2024-10-10T09:02:27.000-04:00
diff --git a/tools/cluster-checker/checker.js b/tools/cluster-checker/checker.js
@@ -266,17 +266,20 @@ async function main () {
     const client = new Client()
 
     let clusterGPUs = 0 // cluster capacity
-    const noScheduleGPUs = 0 // no-schedule GPUs
-    const noExecuteGPUs = 0 // no-execute GPUs
+    let noScheduleGPUs = 0 // no-schedule GPUs
+    let noExecuteGPUs = 0 // no-execute GPUs
     let usedGPUs = 0 // GPU usage by admitted workloads
     let borrowedGPUs = 0 // GPU borrowed from the cohort
     let quotaGPUs = 0 // nominal GPU quota (excluding slack queue)
-    let slackGPUs = 0 // lending limit on slack queue
+    let limitGPUs = 0 // lending limit on slack queue
+    let slackGPUs = 0 // nominal GPU quota on slack queue
 
     const config = await client.readOperatorConfig()
     const taints = config.autopilot?.resourceTaints?.['nvidia.com/gpu']
     const slackQueueName = config.slackQueueName
 
+    let newline = false
+
     // compute GPU counts
     const nodes = await client.nodes()
     for (const node of nodes) {
@@ -288,27 +291,39 @@ async function main () {
         for (const taint of taints ?? []) {
           if (node.metadata.labels?.[taint.key] === taint.value) {
             if (taint.effect === 'NoExecute') {
+              console.log(`WARNING: node "${node.metadata.name}" has label "${taint.key}"="${taint.value}" with effect "${taint.effect}"`)
+              newline = true
               node.noExecute = true
             } else if (taint.effect === 'NoSchedule') {
+              console.log(`WARNING: node "${node.metadata.name}" has label "${taint.key}"="${taint.value}" with effect "${taint.effect}"`)
+              newline = true
               node.noSchedule = true
             }
           }
         }
         for (const taint of node.spec.taints ?? []) {
           if (taint.effect === 'NoExecute') {
+            console.log(`WARNING: node "${node.metadata.name}" has taint "${taint.key}" with effect "${taint.effect}"`)
+            newline = true
             node.noExecute = true
           } else if (taint.effect === 'NoSchedule') {
+            console.log(`WARNING: node "${node.metadata.name}" has taint "${taint.key}" with effect "${taint.effect}"`)
+            newline = true
             node.noSchedule = true
           }
         }
         if (node.noExecute) {
-          node.noExecuteGPUs += gpus
+          noExecuteGPUs += gpus
         } else if (node.noSchedule) { // no double counting
-          node.noScheduleGPUs += gpus
+          noScheduleGPUs += gpus
         }
       }
     }
 
+    if (newline) {
+      console.log()
+    }
+
     // collect cluster queue metrics
     const clusterQueues = await client.clusterQueues()
     const queues = {}
@@ -348,7 +363,8 @@ async function main () {
       usedGPUs += queue.usage
       borrowedGPUs += queue.borrowed
       if (clusterQueue.metadata.name === slackQueueName) {
-        slackGPUs = queue.lendingLimit
+        slackGPUs = queue.quota
+        limitGPUs = queue.lendingLimit
         // do not include slack queue in table
       } else {
         quotaGPUs += queue.quota
@@ -368,8 +384,9 @@ async function main () {
     console.log(`Schedulable GPU count:           = ${pad(clusterGPUs - noExecuteGPUs - noScheduleGPUs, width)}`)
     console.log()
     console.log(`Nominal GPU quota:                 ${pad(quotaGPUs, width)}`)
-    console.log(`Slack GPU quota:                 + ${pad(slackGPUs, width)}`)
-    console.log(`Total GPU quota:                 = ${pad(quotaGPUs + slackGPUs, width)}`)
+    console.log(`Maximum slack GPU quota:         + ${pad(slackGPUs, width)}`)
+    console.log(`Slack GPU quota adjustment:      - ${pad(slackGPUs - limitGPUs, width)}`)
+    console.log(`Current GPU quota:               = ${pad(quotaGPUs + limitGPUs, width)}`)
     console.log()
     console.log(`GPU usage by admitted workloads:   ${pad(usedGPUs, width)}`)
     console.log(`Borrowed GPU count:                ${pad(borrowedGPUs, width)}`)
@@ -379,8 +396,12 @@ async function main () {
       console.log('WARNING: nominal GPU quota is greater than schedulable GPU count')
     }
 
+    if (quotaGPUs + slackGPUs < clusterGPUs) {
+      console.log('WARNING: maximum GPU quota is lower than total GPU count')
+    }
+
     if (quotaGPUs + slackGPUs > clusterGPUs) {
-      console.log('WARNING: total GPU quota is greater than total GPU count')
+      console.log('WARNING: maximum GPU quota is greater than total GPU count')
     }
 
     // check all accessible namespaces