Skip to content

Commit 16c086e

Browse files
authored
Add warning: total GPU quota is greater than total GPU count (#69)
1 parent 7c9b00b commit 16c086e

File tree

1 file changed

+16
-12
lines changed

1 file changed

+16
-12
lines changed

tools/cluster-checker/checker.js

Lines changed: 16 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -4,10 +4,10 @@ const k8s = require('@kubernetes/client-node')
44
const k8srp = require('kubernetes-resource-parser')
55

66
const nodeResources = {
7-
'nvidia.com/gpu' : 8,
8-
'nvidia.com/roce_gdr' : 2,
9-
'cpu' : 80,
10-
'memory' : '800G'
7+
'nvidia.com/gpu': 8,
8+
'nvidia.com/roce_gdr': 2,
9+
'cpu': 80,
10+
'memory': '800G'
1111
}
1212

1313
class Client {
@@ -39,11 +39,11 @@ class Client {
3939
return res.body
4040
}
4141

42-
async readOperatorConfig() {
42+
async readOperatorConfig () {
4343
const options = [
44-
{ns: 'redhat-ods-applications', cm: 'codeflare-operator-config', key: 'config.yaml', f: cm => cm.appwrapper?.Config },
45-
{ns: 'mlbatch-system', cm: 'appwrapper-operator-config', key: 'config.yaml', f: cm => cm.appwrapper },
46-
{ns: 'appwrapper-system', cm: 'appwrapper-operator-config', key: 'config.yaml', f: cm => cm.appwrapper }
44+
{ ns: 'redhat-ods-applications', cm: 'codeflare-operator-config', key: 'config.yaml', f: cm => cm.appwrapper?.Config },
45+
{ ns: 'mlbatch-system', cm: 'appwrapper-operator-config', key: 'config.yaml', f: cm => cm.appwrapper },
46+
{ ns: 'appwrapper-system', cm: 'appwrapper-operator-config', key: 'config.yaml', f: cm => cm.appwrapper }
4747
]
4848
for (const opt of options) {
4949
try {
@@ -149,7 +149,7 @@ function reservation (pod) {
149149
}
150150

151151
// check container resource requests against node_resources
152-
function checkContainerResources(namespace, workload, workloadReplicas, container) {
152+
function checkContainerResources (namespace, workload, workloadReplicas, container) {
153153
// selectively merge limits into requests
154154
const resources = {}
155155
for (const k in container.resources?.requests ?? []) {
@@ -181,13 +181,13 @@ function checkContainerResources(namespace, workload, workloadReplicas, containe
181181
}
182182

183183
// warn if the resource:GPU ratio is not proportional to Node resources
184-
if (gdr > 0 && ((gpus == 0) || (gpus/gdr < nodeResources['nvidia.com/gpu']/nodeResources['nvidia.com/roce_gdr']))) {
184+
if (gdr > 0 && ((gpus == 0) || (gpus / gdr < nodeResources['nvidia.com/gpu'] / nodeResources['nvidia.com/roce_gdr']))) {
185185
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${gdr} roce_gdr but only ${gpus} GPUs`)
186186
}
187-
if (gpus > 0 && (cpus > 0) && (cpus/gpus > nodeResources['cpu']/nodeResources['nvidia.com/gpu'])) {
187+
if (gpus > 0 && (cpus > 0) && (cpus / gpus > nodeResources['cpu'] / nodeResources['nvidia.com/gpu'])) {
188188
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${cpus} cpus but only ${gpus} GPUs`)
189189
}
190-
if (gpus > 0 && (mem > 0) && (mem/gpus > k8srp.memoryParser(nodeResources['memory'])/nodeResources['nvidia.com/gpu'])) {
190+
if (gpus > 0 && (mem > 0) && (mem / gpus > k8srp.memoryParser(nodeResources['memory']) / nodeResources['nvidia.com/gpu'])) {
191191
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${resources['memory']} memory but only ${gpus} GPUs`)
192192
}
193193

@@ -376,6 +376,10 @@ async function main () {
376376
console.log('WARNING: nominal GPU quota is greater than schedulable GPU count')
377377
}
378378

379+
if (quotaGPUs + slackGPUs > clusterGPUs) {
380+
console.log('WARNING: total GPU quota is greater than total GPU count')
381+
}
382+
379383
// check all accessible namespaces
380384
const namespaces = await client.namespaces()
381385
for (const namespace of namespaces) {

0 commit comments

Comments
 (0)