@@ -4,10 +4,10 @@ const k8s = require('@kubernetes/client-node')
44const k8srp = require ( 'kubernetes-resource-parser' )
55
66const nodeResources = {
7- 'nvidia.com/gpu' : 8 ,
8- 'nvidia.com/roce_gdr' : 2 ,
9- 'cpu' : 80 ,
10- 'memory' : '800G'
7+ 'nvidia.com/gpu' : 8 ,
8+ 'nvidia.com/roce_gdr' : 2 ,
9+ 'cpu' : 80 ,
10+ 'memory' : '800G'
1111}
1212
1313class Client {
@@ -39,11 +39,11 @@ class Client {
3939 return res . body
4040 }
4141
42- async readOperatorConfig ( ) {
42+ async readOperatorConfig ( ) {
4343 const options = [
44- { ns : 'redhat-ods-applications' , cm : 'codeflare-operator-config' , key : 'config.yaml' , f : cm => cm . appwrapper ?. Config } ,
45- { ns : 'mlbatch-system' , cm : 'appwrapper-operator-config' , key : 'config.yaml' , f : cm => cm . appwrapper } ,
46- { ns : 'appwrapper-system' , cm : 'appwrapper-operator-config' , key : 'config.yaml' , f : cm => cm . appwrapper }
44+ { ns : 'redhat-ods-applications' , cm : 'codeflare-operator-config' , key : 'config.yaml' , f : cm => cm . appwrapper ?. Config } ,
45+ { ns : 'mlbatch-system' , cm : 'appwrapper-operator-config' , key : 'config.yaml' , f : cm => cm . appwrapper } ,
46+ { ns : 'appwrapper-system' , cm : 'appwrapper-operator-config' , key : 'config.yaml' , f : cm => cm . appwrapper }
4747 ]
4848 for ( const opt of options ) {
4949 try {
@@ -149,7 +149,7 @@ function reservation (pod) {
149149}
150150
151151// check container resource requests against node_resources
152- function checkContainerResources ( namespace , workload , workloadReplicas , container ) {
152+ function checkContainerResources ( namespace , workload , workloadReplicas , container ) {
153153 // selectively merge limits into requests
154154 const resources = { }
155155 for ( const k in container . resources ?. requests ?? [ ] ) {
@@ -181,13 +181,13 @@ function checkContainerResources(namespace, workload, workloadReplicas, containe
181181 }
182182
183183 // warn if the resource:GPU ratio is not proportional to Node resources
184- if ( gdr > 0 && ( ( gpus == 0 ) || ( gpus / gdr < nodeResources [ 'nvidia.com/gpu' ] / nodeResources [ 'nvidia.com/roce_gdr' ] ) ) ) {
184+ if ( gdr > 0 && ( ( gpus == 0 ) || ( gpus / gdr < nodeResources [ 'nvidia.com/gpu' ] / nodeResources [ 'nvidia.com/roce_gdr' ] ) ) ) {
185185 console . log ( `WARNING: workload "${ namespace . metadata . name } /${ workload . metadata . name } " has a container requesting ${ gdr } roce_gdr but only ${ gpus } GPUs` )
186186 }
187- if ( gpus > 0 && ( cpus > 0 ) && ( cpus / gpus > nodeResources [ 'cpu' ] / nodeResources [ 'nvidia.com/gpu' ] ) ) {
187+ if ( gpus > 0 && ( cpus > 0 ) && ( cpus / gpus > nodeResources [ 'cpu' ] / nodeResources [ 'nvidia.com/gpu' ] ) ) {
188188 console . log ( `WARNING: workload "${ namespace . metadata . name } /${ workload . metadata . name } " has a container requesting ${ cpus } cpus but only ${ gpus } GPUs` )
189189 }
190- if ( gpus > 0 && ( mem > 0 ) && ( mem / gpus > k8srp . memoryParser ( nodeResources [ 'memory' ] ) / nodeResources [ 'nvidia.com/gpu' ] ) ) {
190+ if ( gpus > 0 && ( mem > 0 ) && ( mem / gpus > k8srp . memoryParser ( nodeResources [ 'memory' ] ) / nodeResources [ 'nvidia.com/gpu' ] ) ) {
191191 console . log ( `WARNING: workload "${ namespace . metadata . name } /${ workload . metadata . name } " has a container requesting ${ resources [ 'memory' ] } memory but only ${ gpus } GPUs` )
192192 }
193193
@@ -376,6 +376,10 @@ async function main () {
376376 console . log ( 'WARNING: nominal GPU quota is greater than schedulable GPU count' )
377377 }
378378
379+ if ( quotaGPUs + slackGPUs > clusterGPUs ) {
380+ console . log ( 'WARNING: total GPU quota is greater than total GPU count' )
381+ }
382+
379383 // check all accessible namespaces
380384 const namespaces = await client . namespaces ( )
381385 for ( const namespace of namespaces ) {
0 commit comments