Skip to content

Commit 1241c1e

Browse files
authored
warn when a single pod workload is using roce_gdr (#64)
1 parent 41d9e11 commit 1241c1e

File tree

1 file changed

+12
-3
lines changed

1 file changed

+12
-3
lines changed

tools/cluster-checker/checker.js

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -149,7 +149,7 @@ function reservation (pod) {
149149
}
150150

151151
// check container resource requests against node_resources
152-
function checkContainerResources(namespace, workload, container) {
152+
function checkContainerResources(namespace, workload, workloadReplicas, container) {
153153
// selectively merge limits into requests
154154
const resources = {}
155155
for (const k in container.resources?.requests ?? []) {
@@ -190,6 +190,11 @@ function checkContainerResources(namespace, workload, container) {
190190
if (gpus > 0 && (mem > 0) && (mem/gpus > k8srp.memoryParser(nodeResources['memory'])/nodeResources['nvidia.com/gpu'])) {
191191
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${resources['memory']} memory but only ${gpus} GPUs`)
192192
}
193+
194+
// warn if other resource constraints are violated
195+
if (gdr > 0 && workloadReplicas < 2) {
196+
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" is a single pod workload that is requesting ${gdr} roce_gdr`)
197+
}
193198
}
194199

195200
// check user namespace
@@ -225,12 +230,16 @@ async function checkUserNamespace (client, namespace, queues) {
225230
}
226231

227232
// report misconfigured resource requests
233+
let replicas = 0
234+
for (const podSet of workload.spec?.podSets) {
235+
replicas += podSet.count ?? 0
236+
}
228237
for (const podSet of workload.spec?.podSets) {
229238
for (const ic of podSet.template?.spec?.initContainers ?? []) {
230-
checkContainerResources(namespace, workload, ic)
239+
checkContainerResources(namespace, workload, replicas, ic)
231240
}
232241
for (const c of podSet.template?.spec?.containers ?? []) {
233-
checkContainerResources(namespace, workload, c)
242+
checkContainerResources(namespace, workload, replicas, c)
234243
}
235244
}
236245
}

0 commit comments

Comments
 (0)