Skip to content

Commit 2b8fcc1

Browse files
authored
Add bad nodes in SlurmGPUGresTest sanity message (#495)
1 parent 8f34718 commit 2b8fcc1

File tree

1 file changed

+16
-7
lines changed

1 file changed

+16
-7
lines changed

checks/system/slurm/slurm.py

Lines changed: 16 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -578,11 +578,20 @@ def assert_gres_valid(self):
578578
gpu_count = self.current_partition.select_devices('gpu')[0].num_devices
579579
part_re = rf'Partitions=\S*{partition_name}'
580580
gres_re = rf'gres/gpu={gpu_count} '
581-
node_count = sn.count(sn.extractall(part_re, self.stdout))
582-
gres_count = sn.count(
583-
sn.extractall(rf'{part_re}.*{gres_re}', self.stdout))
584-
return sn.assert_eq(
585-
node_count, gres_count,
586-
f'{gres_count}/{node_count} of '
587-
f'{partition_name} nodes satisfy {gres_re}'
581+
node_re = r'NodeName=(\S+)'
582+
583+
all_nodes = sn.evaluate(
584+
sn.extractall(rf'{node_re}.*{part_re}', self.stdout, 1)
585+
)
586+
good_nodes = sn.evaluate(
587+
sn.extractall(rf'{node_re}.*{part_re}.*{gres_re}',
588+
self.stdout, 1)
589+
)
590+
bad_nodes = ','.join(sorted(set(all_nodes) - set(good_nodes)))
591+
592+
return sn.assert_true(
593+
len(bad_nodes) == 0,
594+
msg=(f'{len(good_nodes)}/{len(all_nodes)} of '
595+
f'{partition_name} nodes satisfy {gres_re}. Bad nodes: '
596+
f'{bad_nodes}')
588597
)

0 commit comments

Comments
 (0)