|
13 | 13 | import reframe.utility.osext as osext |
14 | 14 | import reframe.utility.sanity as sn |
15 | 15 |
|
16 | | -sys.path.append(str(pathlib.Path(__file__).parent / 'mixins')) |
| 16 | +sys.path.append(str(pathlib.Path(__file__).parent.parent.parent / 'mixins')) |
17 | 17 | from uenv_slurm_mpi_options import UenvSlurmMpiOptionsMixin # noqa: E402 |
18 | 18 |
|
19 | 19 |
|
@@ -197,7 +197,8 @@ def assert_found_exceeded_memory(self): |
197 | 197 |
|
198 | 198 |
|
199 | 199 | @rfm.simple_test |
200 | | -class MemoryOverconsumptionCheckMPI(SlurmCompiledBaseCheck, UenvSlurmMpiOptionsMixin): |
| 200 | +class MemoryOverconsumptionCheckMPI(SlurmCompiledBaseCheck, |
| 201 | + UenvSlurmMpiOptionsMixin): |
201 | 202 | # TODO: maintainers = ['@jgphpc', '@ekouts'] |
202 | 203 | descr = 'Tests for max allocatable memory' |
203 | 204 | valid_systems = ['+remote'] |
@@ -441,13 +442,19 @@ class SlurmPrologEpilogCheck(rfm.RunOnlyRegressionTest): |
441 | 442 | epilog_dir = '/etc/slurm/node_epilog.d/' |
442 | 443 | prerun_cmds = [f'ln -s {kafka_logger} ./kafka_logger'] |
443 | 444 | test_files = [] |
444 | | - for file in os.listdir(epilog_dir): |
445 | | - if os.path.isfile(os.path.join(epilog_dir, file)): |
446 | | - test_files.append(os.path.join(epilog_dir, file)) |
447 | | - |
448 | | - for file in os.listdir(prolog_dir): |
449 | | - if os.path.isfile(os.path.join(prolog_dir, file)): |
450 | | - test_files.append(os.path.join(prolog_dir, file)) |
| 445 | + try: |
| 446 | + for file in os.listdir(epilog_dir): |
| 447 | + if os.path.isfile(os.path.join(epilog_dir, file)): |
| 448 | + test_files.append(os.path.join(epilog_dir, file)) |
| 449 | + except PermissionError: |
| 450 | + pass |
| 451 | + |
| 452 | + try: |
| 453 | + for file in os.listdir(prolog_dir): |
| 454 | + if os.path.isfile(os.path.join(prolog_dir, file)): |
| 455 | + test_files.append(os.path.join(prolog_dir, file)) |
| 456 | + except PermissionError: |
| 457 | + pass |
451 | 458 |
|
452 | 459 | test_file = parameter(test_files) |
453 | 460 | tags = {'vs-node-validator'} |
@@ -544,6 +551,28 @@ def validate(self): |
544 | 551 | return sn.assert_not_found(r'\bisolcpus=', self.stdout) |
545 | 552 |
|
546 | 553 |
|
@rfm.simple_test
class SlurmNoUvmPerfAccessCounterMigration(rfm.RunOnlyRegressionTest):
    # Run-only check: `cat` the nvidia_uvm module parameter
    # `uvm_perf_access_counter_mimc_migration_enable` on a compute node and
    # require that it reads 0 (see `descr` for the motivation).

    # Only systems/partitions advertising all three features are selected.
    valid_systems = ['+remote +scontrol +nvgpu']
    valid_prog_environs = ['builtin']
    maintainers = ['msimberg', 'SSA']
    descr = '''
    Check that uvm_perf_access_counter_mimc_migration_enable is set to 0
    as it is buggy in older drivers.
    '''
    time_limit = '1m'
    num_tasks_per_node = 1
    # Nothing to stage: the test only reads a sysfs file.
    sourcesdir = None
    executable = 'cat'
    executable_opts = [('/sys/module/nvidia_uvm/parameters/'
                        'uvm_perf_access_counter_mimc_migration_enable')]
    tags = {'production', 'maintenance', 'slurm'}

    @sanity_function
    def validate(self):
        """Pass only if some line of stdout consists solely of ``0``.

        The pattern is anchored on purpose: a bare ``0`` would also match a
        zero embedded in another value (e.g. ``10``) or in any unrelated
        line that ends up in the job's stdout, letting the check pass
        spuriously. ReFrame applies ``re.MULTILINE`` to sanity patterns, so
        ``^``/``$`` match at line boundaries.
        """
        return sn.assert_found(r'^\s*0\s*$', self.stdout)
| 574 | + |
| 575 | + |
547 | 576 | @rfm.simple_test |
548 | 577 | class SlurmGPUGresTest(SlurmSimpleBaseCheck): |
549 | 578 | descr = ''' |
@@ -572,11 +601,20 @@ def assert_gres_valid(self): |
572 | 601 | gpu_count = self.current_partition.select_devices('gpu')[0].num_devices |
573 | 602 | part_re = rf'Partitions=\S*{partition_name}' |
574 | 603 | gres_re = rf'gres/gpu={gpu_count} ' |
575 | | - node_count = sn.count(sn.extractall(part_re, self.stdout)) |
576 | | - gres_count = sn.count( |
577 | | - sn.extractall(rf'{part_re}.*{gres_re}', self.stdout)) |
578 | | - return sn.assert_eq( |
579 | | - node_count, gres_count, |
580 | | - f'{gres_count}/{node_count} of ' |
581 | | - f'{partition_name} nodes satisfy {gres_re}' |
| 604 | + node_re = r'NodeName=(\S+)' |
| 605 | + |
| 606 | + all_nodes = sn.evaluate( |
| 607 | + sn.extractall(rf'{node_re}.*{part_re}', self.stdout, 1) |
| 608 | + ) |
| 609 | + good_nodes = sn.evaluate( |
| 610 | + sn.extractall(rf'{node_re}.*{part_re}.*{gres_re}', |
| 611 | + self.stdout, 1) |
| 612 | + ) |
| 613 | + bad_nodes = ','.join(sorted(set(all_nodes) - set(good_nodes))) |
| 614 | + |
| 615 | + return sn.assert_true( |
| 616 | + len(bad_nodes) == 0, |
| 617 | + msg=(f'{len(good_nodes)}/{len(all_nodes)} of ' |
| 618 | + f'{partition_name} nodes satisfy {gres_re}. Bad nodes: ' |
| 619 | + f'{bad_nodes}') |
582 | 620 | ) |
0 commit comments