Skip to content

Commit c9d456e

Browse files

Authored: Merge branch 'main' into regression-plugin

2 parents (aa23222 + 0f99c77) — commit c9d456e

File tree

4 files changed

+93
-34
lines changed

4 files changed

+93
-34
lines changed

checks/apps/cp2k/cp2k_uenv.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,11 @@
1515

1616
cp2k_references = {
1717
'md': {
18-
'gh200': {'time_run': (69, None, 0.05, 's')},
18+
'gh200': {'time_run': (45, None, 0.05, 's')},
1919
'zen2': {'time_run': (91, None, 0.05, 's')}
2020
},
2121
'pbe': {
22-
'gh200': {'time_run': (67, None, 0.05, 's')},
22+
'gh200': {'time_run': (50, None, 0.05, 's')},
2323
'zen2': {'time_run': (68, None, 0.05, 's')}
2424
},
2525
'rpa': {
Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,33 @@
11
#!/bin/bash
22

3-
set -u
3+
set -eu
44

5-
export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps
6-
export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log
7-
export CUDA_VISIBLE_DEVICES=$(( SLURM_LOCALID % 4 ))
5+
export CUDA_DEVICE_MAX_COPY_CONNECTIONS=8
6+
export CUDA_DEVICE_MAX_CONNECTIONS=8
87

9-
if [ "${SLURM_LOCALID}" -eq 0 ]; then
10-
CUDA_VISIBLE_DEVICES=0,1,2,3 nvidia-cuda-mps-control -d
8+
export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps-$((SLURM_LOCALID % 4))
9+
export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log-$((SLURM_LOCALID % 4))-$(id -un)
10+
11+
export HWLOC_KEEP_NVIDIA_GPU_NUMA_NODES=0
12+
numa_nodes=$(hwloc-calc --physical --intersect NUMAnode $(hwloc-bind --get --taskset)) # do not set CUDA_VISIBLE_DEVICES, enough to set it for the daemon
13+
14+
# Launch MPS from a single rank per GPU
15+
if [[ $SLURM_LOCALID -lt 4 ]]; then
16+
mkdir -p ${CUDA_MPS_PIPE_DIRECTORY}
17+
mkdir -p ${CUDA_MPS_LOG_DIRECTORY}
18+
CUDA_VISIBLE_DEVICES=$((SLURM_LOCALID % 4)) nvidia-cuda-mps-control -d
1119
fi
1220

13-
sleep 5
21+
# Wait for MPS to start
22+
sleep 1
1423

15-
exec "$@"
24+
# Run the command
25+
"$@"
26+
result=$?
1627

17-
if [ "${SLURM_LOCALID}" -eq 0 ]; then
28+
# Quit MPS control daemon before exiting
29+
if [[ $SLURM_LOCALID -lt 4 ]]; then
1830
echo quit | nvidia-cuda-mps-control
1931
fi
32+
33+
exit $result

checks/apps/gromacs/gromacs_check.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -96,16 +96,21 @@ def prepare_build(self):
9696
)
9797
self.prebuild_cmds = [f'tar --strip-components=1 -xzf {tarsource}']
9898
self.build_system.config_opts = [
99-
'-DREGRESSIONTEST_DOWNLOAD=ON',
100-
'-DGMX_MPI=on',
101-
'-DGMX_BUILD_OWN_FFTW=ON',
102-
'-DCP2K_USE_SPGLIB=ON',
99+
'-DREGRESSIONTEST_DOWNLOAD=OFF',
100+
'-DGMX_MPI=ON',
101+
'-DGMX_BUILD_OWN_FFTW=OFF',
103102
'-DGMX_HWLOC=ON',
104-
'-DGMX_SIMD=ARM_NEON_ASIMD',
105103
'-DGMX_INSTALL_NBLIB_API=ON',
106104
]
107105
if self.uarch == 'gh200':
108-
self.build_system.config_opts += ['-DGMX_GPU=CUDA']
106+
self.build_system.config_opts += [
107+
'-DGMX_SIMD=ARM_NEON_ASIMD',
108+
'-DGMX_GPU=CUDA'
109+
]
110+
elif self.uarch == 'zen2':
111+
self.build_system.config_opts += [
112+
'-DGMX_SIMD=AUTO',
113+
]
109114

110115
@sanity_function
111116
def validate_test(self):
@@ -115,7 +120,7 @@ def validate_test(self):
115120

116121
@rfm.simple_test
117122
class gromacs_run_test(rfm.RunOnlyRegressionTest):
118-
executable = './mps-wrapper.sh -- gmx_mpi mdrun -s stmv2.tpr'
123+
executable = 'gmx_mpi mdrun -s stmv2.tpr'
119124
executable_opts = [
120125
'-dlb no', '-npme 1', '-pin off', '-v', '-noconfout', '-nstlist 300'
121126
]
@@ -143,6 +148,8 @@ def prepare_run(self):
143148
self.executable_opts.append(f'-ntomp {self.num_cpus_per_task}')
144149

145150
if self.uarch == 'gh200':
151+
self.executable = './mps-wrapper.sh -- ' + self.executable
152+
146153
self.executable_opts += [
147154
'-pme gpu', '-nb gpu', '-update gpu', '-nsteps 10000'
148155
]

checks/system/slurm/slurm.py

Lines changed: 54 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import reframe.utility.osext as osext
1414
import reframe.utility.sanity as sn
1515

16-
sys.path.append(str(pathlib.Path(__file__).parent / 'mixins'))
16+
sys.path.append(str(pathlib.Path(__file__).parent.parent.parent / 'mixins'))
1717
from uenv_slurm_mpi_options import UenvSlurmMpiOptionsMixin # noqa: E402
1818

1919

@@ -197,7 +197,8 @@ def assert_found_exceeded_memory(self):
197197

198198

199199
@rfm.simple_test
200-
class MemoryOverconsumptionCheckMPI(SlurmCompiledBaseCheck, UenvSlurmMpiOptionsMixin):
200+
class MemoryOverconsumptionCheckMPI(SlurmCompiledBaseCheck,
201+
UenvSlurmMpiOptionsMixin):
201202
# TODO: maintainers = ['@jgphpc', '@ekouts']
202203
descr = 'Tests for max allocatable memory'
203204
valid_systems = ['+remote']
@@ -441,13 +442,19 @@ class SlurmPrologEpilogCheck(rfm.RunOnlyRegressionTest):
441442
epilog_dir = '/etc/slurm/node_epilog.d/'
442443
prerun_cmds = [f'ln -s {kafka_logger} ./kafka_logger']
443444
test_files = []
444-
for file in os.listdir(epilog_dir):
445-
if os.path.isfile(os.path.join(epilog_dir, file)):
446-
test_files.append(os.path.join(epilog_dir, file))
447-
448-
for file in os.listdir(prolog_dir):
449-
if os.path.isfile(os.path.join(prolog_dir, file)):
450-
test_files.append(os.path.join(prolog_dir, file))
445+
try:
446+
for file in os.listdir(epilog_dir):
447+
if os.path.isfile(os.path.join(epilog_dir, file)):
448+
test_files.append(os.path.join(epilog_dir, file))
449+
except PermissionError:
450+
pass
451+
452+
try:
453+
for file in os.listdir(prolog_dir):
454+
if os.path.isfile(os.path.join(prolog_dir, file)):
455+
test_files.append(os.path.join(prolog_dir, file))
456+
except PermissionError:
457+
pass
451458

452459
test_file = parameter(test_files)
453460
tags = {'vs-node-validator'}
@@ -544,6 +551,28 @@ def validate(self):
544551
return sn.assert_not_found(r'\bisolcpus=', self.stdout)
545552

546553

554+
@rfm.simple_test
555+
class SlurmNoUvmPerfAccessCounterMigration(rfm.RunOnlyRegressionTest):
556+
valid_systems = ['+remote +scontrol +nvgpu']
557+
valid_prog_environs = ['builtin']
558+
maintainers = ['msimberg', 'SSA']
559+
descr = '''
560+
Check that uvm_perf_access_counter_mimc_migration_enable is set to 0
561+
as it is buggy in older drivers.
562+
'''
563+
time_limit = '1m'
564+
num_tasks_per_node = 1
565+
sourcesdir = None
566+
executable = 'cat'
567+
executable_opts = [('/sys/module/nvidia_uvm/parameters/'
568+
'uvm_perf_access_counter_mimc_migration_enable')]
569+
tags = {'production', 'maintenance', 'slurm'}
570+
571+
@sanity_function
572+
def validate(self):
573+
return sn.assert_found(r'0', self.stdout)
574+
575+
547576
@rfm.simple_test
548577
class SlurmGPUGresTest(SlurmSimpleBaseCheck):
549578
descr = '''
@@ -572,11 +601,20 @@ def assert_gres_valid(self):
572601
gpu_count = self.current_partition.select_devices('gpu')[0].num_devices
573602
part_re = rf'Partitions=\S*{partition_name}'
574603
gres_re = rf'gres/gpu={gpu_count} '
575-
node_count = sn.count(sn.extractall(part_re, self.stdout))
576-
gres_count = sn.count(
577-
sn.extractall(rf'{part_re}.*{gres_re}', self.stdout))
578-
return sn.assert_eq(
579-
node_count, gres_count,
580-
f'{gres_count}/{node_count} of '
581-
f'{partition_name} nodes satisfy {gres_re}'
604+
node_re = r'NodeName=(\S+)'
605+
606+
all_nodes = sn.evaluate(
607+
sn.extractall(rf'{node_re}.*{part_re}', self.stdout, 1)
608+
)
609+
good_nodes = sn.evaluate(
610+
sn.extractall(rf'{node_re}.*{part_re}.*{gres_re}',
611+
self.stdout, 1)
612+
)
613+
bad_nodes = ','.join(sorted(set(all_nodes) - set(good_nodes)))
614+
615+
return sn.assert_true(
616+
len(bad_nodes) == 0,
617+
msg=(f'{len(good_nodes)}/{len(all_nodes)} of '
618+
f'{partition_name} nodes satisfy {gres_re}. Bad nodes: '
619+
f'{bad_nodes}')
582620
)

0 commit comments

Comments (0)