Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions checks/prgenv/mpi.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@

sys.path.append(str(pathlib.Path(__file__).parent.parent / 'mixins'))
from container_engine import ContainerEngineCPEMixin # noqa: E402
from uenv_slurm_mpi_options import UenvSlurmMpiOptionsMixin # noqa: E402


@rfm.simple_test
class MpiInitTest(rfm.RegressionTest, ContainerEngineCPEMixin):
class MpiInitTest(rfm.RegressionTest, ContainerEngineCPEMixin, UenvSlurmMpiOptionsMixin):
'''
This test checks the value returned by calling MPI_Init_thread.
'''
Expand Down Expand Up @@ -55,9 +56,11 @@ def set_sanity(self):
# - 7.7.15 (ANL base 3.2)
# - 8.0.16.17 (ANL base 3.3)
# - 8.1.4.31,8.1.5.32,8.1.18.4,8.1.21.11,8.1.25.17 (ANL base 3.4a2)
regex = r'= MPI VERSION\s+: CRAY MPICH version \S+ \(ANL base (\S+)\)'
# OpenMPI version:
# - MPI-3.1 = Open MPI v5.0.9
# Group 1 is the library-specific prefix; group 2 is the version token that
# follows it. The token stops at whitespace or ')' so the closing parenthesis
# of e.g. '(ANL base 3.4a2)' is not captured as part of the version.
regex = (r'= (MPI VERSION\s+: CRAY MPICH version \S+ \(ANL base |Open MPI v)'
r'([^\s)]+)')
stdout = os.path.join(self.stagedir, sn.evaluate(self.stdout))
mpich_version = sn.extractsingle(regex, stdout, 1)
mpich_version = sn.extractsingle(regex, stdout, 2)
self.mpithread_version = {
'3.2': {
'MPI_THREAD_SINGLE': 0,
Expand Down Expand Up @@ -125,7 +128,7 @@ def set_sanity(self):


@rfm.simple_test
class MpiGpuDirectOOM(rfm.RegressionTest, ContainerEngineCPEMixin):
class MpiGpuDirectOOM(rfm.RegressionTest, ContainerEngineCPEMixin, UenvSlurmMpiOptionsMixin):
'''
This test checks the issue reported in:
https://github.com/eth-cscs/alps-gh200-reproducers/tree/main/gpudirect-oom
Expand Down
14 changes: 11 additions & 3 deletions checks/system/slurm/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,18 @@
# SPDX-License-Identifier: BSD-3-Clause

import os
import pathlib
import re
import sys

import reframe as rfm
import reframe.core.runtime as rt
import reframe.utility.osext as osext
import reframe.utility.sanity as sn

# The shared mixins live in checks/mixins; this file is
# checks/system/slurm/slurm.py, so that directory is three levels up.
sys.path.append(str(pathlib.Path(__file__).parent.parent.parent / 'mixins'))
from uenv_slurm_mpi_options import UenvSlurmMpiOptionsMixin # noqa: E402


class SlurmSimpleBaseCheck(rfm.RunOnlyRegressionTest):
'''Base class for Slurm simple binary tests'''
Expand Down Expand Up @@ -192,7 +197,7 @@ def assert_found_exceeded_memory(self):


@rfm.simple_test
class MemoryOverconsumptionCheckMPI(SlurmCompiledBaseCheck):
class MemoryOverconsumptionCheckMPI(SlurmCompiledBaseCheck, UenvSlurmMpiOptionsMixin):
# TODO: maintainers = ['@jgphpc', '@ekouts']
descr = 'Tests for max allocatable memory'
valid_systems = ['+remote']
Expand All @@ -207,8 +212,11 @@ class MemoryOverconsumptionCheckMPI(SlurmCompiledBaseCheck):
def set_num_tasks(self):
self.skip_if_no_procinfo()
cpu = self.current_partition.processor
self.num_tasks_per_node = int(
cpu.info['num_cpus'] / cpu.info['num_cpus_per_core'])
# Limit number of tasks because PMIx/OpenMPI can take very long to
# initialize with e.g. 288 ranks on one GH200 node. The test still
# fails in a reasonable time with a limited number of ranks.
self.num_tasks_per_node = min(16, int(
cpu.info['num_cpus'] / cpu.info['num_cpus_per_core']))
self.num_tasks = self.num_tasks_per_node
self.job.launcher.options += ['-u']

Expand Down