Skip to content

Commit 7d9cef5

Browse files
authored
Merge branch 'main' into update_enroot_import_dockerhub
2 parents 9157f17 + d37e026 commit 7d9cef5

File tree

5 files changed: +23 additions, -56 deletions

checks/apps/pytorch/pytorch_megatronlm.py

Lines changed: 1 addition & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -501,37 +501,15 @@ def throughput_per_gpu(self):
501501
))
502502

503503

504-
class pytorch_image_import(rfm.RunOnlyRegressionTest):
505-
image = variable(
506-
str,
507-
value=('docker://jfrog.svc.cscs.ch#reframe-oci/'
508-
'pytorch:25.01-py3_nvrtc-12.9')
509-
)
510-
archive_name = 'pytorch.sqsh'
511-
executable = 'enroot'
512-
valid_systems = ['+ce']
513-
valid_prog_environs = ['builtin']
514-
515-
@run_before('run')
516-
def set_executable_opts(self):
517-
self.executable_opts = ['import', '-o', self.archive_name, self.image]
518-
519-
@sanity_function
520-
def assert_image_imported(self):
521-
return sn.path_exists(os.path.join(self.stagedir, self.archive_name))
522-
523-
524504
@rfm.simple_test
525505
class PyTorchMegatronLM_CE(PyTorchMegatronLM, ContainerEngineMixin):
526506
valid_systems = ['+nvgpu +ce']
527507
valid_prog_environs = ['builtin']
528508
maintainers = ['ml-team']
529-
pytorch_image = fixture(pytorch_image_import, scope='session')
509+
container_image = 'docker://jfrog.svc.cscs.ch#reframe-oci/pytorch:25.01-py3_nvrtc-12.9'
530510

531511
@run_after('setup')
532512
def set_container_config(self):
533-
self.container_image = os.path.join(self.pytorch_image.stagedir,
534-
self.pytorch_image.archive_name)
535513
self.container_env_table = {
536514
'annotations.com.hooks': {
537515
'aws_ofi_nccl.enabled': 'true',

checks/apps/pytorch/pytorch_megatronlm_amd.py

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
class PyTorchMegatronLM_AMD(rfm.RunOnlyRegressionTest):
1919
num_tasks_per_node = 1
2020
default_num_nodes = variable(int, type(None), value=None)
21-
time_limit = '30m'
21+
time_limit = '50m'
2222
megatron_repo = variable(
2323
str, value='https://github.com/ROCm/Megatron-LM'
2424
)
@@ -379,37 +379,15 @@ def throughput_per_gpu(self):
379379
))
380380

381381

382-
class pytorch_image_import(rfm.RunOnlyRegressionTest):
383-
sourcesdir = None
384-
image = variable(
385-
str,
386-
value=('docker://rocm/megatron-lm:v25.6_py312')
387-
)
388-
archive_name = 'pytorch.sqsh'
389-
executable = 'enroot'
390-
valid_systems = ['+ce']
391-
valid_prog_environs = ['builtin']
392-
393-
@run_before('run')
394-
def set_executable_opts(self):
395-
self.executable_opts = ['import', '-o', self.archive_name, self.image]
396-
397-
@sanity_function
398-
def assert_image_imported(self):
399-
return sn.path_exists(os.path.join(self.stagedir, self.archive_name))
400-
401-
402382
@rfm.simple_test
403383
class PyTorchMegatronLM_AMD_CE(PyTorchMegatronLM_AMD, ContainerEngineMixin):
404384
valid_systems = ['+amdgpu +ce']
405385
valid_prog_environs = ['builtin']
406386
maintainers = ['ml-team']
407-
pytorch_image = fixture(pytorch_image_import, scope='session')
387+
container_image = 'rocm/megatron-lm:v25.6_py312'
408388

409389
@run_after('setup')
410390
def set_container_config(self):
411-
self.container_image = os.path.join(self.pytorch_image.stagedir,
412-
self.pytorch_image.archive_name)
413391
self.container_env_table = {
414392
'annotations.com.hooks': {
415393
'aws_ofi_nccl.enabled': 'true',

checks/prgenv/affinity_check.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ class CompileAffinityTool(rfm.CompileOnlyRegressionTest,
2727
env_vars = {'MPICH_GPU_SUPPORT_ENABLED': 0}
2828

2929
sourcesdir = 'https://github.com/vkarak/affinity'
30-
tags = {'production', 'scs', 'maintenance', 'craype'}
30+
tags = {'scs', 'craype'}
3131

3232
@run_before('compile')
3333
def set_build_opts(self):
@@ -76,7 +76,7 @@ class AffinityTestBase(rfm.RunOnlyRegressionTest,
7676
'+openmp +prgenv'
7777
]
7878

79-
tags = {'production', 'scs', 'maintenance', 'craype'}
79+
tags = {'scs', 'craype'}
8080

8181
@run_after('setup')
8282
def skip_cpe_2312(self):

checks/prgenv/mpi.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,11 @@
1111

1212
sys.path.append(str(pathlib.Path(__file__).parent.parent / 'mixins'))
1313
from container_engine import ContainerEngineCPEMixin # noqa: E402
14+
from uenv_slurm_mpi_options import UenvSlurmMpiOptionsMixin # noqa: E402
1415

1516

1617
@rfm.simple_test
17-
class MpiInitTest(rfm.RegressionTest, ContainerEngineCPEMixin):
18+
class MpiInitTest(rfm.RegressionTest, ContainerEngineCPEMixin, UenvSlurmMpiOptionsMixin):
1819
'''
1920
This test checks the value returned by calling MPI_Init_thread.
2021
'''
@@ -55,9 +56,11 @@ def set_sanity(self):
5556
# - 7.7.15 (ANL base 3.2)
5657
# - 8.0.16.17 (ANL base 3.3)
5758
# - 8.1.4.31,8.1.5.32,8.1.18.4,8.1.21.11,8.1.25.17 (ANL base 3.4a2)
58-
regex = r'= MPI VERSION\s+: CRAY MPICH version \S+ \(ANL base (\S+)\)'
59+
# OpenMPI version:
60+
# - MPI-3.1 = Open MPI v5.0.9
61+
regex = r'= (MPI VERSION\s+: CRAY MPICH version \S+ \(ANL base |Open MPI v)([\S^\)]+)'
5962
stdout = os.path.join(self.stagedir, sn.evaluate(self.stdout))
60-
mpich_version = sn.extractsingle(regex, stdout, 1)
63+
mpich_version = sn.extractsingle(regex, stdout, 2)
6164
self.mpithread_version = {
6265
'3.2': {
6366
'MPI_THREAD_SINGLE': 0,
@@ -125,7 +128,7 @@ def set_sanity(self):
125128

126129

127130
@rfm.simple_test
128-
class MpiGpuDirectOOM(rfm.RegressionTest, ContainerEngineCPEMixin):
131+
class MpiGpuDirectOOM(rfm.RegressionTest, ContainerEngineCPEMixin, UenvSlurmMpiOptionsMixin):
129132
'''
130133
This test checks the issue reported in:
131134
https://github.com/eth-cscs/alps-gh200-reproducers/tree/main/gpudirect-oom

checks/system/slurm/slurm.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,18 @@
44
# SPDX-License-Identifier: BSD-3-Clause
55

66
import os
7+
import pathlib
78
import re
9+
import sys
810

911
import reframe as rfm
1012
import reframe.core.runtime as rt
1113
import reframe.utility.osext as osext
1214
import reframe.utility.sanity as sn
1315

16+
sys.path.append(str(pathlib.Path(__file__).parent / 'mixins'))
17+
from uenv_slurm_mpi_options import UenvSlurmMpiOptionsMixin # noqa: E402
18+
1419

1520
class SlurmSimpleBaseCheck(rfm.RunOnlyRegressionTest):
1621
'''Base class for Slurm simple binary tests'''
@@ -192,7 +197,7 @@ def assert_found_exceeded_memory(self):
192197

193198

194199
@rfm.simple_test
195-
class MemoryOverconsumptionCheckMPI(SlurmCompiledBaseCheck):
200+
class MemoryOverconsumptionCheckMPI(SlurmCompiledBaseCheck, UenvSlurmMpiOptionsMixin):
196201
# TODO: maintainers = ['@jgphpc', '@ekouts']
197202
descr = 'Tests for max allocatable memory'
198203
valid_systems = ['+remote']
@@ -207,8 +212,11 @@ class MemoryOverconsumptionCheckMPI(SlurmCompiledBaseCheck):
207212
def set_num_tasks(self):
208213
self.skip_if_no_procinfo()
209214
cpu = self.current_partition.processor
210-
self.num_tasks_per_node = int(
211-
cpu.info['num_cpus'] / cpu.info['num_cpus_per_core'])
215+
# Limit number of tasks because PMIx/OpenMPI can take very long to
216+
# initialize with e.g. 288 ranks on one GH200 node. The test still
217+
# fails in a reasonable time with a limited number of ranks.
218+
self.num_tasks_per_node = min(16, int(
219+
cpu.info['num_cpus'] / cpu.info['num_cpus_per_core']))
212220
self.num_tasks = self.num_tasks_per_node
213221
self.job.launcher.options += ['-u']
214222

0 commit comments

Comments
 (0)