Skip to content

Commit 6fa56d4

Browse files
msimberg and jgphpc authored
Add dla-future reframe test (#266)
Co-authored-by: Jean-guillaume Piccinali <jgphpc@users.noreply.github.com>
1 parent f9467a6 commit 6fa56d4

File tree

2 files changed

+271
-103
lines changed

2 files changed

+271
-103
lines changed

checks/apps/lammps/lammps_check.py

Lines changed: 130 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -1,119 +1,146 @@
1-
# Copyright 2016-2022 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
1+
# Copyright Swiss National Supercomputing Centre (CSCS/ETH Zurich)
22
# ReFrame Project Developers. See the top-level LICENSE file for details.
33
#
44
# SPDX-License-Identifier: BSD-3-Clause
5-
65
import os
76

87
import reframe as rfm
98
import reframe.utility.sanity as sn
10-
11-
12-
class LAMMPSCheck(rfm.RunOnlyRegressionTest):
13-
scale = parameter(['small', 'large'])
14-
modules = ['cray-python', 'LAMMPS']
15-
tags = {'external-resources', 'maintenance', 'production'}
16-
maintainers = ['LM']
17-
strict_check = False
18-
extra_resources = {
19-
'switches': {
20-
'num_switches': 1
21-
}
22-
}
23-
24-
@run_after('init')
25-
def setup_by_system(self):
26-
# Reset sources dir relative to the SCS apps prefix
27-
self.sourcesdir = os.path.join(self.current_system.resourcesdir,
28-
'LAMMPS')
29-
if self.current_system.name in ['eiger', 'pilatus']:
30-
self.valid_prog_environs = ['cpeGNU']
31-
else:
32-
self.valid_prog_environs = ['builtin']
33-
34-
@performance_function('timesteps/s')
35-
def perf(self):
36-
return sn.extractsingle(r'\s+(?P<perf>\S+) timesteps/s',
37-
self.stdout, 'perf', float)
9+
from uenv import uarch
10+
11+
# Performance references, keyed first by test name and then by the
# micro-architecture string returned by `uarch()`.
# The innermost key must match the name of the performance function of the
# test (`time_run`). The value is a ReFrame reference tuple:
# (reference value, lower threshold, upper threshold, unit).
lammps_references = {
    'lj': {
        'gh200': {
            'time_run': (345, None, 0.05, 's'),
        },
    },
}
14+
15+
# Job geometry per test name and micro-architecture.
# `nodes`, `ntasks-per-node` and `walltime` are read by
# `lammps_gpu_test.prepare_run`; `gpu` is not read by any code in this file.
slurm_config = {
    "lj": {
        "gh200": {"nodes": 2, "ntasks-per-node": 32, "walltime": "10m",
                  "gpu": True},
    },
}
25+
26+
27+
class lammps_download(rfm.RunOnlyRegressionTest):
28+
descr = 'Download LAMMPS source code'
29+
version = variable(str, value='20230802.3')
30+
sourcesdir = None
31+
executable = 'wget'
32+
executable_opts = [
33+
'--quiet',
34+
'https://jfrog.svc.cscs.ch/artifactory/cscs-reframe-tests/lammps/'
35+
'LAMMPS_20230802.3_Source.tar.gz',
36+
# 'https://download.lammps.org/tars/lammps-2Aug2023.tar.gz',
37+
]
38+
local = True
3839

3940
@sanity_function
40-
def assert_energy_diff(self):
41-
energy_reference = -4.6195
42-
energy = sn.extractsingle(
43-
r'\s+500000(\s+\S+){3}\s+(?P<energy>\S+)\s+\S+\s\n',
44-
self.stdout, 'energy', float)
45-
energy_diff = sn.abs(energy - energy_reference)
46-
return sn.all([
47-
sn.assert_found(r'Total wall time:', self.stdout),
48-
sn.assert_lt(energy_diff, 6e-4)
49-
])
41+
def validate_download(self):
42+
return sn.assert_eq(self.job.exitcode, 0)
5043

5144

5245
@rfm.simple_test
53-
class LAMMPSGPUCheck(LAMMPSCheck):
54-
valid_systems = []
55-
executable = 'lmp_mpi'
56-
executable_opts = ['-sf gpu', '-pk gpu 1', '-in in.lj.gpu']
57-
env_vars = {'CRAY_CUDA_MPS': 1}
58-
num_gpus_per_node = 1
59-
refs_by_scale = {
60-
'small': {
61-
'dom:gpu': {'perf': (3456.792, -0.10, None, 'timesteps/s')},
62-
'daint:gpu': {'perf': (1566.979, -0.10, None, 'timesteps/s')}
63-
},
64-
'large': {
65-
'daint:gpu': {'perf': (2108.561, -0.10, None, 'timesteps/s')}
66-
}
67-
}
68-
69-
@run_after('init')
70-
def setup_by_scale(self):
71-
self.descr = f'LAMMPS GPU check (version: {self.scale})'
72-
if self.scale == 'small':
73-
self.valid_systems += []
74-
self.num_tasks = 12
75-
self.num_tasks_per_node = 2
76-
else:
77-
self.num_tasks = 32
78-
self.num_tasks_per_node = 2
46+
class lammps_build_test(rfm.CompileOnlyRegressionTest):
47+
'''
48+
Test LAMMPS build from source using the develop-kokkos view
49+
'''
50+
descr = 'LAMMPS Build Test'
51+
valid_prog_environs = ['+lammps-kokkos-dev']
52+
valid_systems = ['*']
53+
maintainers = ['SSA']
54+
sourcesdir = None
55+
lammps_sources = fixture(lammps_download, scope='session')
56+
build_system = 'CMake'
57+
tags = {'uenv'}
58+
build_locally = False
59+
60+
@run_before('compile')
61+
def prepare_build(self):
62+
self.build_system.builddir = 'build'
63+
self.build_system.config_opts = [
64+
f'-C ../lammps-2Aug2023/cmake/presets/kokkos-cuda.cmake',
65+
'-DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=OFF',
66+
'-DKokkos_ARCH_NATIVE=ON',
67+
'-DKokkos_ARCH_PASCAL60=OFF',
68+
'-DKokkos_ARCH_HOPPER90=ON',
69+
'../lammps-2Aug2023/cmake/',
70+
]
71+
self.build_system.max_concurrency = 64
72+
tarsource = os.path.join(
73+
self.lammps_sources.stagedir,
74+
f'LAMMPS_{self.lammps_sources.version}_Source.tar.gz',
75+
)
76+
# Extract source code
77+
self.prebuild_cmds = [f'tar zxf {tarsource}']
7978

80-
self.reference = self.refs_by_scale[self.scale]
79+
@sanity_function
80+
def validate_test(self):
81+
self.lammps_executable = os.path.join(self.stagedir, "build", "lmp")
82+
return os.path.isfile(self.lammps_executable)
8183

8284

8385
@rfm.simple_test
84-
class LAMMPSCPUCheck(LAMMPSCheck):
85-
valid_systems = ['eiger:mc', 'pilatus:mc']
86-
refs_by_scale = {
87-
'small': {
88-
'eiger:mc': {'perf': (3807.095, -0.10, None, 'timesteps/s')},
89-
'pilatus:mc': {'perf': (4828.986, -0.10, None, 'timesteps/s')}
90-
},
91-
'large': {
92-
'eiger:mc': {'perf': (4922.81, -0.10, None, 'timesteps/s')},
93-
'pilatus:mc': {'perf': (7247.484, -0.10, None, 'timesteps/s')}
94-
}
95-
}
96-
97-
@run_after('init')
98-
def setup_by_scale(self):
99-
self.descr = f'LAMMPS CPU check (version: {self.scale})'
100-
if self.current_system.name in ['eiger', 'pilatus']:
101-
self.executable = 'lmp_mpi'
102-
self.executable_opts = ['-in in.lj.cpu']
103-
else:
104-
self.executable = 'lmp_omp'
105-
self.executable_opts = ['-sf omp', '-pk omp 1', '-in in.lj.cpu']
106-
107-
if self.scale == 'small':
108-
self.valid_systems += []
109-
self.num_tasks = 216
110-
self.num_tasks_per_node = 36
111-
else:
112-
self.num_tasks_per_node = 36
113-
self.num_tasks = 576
114-
115-
if self.current_system.name == 'eiger':
116-
self.num_tasks_per_node = 128
117-
self.num_tasks = 256 if self.scale == 'small' else 512
118-
119-
self.reference = self.refs_by_scale[self.scale]
86+
class lammps_gpu_test(rfm.RunOnlyRegressionTest):
    """
    Run the LAMMPS `lj` input through the run-gpu:gpu view and check the
    reported energy and the total wall time.

    Untested views:
        build-gpu: develop-gpu
        build-kokkos: develop-kokkos
        run-kokkos: kokkos
    """
    maintainers = ["SSA"]
    valid_systems = ["*"]
    valid_prog_environs = ['+lammps-gpu-prod']
    executable = './mps-wrapper.sh lmp'
    test_name = variable(str, value='lj')
    # Expected energy value compared against the output in the sanity check.
    energy_reference = -4.620456

    @run_before("run")
    def prepare_run(self):
        """Configure the job from the `slurm_config` entry of this test."""
        self.uarch = uarch(self.current_partition)
        cfg = slurm_config[self.test_name][self.uarch]
        nodes = cfg["nodes"]
        tasks_per_node = cfg["ntasks-per-node"]

        self.extra_resources = {"gres": {"gpu": 4}}
        self.job.options = [f'--nodes={nodes}']
        self.num_tasks_per_node = tasks_per_node
        self.num_tasks = nodes * tasks_per_node
        # NOTE(review): ReFrame's built-in attribute is spelled
        # `num_tasks_per_core`; confirm whether this plain attribute is
        # intended to have an effect on the generated job script.
        self.ntasks_per_core = 1
        self.time_limit = cfg["walltime"]
        self.executable_opts = [f'-i {self.test_name}.in']

        if self.uarch == "gh200":
            self.env_vars["MPICH_GPU_SUPPORT_ENABLED"] = "1"

    @run_before("run")
    def prepare_reference(self):
        """Install the performance reference for the current partition.

        Nothing is set when the micro-architecture is unknown or has no
        entry in `lammps_references` for this test.
        """
        self.uarch = uarch(self.current_partition)
        if self.uarch is not None:
            if self.uarch in lammps_references[self.test_name]:
                partition = self.current_partition.fullname
                self.reference = {
                    partition: lammps_references[self.test_name][self.uarch]
                }

    @sanity_function
    def assert_energy_diff(self):
        """Require the final wall-time line and a matching energy value."""
        # Output row that starts with step 1000: skip five columns and
        # capture the next one as the energy.
        energy = sn.extractsingle(
            r'^\s*1000(\s+\S+){5}\s+(?P<energy>-?\d+\.\d+)\s+',
            self.stdout, "energy", float)
        checks = [
            sn.assert_found(r"Total wall time", self.stdout),
            sn.assert_lt(sn.abs(energy - self.energy_reference), 1e-4),
        ]
        return sn.all(checks)

    # INFO: The name of this function needs to match with the reference dict!
    @performance_function('s')
    def time_run(self):
        """Return the `Total wall time: hh:mm:ss` line converted to seconds."""
        regex = r'Total wall time: (?P<hh>\S+):(?P<mm>\S+):(?P<ss>\S+)'
        hh, mm, ss = (
            sn.extractsingle(regex, self.stdout, field, int)
            for field in ('hh', 'mm', 'ss')
        )
        return (hh*3600 + mm*60 + ss)

checks/libraries/dlaf/dlaf.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
# Copyright Swiss National Supercomputing Centre (CSCS/ETH Zurich)
2+
# ReFrame Project Developers. See the top-level LICENSE file for details.
3+
#
4+
# SPDX-License-Identifier: BSD-3-Clause
5+
import os
6+
import shutil
7+
import reframe as rfm
8+
import reframe.utility.sanity as sn
9+
10+
from uenv import uarch
11+
12+
# Performance references, keyed by miniapp name and then by the
# micro-architecture string returned by `uarch()`.
# `time_run` is the name of the performance function in `dlaf_base`; the
# value is a ReFrame reference tuple:
# (reference value, lower threshold, upper threshold, unit).
dlaf_references = {
    "eigensolver": {"gh200": {"time_run": (30.5, -1.0, 0.1, "s")}},
    "gen_eigensolver": {"gh200": {"time_run": (33.5, -1.0, 0.1, "s")}},
}
24+
25+
slurm_config = {
26+
"eigensolver": {
27+
"gh200": {
28+
"nodes": 2,
29+
"ntasks-per-node": 4,
30+
"cpus-per-task": 72,
31+
"walltime": "0d0h5m0s",
32+
"gpu": True,
33+
}
34+
},
35+
"gen_eigensolver": {
36+
"gh200": {
37+
"nodes": 2,
38+
"ntasks-per-node": 4,
39+
"cpus-per-task": 72,
40+
"walltime": "0d0h5m0s",
41+
"gpu": True,
42+
}
43+
},
44+
}
45+
46+
47+
class dlaf_base(rfm.RunOnlyRegressionTest):
    """Shared job setup, sanity check and timing extraction for the
    DLA-Future miniapp tests.

    The hooks below read `self.test_name` (used as key into the module-level
    `slurm_config` and `dlaf_references` tables) and extend
    `self.executable_opts`; both are expected to be provided by subclasses.
    """

    valid_systems = ['+uenv']
    valid_prog_environs = ['+dlaf']
    maintainers = ["SSA"]

    # Timing line printed by the miniapp, e.g.
    # [0] 29.7415s dL (40960, 40960) (0, 40960) (1024, 1024) 128 (4, 2) 71 GPU
    # Used both by the sanity check and by the performance function.
    _time_regex = r'^\[0\]\s+(?P<perf>\S+)s\s+'

    def _sq_factor(self, n):
        """
        Finds two factors of `n` that are as close to each other as possible.

        Note: the second factor is larger or equal to the first factor

        :param n: positive integer to factor.
        :returns: tuple `(a, b)` with `a * b == n` and `a <= b`.
        :raises ValueError: if `n` is smaller than 1 or has no integer
            factorisation (e.g. a non-integral float).
        """
        if n < 1:
            raise ValueError(f"cannot factor {n!r}: expected a positive "
                             f"integer")

        # Scan downwards from the square root: the first divisor found is
        # the one closest to it, so the search can stop immediately.
        for i in range(int(n**0.5), 0, -1):
            if n % i == 0:
                return (i, n // i)

        raise ValueError(f"cannot factor {n!r}: expected a positive integer")

    @run_before("run")
    def prepare_run(self):
        """Configure job geometry, launcher options, environment, process
        grid and performance reference from the per-test tables."""
        self.uarch = uarch(self.current_partition)
        config = slurm_config[self.test_name][self.uarch]
        self.job.options = [f'--nodes={config["nodes"]}']
        self.num_tasks_per_node = config["ntasks-per-node"]
        self.num_tasks = config["nodes"] * self.num_tasks_per_node
        self.num_cpus_per_task = config["cpus-per-task"]
        # NOTE(review): ReFrame's built-in attribute is spelled
        # `num_tasks_per_core`; confirm whether this plain attribute is
        # intended to have an effect on the generated job script.
        self.ntasks_per_core = 1
        self.time_limit = config["walltime"]
        self.job.launcher.options = ["--cpu-bind=cores"]
        if self.uarch == "gh200":
            self.job.launcher.options += ["--gpus-per-task=1"]

        # environment variables
        # One thread fewer than the CPUs assigned to each task.
        self.env_vars["PIKA_THREADS"] = str(self.num_cpus_per_task - 1)
        self.env_vars["MIMALLOC_ALLOW_LARGE_OS_PAGES"] = "1"
        self.env_vars["MIMALLOC_EAGER_COMMIT_DELAY"] = "0"
        self.env_vars["FI_MR_CACHE_MONITOR"] = "disabled"
        if self.uarch == "gh200":
            self.env_vars["MPICH_GPU_SUPPORT_ENABLED"] = "1"
            self.env_vars["DLAF_BT_BAND_TO_TRIDIAG_HH_APPLY_GROUP_SIZE"] = \
                "128"
            self.env_vars["DLAF_UMPIRE_DEVICE_MEMORY_POOL_ALIGNMENT_BYTES"] = \
                str(2**21)

        # Near-square process grid covering all tasks; `_sq_factor` returns
        # the smaller factor first, so grid_cols <= grid_rows.
        grid_cols, grid_rows = self._sq_factor(self.num_tasks)
        self.executable_opts += [
            f"--grid-cols={grid_cols}",
            f"--grid-rows={grid_rows}"
        ]

        # set performance reference
        if self.uarch is not None and \
           self.uarch in dlaf_references[self.test_name]:
            self.reference = {
                self.current_partition.fullname:
                    dlaf_references[self.test_name][self.uarch]
            }

    @sanity_function
    def assert_job(self):
        """Require both the timing line and the residual line in stdout.

        Expected output looks like:

        [0] 29.7415s dL (40960, 40960) (0, 40960) (1024, 1024) 128 (4, 2) 71 GPU
        Max Diff / Max A: 1.19349e-13
        """
        regex2 = r'^Max Diff / Max A.*: \S+'
        # Return the deferred expression directly; the `@sanity_function`
        # decorator is the mechanism that registers it with the framework.
        return sn.all([
            sn.assert_found(self._time_regex, self.stdout,
                            msg='regex1 failed'),
            sn.assert_found(regex2, self.stdout, msg='regex2 failed')
        ])

    @performance_function("s")
    def time_run(self):
        """Return the run time in seconds parsed from the `[0] <t>s` line."""
        return sn.extractsingle(self._time_regex, self.stdout, "perf", float)
121+
122+
123+
@rfm.simple_test
class dlaf_check_uenv(dlaf_base):
    """DLA-Future eigensolver miniapps, one test per `test_name` value."""

    test_name = parameter(["gen_eigensolver", "eigensolver"])
    tags = {"uenv", "production"}
    # Fixed miniapp arguments; the process-grid options are appended by
    # `dlaf_base.prepare_run`.
    executable_opts = [
        "--type=d", "--matrix-size=40960", "--block-size=1024",
        "--check=last", "--nwarmups=1", "--nruns=1",
    ]

    @run_before("run")
    def set_executable(self):
        """Select the miniapp binary that corresponds to `test_name`."""
        if self.test_name == "gen_eigensolver":
            self.executable = "miniapp_gen_eigensolver"
        else:
            self.executable = "miniapp_eigensolver"

0 commit comments

Comments
 (0)