Skip to content

Commit cac4606

Browse files
author
Vasileios Karakasis
authored
Merge branch 'master' into bugfix/autodetect_processor_info
2 parents 28a09e9 + 7d6b831 commit cac4606

File tree

8 files changed

+816
-794
lines changed

8 files changed

+816
-794
lines changed

cscs-checks/apps/jupyter/check_ipcmagic.py

Lines changed: 70 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -11,55 +11,79 @@
1111

1212
@rfm.simple_test
1313
class IPCMagicCheck(rfm.RunOnlyRegressionTest):
14-
def __init__(self):
15-
self.descr = 'Distributed training with TensorFlow using ipyparallel'
16-
self.valid_systems = ['daint:gpu', 'dom:gpu']
17-
self.valid_prog_environs = ['PrgEnv-gnu']
18-
self.modules = [
19-
# FIXME: Use the default ipcmagic version when fixed
20-
f'ipcmagic/0.1-CrayGNU-{osext.cray_cdt_version()}',
21-
f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0'
22-
]
23-
self.num_tasks = 2
24-
self.num_tasks_per_node = 1
25-
self.executable = 'ipython'
26-
self.executable_opts = ['tf-hvd-sgd-ipc-tf2.py']
27-
nids = sn.extractall(r'nid(?P<nid>\d+)',
28-
self.stdout, 'nid', str)
29-
self.sanity_patterns = sn.all([
30-
sn.assert_ne(nids, []),
31-
sn.assert_ne(nids[0], nids[1])
32-
])
33-
self.reference = {
34-
'daint:gpu': {
35-
'slope': (2.0, -0.1, 0.1, None),
36-
'offset': (0.0, -0.1, 0.1, None),
37-
'retries': (0, None, None, None),
38-
'time': (10, None, None, 's'),
39-
},
40-
'dom:gpu': {
41-
'slope': (2.0, -0.1, 0.1, None),
42-
'offset': (0.0, -0.1, 0.1, None),
43-
'retries': (0, None, None, None),
44-
'time': (10, None, None, 's'),
45-
}
46-
}
47-
self.perf_patterns = {
48-
'slope': sn.extractsingle(r'slope=(?P<slope>\S+)',
49-
self.stdout, 'slope', float),
50-
'offset': sn.extractsingle(r'offset=(?P<offset>\S+)',
51-
self.stdout, 'offset', float),
52-
'retries': 4 - sn.count(sn.findall(r'IPCluster is already running',
53-
self.stdout)),
54-
'time': sn.extractsingle(r'IPCluster is ready\!\s+'
55-
r'\((?P<time>\d+) seconds\)',
56-
self.stdout, 'time', float)
14+
descr = 'Distributed training with TensorFlow using ipyparallel'
15+
valid_systems = ['daint:gpu', 'dom:gpu']
16+
valid_prog_environs = ['PrgEnv-gnu']
17+
modules = [
18+
f'ipcmagic', f'jupyterlab',
19+
f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0'
20+
]
21+
num_tasks = 2
22+
num_tasks_per_node = 1
23+
executable = 'ipython'
24+
executable_opts = ['tf-hvd-sgd-ipc-tf2.py']
25+
reference = {
26+
'daint:gpu': {
27+
'slope': (2.0, -0.1, 0.1, 'N/A'),
28+
'offset': (0.0, -0.1, 0.1, 'N/A'),
29+
'retries': (0, None, None, 'N/A'),
30+
'time': (10, None, None, 's'),
31+
},
32+
'dom:gpu': {
33+
'slope': (2.0, -0.1, 0.1, 'N/A'),
34+
'offset': (0.0, -0.1, 0.1, 'N/A'),
35+
'retries': (0, None, None, 'N/A'),
36+
'time': (10, None, None, 's'),
5737
}
58-
self.maintainers = ['RS', 'TR']
59-
self.tags = {'production'}
38+
}
39+
40+
maintainers = ['RS', 'TR']
41+
tags = {'production'}
42+
43+
@run_after('setup')
44+
def daint_module_workaround(self):
45+
if self.current_system.name == 'daint':
46+
# FIXME: Use the default modules once Dom/Daint are aligned
47+
self.modules = [
48+
f'ipcmagic/1.0.1-CrayGNU-{osext.cray_cdt_version()}',
49+
f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0'
50+
]
51+
# FIXME: Enforce loading of jupyterlab module since
52+
# `module show jupyterlab` throws a Tcl error on Daint
53+
self.prerun_cmds = ['module load jupyterlab']
54+
55+
@sanity_function
56+
def assert_successful_execution(self):
57+
nids = sn.extractall(r'nid(?P<nid>\d+)', self.stdout, 'nid', str)
58+
return sn.all([
59+
sn.assert_ne(nids, []), sn.assert_ne(nids[0], nids[1]),
60+
sn.assert_found(r'IPCluster is ready\!\s+', self.stdout),
61+
sn.assert_found(r'slope=\S+', self.stdout)
62+
])
63+
64+
@performance_function('N/A')
65+
def slope(self):
66+
return sn.extractsingle(r'slope=(?P<slope>\S+)', self.stdout,
67+
'slope', float)
68+
69+
@performance_function('N/A')
70+
def offset(self):
71+
return sn.extractsingle(r'offset=(?P<offset>\S+)', self.stdout,
72+
'offset', float)
73+
74+
@performance_function('N/A')
75+
def retries(self):
76+
return 4 - sn.count(sn.findall(r'IPCluster is already running',
77+
self.stdout))
78+
79+
@performance_function('s')
80+
def time(self):
81+
return sn.extractsingle(r'IPCluster is ready\!\s+'
82+
r'\((?P<time>\d+) seconds\)',
83+
self.stdout, 'time', float)
6084

6185
@run_before('run')
62-
def prepare_run(self):
86+
def reset_launcher(self):
6387
# Change the job launcher since `ipython`
6488
# needs to be launched without `srun`.
6589
self.job.launcher = getlauncher('local')()

cscs-checks/apps/namd/namd_check.py

Lines changed: 93 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -9,24 +9,62 @@
99
import reframe.utility.sanity as sn
1010

1111

12-
class NamdBaseCheck(rfm.RunOnlyRegressionTest):
13-
def __init__(self, arch, scale, variant):
14-
self.descr = f'NAMD check ({arch}, {variant})'
15-
if self.current_system.name in ['eiger', 'pilatus']:
16-
self.valid_prog_environs = ['cpeGNU']
17-
else:
18-
self.valid_prog_environs = ['builtin']
12+
@rfm.simple_test
13+
class NamdCheck(rfm.RunOnlyRegressionTest):
14+
scale = parameter(['small', 'large'])
15+
variant = parameter(['maint', 'prod'])
16+
arch = parameter(['gpu', 'cpu'])
1917

20-
self.modules = ['NAMD']
18+
valid_prog_environs = ['builtin', 'cpeGNU']
19+
modules = ['NAMD']
20+
executable = 'namd2'
21+
use_multithreading = True
22+
num_tasks_per_core = 2
23+
maintainers = ['CB', 'LM']
24+
tags = {'scs', 'external-resources'}
25+
extra_resources = {
26+
'switches': {
27+
'num_switches': 1
28+
}
29+
}
2130

22-
# Reset sources dir relative to the SCS apps prefix
23-
self.sourcesdir = os.path.join(self.current_system.resourcesdir,
24-
'NAMD', 'prod')
25-
self.executable = 'namd2'
26-
self.use_multithreading = True
27-
self.num_tasks_per_core = 2
31+
@run_after('init')
32+
def adapt_description(self):
33+
self.descr = f'NAMD check ({self.arch}, {self.variant})'
34+
self.tags |= {
35+
'maintenance' if self.variant == 'maint' else 'production'
36+
}
37+
38+
@run_after('init')
39+
def adapt_valid_systems(self):
40+
if self.arch == 'gpu':
41+
self.valid_systems = ['daint:gpu']
42+
if self.scale == 'small':
43+
self.valid_systems += ['dom:gpu']
44+
else:
45+
self.valid_systems = ['daint:mc', 'eiger:mc', 'pilatus:mc']
46+
if self.scale == 'small':
47+
self.valid_systems += ['dom:mc']
48+
49+
@run_after('init')
50+
def adapt_valid_prog_environs(self):
51+
if self.current_system.name in ['eiger', 'pilatus']:
52+
self.valid_prog_environs.remove('builtin')
2853

29-
if scale == 'small':
54+
@run_after('init')
55+
def setup_parallel_run(self):
56+
if self.arch == 'gpu':
57+
self.executable_opts = ['+idlepoll', '+ppn 23', 'stmv.namd']
58+
self.num_cpus_per_task = 24
59+
self.num_gpus_per_node = 1
60+
else:
61+
# On Eiger a no-smp NAMD version is the default
62+
if self.current_system.name in ['eiger', 'pilatus']:
63+
self.executable_opts = ['+idlepoll', 'stmv.namd']
64+
else:
65+
self.executable_opts = ['+idlepoll', '+ppn 71', 'stmv.namd']
66+
self.num_cpus_per_task = 72
67+
if self.scale == 'small':
3068
# On Eiger a no-smp NAMD version is the default
3169
if self.current_system.name in ['eiger', 'pilatus']:
3270
self.num_tasks = 768
@@ -42,85 +80,57 @@ def __init__(self, arch, scale, variant):
4280
self.num_tasks = 16
4381
self.num_tasks_per_node = 1
4482

83+
@run_before('compile')
84+
def prepare_build(self):
85+
# Reset sources dir relative to the SCS apps prefix
86+
self.sourcesdir = os.path.join(self.current_system.resourcesdir,
87+
'NAMD', 'prod')
88+
89+
@sanity_function
90+
def validate_energy(self):
4591
energy = sn.avg(sn.extractall(
4692
r'ENERGY:([ \t]+\S+){10}[ \t]+(?P<energy>\S+)',
4793
self.stdout, 'energy', float)
4894
)
4995
energy_reference = -2451359.5
5096
energy_diff = sn.abs(energy - energy_reference)
51-
self.sanity_patterns = sn.all([
97+
return sn.all([
5298
sn.assert_eq(sn.count(sn.extractall(
5399
r'TIMING: (?P<step_num>\S+) CPU:',
54100
self.stdout, 'step_num')), 50),
55101
sn.assert_lt(energy_diff, 2720)
56102
])
57103

58-
self.perf_patterns = {
59-
'days_ns': sn.avg(sn.extractall(
60-
r'Info: Benchmark time: \S+ CPUs \S+ '
61-
r's/step (?P<days_ns>\S+) days/ns \S+ MB memory',
62-
self.stdout, 'days_ns', float))
63-
}
64-
65-
self.maintainers = ['CB', 'LM']
66-
self.tags = {'scs', 'external-resources'}
67-
self.extra_resources = {
68-
'switches': {
69-
'num_switches': 1
70-
}
71-
}
72-
73-
74-
@rfm.parameterized_test(*([s, v]
75-
for s in ['small', 'large']
76-
for v in ['maint', 'prod']))
77-
class NamdGPUCheck(NamdBaseCheck):
78-
def __init__(self, scale, variant):
79-
super().__init__('gpu', scale, variant)
80-
self.valid_systems = ['daint:gpu']
81-
self.executable_opts = ['+idlepoll', '+ppn 23', 'stmv.namd']
82-
self.num_cpus_per_task = 24
83-
self.num_gpus_per_node = 1
84-
self.tags |= {'maintenance' if variant == 'maint' else 'production'}
85-
if scale == 'small':
86-
self.valid_systems += ['dom:gpu']
87-
self.reference = {
88-
'dom:gpu': {'days_ns': (0.15, None, 0.05, 'days/ns')},
89-
'daint:gpu': {'days_ns': (0.15, None, 0.05, 'days/ns')}
90-
}
91-
else:
92-
self.reference = {
93-
'daint:gpu': {'days_ns': (0.07, None, 0.05, 'days/ns')}
94-
}
95-
96-
97-
@rfm.parameterized_test(*([s, v]
98-
for s in ['small', 'large']
99-
for v in ['maint', 'prod']))
100-
class NamdCPUCheck(NamdBaseCheck):
101-
def __init__(self, scale, variant):
102-
super().__init__('cpu', scale, variant)
103-
self.valid_systems = ['daint:mc', 'eiger:mc', 'pilatus:mc']
104-
# On Eiger a no-smp NAMD version is the default
105-
if self.current_system.name in ['eiger', 'pilatus']:
106-
self.executable_opts = ['+idlepoll', 'stmv.namd']
107-
self.num_tasks_per_core = 2
108-
else:
109-
self.executable_opts = ['+idlepoll', '+ppn 71', 'stmv.namd']
110-
self.num_cpus_per_task = 72
111-
if scale == 'small':
112-
self.valid_systems += ['dom:mc']
113-
self.reference = {
114-
'dom:mc': {'days_ns': (0.51, None, 0.05, 'days/ns')},
115-
'daint:mc': {'days_ns': (0.51, None, 0.05, 'days/ns')},
116-
'eiger:mc': {'days_ns': (0.12, None, 0.05, 'days/ns')},
117-
'pilatus:mc': {'days_ns': (0.12, None, 0.05, 'days/ns')},
118-
}
104+
@run_before('performance')
105+
def set_reference(self):
106+
if self.arch == 'gpu':
107+
if self.scale == 'small':
108+
self.reference = {
109+
'dom:gpu': {'days_ns': (0.15, None, 0.05, 'days/ns')},
110+
'daint:gpu': {'days_ns': (0.15, None, 0.05, 'days/ns')}
111+
}
112+
else:
113+
self.reference = {
114+
'daint:gpu': {'days_ns': (0.07, None, 0.05, 'days/ns')}
115+
}
119116
else:
120-
self.reference = {
121-
'daint:mc': {'days_ns': (0.28, None, 0.05, 'days/ns')},
122-
'eiger:mc': {'days_ns': (0.05, None, 0.05, 'days/ns')},
123-
'pilatus:mc': {'days_ns': (0.05, None, 0.05, 'days/ns')}
124-
}
117+
if self.scale == 'small':
118+
self.reference = {
119+
'dom:mc': {'days_ns': (0.51, None, 0.05, 'days/ns')},
120+
'daint:mc': {'days_ns': (0.51, None, 0.05, 'days/ns')},
121+
'eiger:mc': {'days_ns': (0.12, None, 0.05, 'days/ns')},
122+
'pilatus:mc': {'days_ns': (0.12, None, 0.05, 'days/ns')},
123+
}
124+
else:
125+
self.reference = {
126+
'daint:mc': {'days_ns': (0.28, None, 0.05, 'days/ns')},
127+
'eiger:mc': {'days_ns': (0.05, None, 0.05, 'days/ns')},
128+
'pilatus:mc': {'days_ns': (0.05, None, 0.05, 'days/ns')}
129+
}
125130

126-
self.tags |= {'maintenance' if variant == 'maint' else 'production'}
131+
@performance_function('days/ns')
132+
def days_ns(self):
133+
return sn.avg(sn.extractall(
134+
r'Info: Benchmark time: \S+ CPUs \S+ '
135+
r's/step (?P<days_ns>\S+) days/ns \S+ MB memory',
136+
self.stdout, 'days_ns', float))

cscs-checks/apps/quantumespresso/quantumespresso_check.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ def __init__(self):
1919

2020
self.modules = ['QuantumESPRESSO']
2121
self.executable = 'pw.x'
22-
self.executable_opts = ['-in', 'ausurf.in']
22+
self.executable_opts = ['-in', 'ausurf.in',
23+
'-pd', '.true.']
2324

2425
self.sanity_patterns = sn.all([
2526
sn.assert_found(r'convergence has been achieved', self.stdout),

0 commit comments

Comments
 (0)