Skip to content

Commit 5a3486b

Browse files
Authored by Vasileios Karakasis
Merge branch 'master' into bugfix/autodetect_modules
2 parents fdcf262 + 5b17b95 commit 5a3486b

38 files changed: 1636 additions and 403 deletions

cscs-checks/apps/greasy/greasy_check.py

Lines changed: 84 additions & 64 deletions
Original file line number | Diff line number | Diff line change
@@ -16,79 +16,76 @@ def to_seconds(str):
1616
datetime.strptime('00:00:00', '%H:%M:%S')).total_seconds()
1717

1818

19-
@rfm.parameterized_test(
20-
['serial', 'gpu', 24, 12, 1, 1],
21-
['serial', 'mc', 72, 36, 1, 1],
22-
['openmp', 'gpu', 24, 3, 1, 4],
23-
['openmp', 'mc', 72, 9, 1, 4],
24-
['mpi', 'gpu', 24, 4, 3, 1],
25-
['mpi', 'mc', 72, 12, 3, 1],
26-
['mpi+openmp', 'gpu', 24, 3, 2, 2],
27-
['mpi+openmp', 'mc', 72, 6, 3, 2]
28-
)
19+
@rfm.simple_test
2920
class GREASYCheck(rfm.RegressionTest):
30-
def __init__(self, variant, partition, num_greasy_tasks, nworkes_per_node,
31-
nranks_per_worker, ncpus_per_worker):
32-
self.valid_systems = ['daint:' + partition, 'dom:' + partition]
21+
configuration = parameter([('serial', 'gpu', 24, 12, 1, 1),
22+
('serial', 'mc', 72, 36, 1, 1),
23+
('openmp', 'gpu', 24, 3, 1, 4),
24+
('openmp', 'mc', 72, 9, 1, 4),
25+
('mpi', 'gpu', 24, 4, 3, 1),
26+
('mpi', 'mc', 72, 12, 3, 1),
27+
('mpi+openmp', 'gpu', 24, 3, 2, 2),
28+
('mpi+openmp', 'mc', 72, 6, 3, 2)])
29+
variant = variable(str)
30+
partition = variable(str)
31+
num_greasy_tasks = variable(int)
32+
workers_per_node = variable(int)
33+
ranks_per_worker = variable(int)
34+
cpus_per_worker = variable(int)
35+
valid_prog_environs = ['PrgEnv-gnu']
36+
sourcepath = 'tasks_mpi_openmp.c'
37+
build_system = 'SingleSource'
38+
executable = 'tasks_mpi_openmp.x'
39+
tasks_file = variable(str, value='tasks.txt')
40+
greasy_logfile = variable(str, value='greasy.log')
41+
nnodes = variable(int, value=2)
3342

34-
self.valid_prog_environs = ['PrgEnv-gnu']
35-
self.sourcepath = 'tasks_mpi_openmp.c'
36-
self.build_system = 'SingleSource'
43+
# sleep enough time to distinguish if the files are running in parallel
44+
# or not
45+
sleep_time = variable(int, value=60)
46+
use_multithreading = False
47+
modules = ['GREASY']
48+
maintainers = ['VH', 'SK']
49+
tags = {'production'}
3750

38-
# sleep enough time to distinguish if the files are running in parallel
39-
# or not
40-
self.sleep_time = 60
51+
@run_after('init')
52+
def unpack_configuration_parameter(self):
53+
self.variant, self.partition = self.configuration[0:2]
54+
self.num_greasy_tasks, self.workers_per_node = self.configuration[2:4]
55+
self.ranks_per_worker, self.cpus_per_worker = self.configuration[4:6]
56+
57+
@run_after('init')
58+
def set_valid_systems(self):
59+
self.valid_systems = [f'daint:{self.partition}',
60+
f'dom:{self.partition}']
61+
62+
@run_before('compile')
63+
def setup_build_system(self):
4164
self.build_system.cflags = [f'-DSLEEP_TIME={self.sleep_time:d}']
42-
self.variant = variant
43-
if variant == 'openmp':
65+
if self.variant == 'openmp':
4466
self.build_system.cflags += ['-fopenmp']
45-
elif variant == 'mpi':
67+
elif self.variant == 'mpi':
4668
self.build_system.cflags += ['-D_MPI']
47-
elif variant == 'mpi+openmp':
69+
elif self.variant == 'mpi+openmp':
4870
self.build_system.cflags += ['-fopenmp', '-D_MPI']
4971

50-
self.executable = 'tasks_mpi_openmp.x'
51-
self.tasks_file = 'tasks.txt'
72+
@run_before('run')
73+
def setup_greasy_run(self):
5274
self.executable_opts = [self.tasks_file]
53-
self.greasy_logfile = 'greasy.log'
5475
self.keep_files = [self.tasks_file, self.greasy_logfile]
55-
nnodes = 2
56-
self.use_multithreading = False
57-
self.num_greasy_tasks = num_greasy_tasks
58-
self.nworkes_per_node = nworkes_per_node
59-
self.nranks_per_worker = nranks_per_worker
60-
self.num_tasks_per_node = nranks_per_worker * nworkes_per_node
61-
self.num_tasks = self.num_tasks_per_node * nnodes
62-
self.num_cpus_per_task = ncpus_per_worker
63-
self.sanity_patterns = self.eval_sanity()
76+
self.num_tasks_per_node = self.ranks_per_worker * self.workers_per_node
77+
self.num_tasks = self.num_tasks_per_node * self.nnodes
78+
self.num_cpus_per_task = self.cpus_per_worker
6479

65-
# Reference value is system agnostic
66-
# Adding 10 secs of slowdown per greasy tasks
67-
# this is to compensate for whenever the systems are full and srun gets
68-
# slightly slower
69-
refperf = (
70-
(self.sleep_time+10)*num_greasy_tasks / nworkes_per_node / nnodes
71-
)
72-
self.reference = {
73-
'*': {
74-
'time': (refperf, None, 0.5, 's')
75-
}
76-
}
77-
self.perf_patterns = {
78-
'time': sn.extractsingle(r'Total time: (?P<perf>\S+)',
79-
self.greasy_logfile,
80-
'perf', to_seconds)
81-
}
80+
@run_before('run')
81+
def set_environment_variables(self):
8282
# On SLURM there is no need to set OMP_NUM_THREADS if one defines
8383
# num_cpus_per_task, but adding for completeness and portability
8484
self.variables = {
8585
'OMP_NUM_THREADS': str(self.num_cpus_per_task),
86-
'GREASY_NWORKERS_PER_NODE': str(nworkes_per_node),
86+
'GREASY_NWORKERS_PER_NODE': str(self.workers_per_node),
8787
'GREASY_LOGFILE': self.greasy_logfile
8888
}
89-
self.modules = ['GREASY']
90-
self.maintainers = ['VH', 'SK']
91-
self.tags = {'production'}
9289

9390
@run_before('run')
9491
def generate_tasks_file(self):
@@ -114,7 +111,7 @@ def daint_dom_gpu_specific_workaround(self):
114111
}
115112
}
116113
elif self.current_partition.fullname in ['daint:mc']:
117-
if self.variant != 'serial':
114+
if 'serial' not in self.variant:
118115
self.extra_resources = {
119116
'gres': {
120117
'gres': 'craynetwork:72'
@@ -133,17 +130,19 @@ def set_launcher(self):
133130
# make calls to srun
134131
self.job.launcher = getlauncher('local')()
135132

136-
@sn.sanity_function
137-
def eval_sanity(self):
133+
@sanity_function
134+
def assert_success(self):
138135
output_files = []
139136
output_files = [file for file in os.listdir(self.stagedir)
140137
if file.startswith('output-')]
141138
num_greasy_tasks = len(output_files)
142139
failure_msg = (f'Requested {self.num_greasy_tasks} task(s), but '
143140
f'executed only {num_greasy_tasks} tasks(s)')
144-
sn.evaluate(sn.assert_eq(num_greasy_tasks, self.num_greasy_tasks,
145-
msg=failure_msg))
146-
num_tasks = sn.getattr(self, 'nranks_per_worker')
141+
sn.evaluate(
142+
sn.assert_eq(num_greasy_tasks, self.num_greasy_tasks,
143+
msg=failure_msg)
144+
)
145+
num_tasks = sn.getattr(self, 'ranks_per_worker')
147146
num_cpus_per_task = sn.getattr(self, 'num_cpus_per_task')
148147

149148
def tid(match):
@@ -184,7 +183,7 @@ def num_ranks(match):
184183
lambda x: sn.assert_lt(
185184
rank(x), num_ranks(x),
186185
msg=(f'Rank id {rank(x)} is not lower than the '
187-
f'number of ranks {self.nranks_per_worker} '
186+
f'number of ranks {self.ranks_per_worker} '
188187
f'in output file')
189188
), result
190189
),
@@ -217,7 +216,7 @@ def num_ranks(match):
217216
lambda x: sn.assert_eq(
218217
num_ranks(x), num_tasks,
219218
msg=(f'Number of ranks {num_ranks(x)} is not '
220-
f'equal to {self.nranks_per_worker} in '
219+
f'equal to {self.ranks_per_worker} in '
221220
f'output file {output_file}')
222221
), result
223222
)
@@ -234,3 +233,24 @@ def num_ranks(match):
234233
))
235234

236235
return True
236+
237+
@run_before('performance')
238+
def set_reference(self):
239+
# Reference value is system agnostic
240+
# Adding 10 secs of slowdown per greasy tasks
241+
# this is to compensate for whenever the systems are full and srun gets
242+
# slightly slower
243+
refperf = (
244+
(self.sleep_time + 10) * self.num_greasy_tasks /
245+
self.workers_per_node / self.nnodes
246+
)
247+
self.reference = {
248+
'*': {
249+
'time': (refperf, None, 0.5, 's')
250+
}
251+
}
252+
253+
@performance_function('s')
254+
def time(self):
255+
return sn.extractsingle(r'Total time: (?P<perf>\S+)',
256+
self.greasy_logfile, 'perf', to_seconds)

cscs-checks/libraries/petsc/petsc_helloworld.py

Lines changed: 27 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -7,30 +7,39 @@
77
import reframe.utility.sanity as sn
88

99

10-
@rfm.parameterized_test(['dynamic'], ['static'])
10+
@rfm.simple_test
1111
class PetscPoisson2DCheck(rfm.RegressionTest):
12-
def __init__(self, linkage):
12+
linkage = parameter(['dynamic', 'static'])
13+
valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc']
14+
valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-gnu', 'PrgEnv-intel']
15+
modules = ['cray-petsc']
16+
build_system = 'SingleSource'
17+
sourcepath = 'poisson2d.c'
18+
num_tasks = 16
19+
num_tasks_per_node = 8
20+
executable_opts = ['-da_grid_x 4', '-da_grid_y 4', '-ksp_monitor']
21+
tags = {'production', 'craype'}
22+
maintainers = ['AJ', 'CB']
23+
24+
@run_after('init')
25+
def set_description(self):
1326
self.descr = (f'Compile/run PETSc 2D Poisson example with cray-petsc '
14-
f'({linkage} linking)')
15-
self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc']
16-
self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-gnu',
17-
'PrgEnv-intel']
18-
self.sourcepath = 'poisson2d.c'
19-
self.modules = ['cray-petsc']
20-
self.num_tasks = 16
21-
self.num_tasks_per_node = 8
22-
self.build_system = 'SingleSource'
27+
f'({self.linkage} linking)')
28+
29+
@run_before('compile')
30+
def set_variables(self):
31+
self.variables = {'CRAYPE_LINK_TYPE': self.linkage}
32+
33+
@run_before('compile')
34+
def intel_workaround(self):
2335
# FIXME: static compilation yields a link error in case of
2436
# PrgEnv-intel (Cray Bug #255701) workaround use C++ compiler
25-
if linkage == 'static':
37+
if self.linkage == 'static':
2638
self.build_system.cc = 'CC'
2739

28-
self.variables = {'CRAYPE_LINK_TYPE': linkage}
29-
self.executable_opts = ['-da_grid_x 4', '-da_grid_y 4', '-ksp_monitor']
30-
40+
@sanity_function
41+
def assert_convergence(self):
3142
# Check the final residual norm for convergence
3243
norm = sn.extractsingle(r'\s+\d+\s+KSP Residual norm\s+(?P<norm>\S+)',
3344
self.stdout, 'norm', float, -1)
34-
self.sanity_patterns = sn.assert_lt(norm, 1.0e-5)
35-
self.tags = {'production', 'craype'}
36-
self.maintainers = ['AJ', 'CB']
45+
return sn.assert_lt(norm, 1.0e-5)

cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py

Lines changed: 31 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -3,10 +3,11 @@
33
#
44
# SPDX-License-Identifier: BSD-3-Clause
55

6-
import os
76

87
import reframe as rfm
98
import reframe.utility.sanity as sn
9+
import reframe.utility.osext as osext
10+
from reframe.core.exceptions import SanityError
1011

1112
from hpctestlib.microbenchmarks.gpu.gpu_burn import GpuBurn
1213
import cscstests.microbenchmarks.gpu.hooks as hooks
@@ -24,30 +25,29 @@ class gpu_burn_check(GpuBurn):
2425
num_tasks = 0
2526
reference = {
2627
'dom:gpu': {
27-
'perf': (4115, -0.10, None, 'Gflop/s'),
28+
'min_perf': (4115, -0.10, None, 'Gflop/s'),
2829
},
2930
'daint:gpu': {
30-
'perf': (4115, -0.10, None, 'Gflop/s'),
31+
'min_perf': (4115, -0.10, None, 'Gflop/s'),
3132
},
3233
'arolla:cn': {
33-
'perf': (5861, -0.10, None, 'Gflop/s'),
34+
'min_perf': (5861, -0.10, None, 'Gflop/s'),
3435
},
3536
'tsa:cn': {
36-
'perf': (5861, -0.10, None, 'Gflop/s'),
37+
'min_perf': (5861, -0.10, None, 'Gflop/s'),
3738
},
3839
'ault:amda100': {
39-
'perf': (15000, -0.10, None, 'Gflop/s'),
40+
'min_perf': (15000, -0.10, None, 'Gflop/s'),
4041
},
4142
'ault:amdv100': {
42-
'perf': (5500, -0.10, None, 'Gflop/s'),
43+
'min_perf': (5500, -0.10, None, 'Gflop/s'),
4344
},
4445
'ault:intelv100': {
45-
'perf': (5500, -0.10, None, 'Gflop/s'),
46+
'min_perf': (5500, -0.10, None, 'Gflop/s'),
4647
},
4748
'ault:amdvega': {
48-
'perf': (3450, -0.10, None, 'Gflop/s'),
49+
'min_perf': (3450, -0.10, None, 'Gflop/s'),
4950
},
50-
'*': {'temp': (0, None, None, 'degC')}
5151
}
5252

5353
maintainers = ['AJ', 'TM']
@@ -63,16 +63,25 @@ def set_num_gpus_per_node(self):
6363
hooks.set_num_gpus_per_node(self)
6464

6565
@run_before('performance')
66-
def report_nid_with_smallest_flops(self):
67-
regex = r'\[(\S+)\] GPU\s+\d\(OK\): (\d+) GF/s'
68-
rptf = os.path.join(self.stagedir, sn.evaluate(self.stdout))
69-
self.nids = sn.extractall(regex, rptf, 1)
70-
self.flops = sn.extractall(regex, rptf, 2, float)
66+
def report_slow_nodes(self):
67+
'''Report the base perf metrics and also all the slow nodes.'''
68+
69+
# Only report the nodes that don't meet the perf reference
70+
with osext.change_dir(self.stagedir):
71+
key = f'{self.current_partition.fullname}:min_perf'
72+
if key in self.reference:
73+
regex = r'\[(\S+)\] GPU\s+\d\(OK\): (\d+) GF/s'
74+
nids = set(sn.extractall(regex, self.stdout, 1))
75+
76+
# Get the references
77+
ref, lt, ut, *_ = self.reference[key]
78+
79+
# Flag the slow nodes
80+
for nid in nids:
81+
try:
82+
node_perf = self.min_perf(nid)
83+
val = node_perf.evaluate(cache=True)
84+
sn.assert_reference(val, ref, lt, ut).evaluate()
85+
except SanityError:
86+
self.perf_variables[nid] = node_perf
7187

72-
# Find index of smallest flops and update reference dictionary to
73-
# include our patched units
74-
index = self.flops.evaluate().index(min(self.flops))
75-
unit = f'GF/s ({self.nids[index]})'
76-
for key, ref in self.reference.items():
77-
if not key.endswith(':temp'):
78-
self.reference[key] = (*ref[:3], unit)

0 commit comments

Comments (0)