Skip to content

Commit 8f5f123

Browse files
author
Vasileios Karakasis
authored
Merge branch 'master' into feature/emit-image-name-gen-ci
2 parents 4f8a351 + 5dffa31 commit 8f5f123

File tree

11 files changed

+302
-152
lines changed

11 files changed

+302
-152
lines changed

cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py

Lines changed: 59 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -7,138 +7,66 @@
77
import reframe.utility.sanity as sn
88

99

10-
class StridedBase(rfm.RegressionTest):
11-
def __init__(self):
12-
self.sourcepath = 'strides.cpp'
13-
self.build_system = 'SingleSource'
14-
self.valid_systems = ['daint:gpu', 'dom:gpu', 'daint:mc', 'dom:mc',
15-
'eiger:mc', 'pilatus:mc']
16-
self.valid_prog_environs = ['PrgEnv-gnu']
17-
self.num_tasks = 1
18-
self.num_tasks_per_node = 1
19-
20-
self.sanity_patterns = sn.assert_eq(
21-
sn.count(sn.findall(r'bandwidth', self.stdout)),
22-
self.num_tasks_assigned)
23-
24-
self.perf_patterns = {
25-
'bandwidth': sn.extractsingle(
26-
r'bandwidth: (?P<bw>\S+) GB/s',
27-
self.stdout, 'bw', float)
28-
}
29-
30-
self.system_num_cpus = {
31-
'daint:mc': 72,
32-
'daint:gpu': 24,
33-
'dom:mc': 72,
34-
'dom:gpu': 24,
35-
'eiger:mc': 128,
36-
'pilatus:mc': 128
37-
}
38-
39-
self.maintainers = ['SK']
40-
self.tags = {'benchmark', 'diagnostic'}
41-
42-
@property
43-
@sn.sanity_function
44-
def num_tasks_assigned(self):
45-
return self.job.num_tasks
46-
47-
4810
@rfm.simple_test
class StridedBandwidthTest(rfm.RegressionTest):
    '''Strided memory-bandwidth microbenchmark.

    Builds and runs ``strides.cpp`` with a stride of 8, 64 or 128 bytes
    and compares the reported bandwidth against per-architecture
    reference values.
    '''

    sourcepath = 'strides.cpp'
    build_system = 'SingleSource'
    valid_systems = ['daint:gpu', 'dom:gpu', 'daint:mc', 'dom:mc',
                     'eiger:mc', 'pilatus:mc']
    valid_prog_environs = ['PrgEnv-gnu']
    num_tasks = 1
    num_tasks_per_node = 1
    maintainers = ['SK']
    tags = {'benchmark', 'diagnostic'}

    # Stride between consecutive accesses, in bytes; 8 bytes uses the
    # full cacheline, 64/128 touch only a fraction of each line.
    stride_bytes = parameter([8, 64, 128])

    # Reference bandwidth per stride and per processor architecture,
    # as (value, lower_threshold, upper_threshold, unit) tuples.
    reference_bw = {
        8: {
            'haswell': (50, -0.1, 0.1, 'GB/s'),
            'broadwell': (100, -0.1, 0.1, 'GB/s'),
            'zen2': (270, -0.1, 0.1, 'GB/s')
        },
        64: {
            'haswell': (6, -0.1, 0.2, 'GB/s'),
            'broadwell': (12.5, -0.1, 0.2, 'GB/s'),
            'zen2': (33, -0.1, 0.2, 'GB/s')
        },
        128: {
            'haswell': (4.5, -0.1, 0.2, 'GB/s'),
            'broadwell': (9.1, -0.1, 0.2, 'GB/s'),
            'zen2': (33, -0.1, 0.2, 'GB/s')
        },
    }

    @run_after('setup')
    def skip_if_no_topo(self):
        '''Skip when the partition carries no processor topology info.'''
        part = self.current_partition
        if not part.processor.info:
            self.skip(f'no topology information found for partition '
                      f'{part.fullname!r}')

    @sanity_function
    def assert_num_tasks(self):
        '''Every task must have reported a ``bandwidth`` line.'''
        num_reports = sn.count(sn.findall(r'bandwidth', self.stdout))
        return sn.assert_eq(num_reports, self.num_tasks)

    @performance_function('GB/s')
    def bandwidth(self):
        '''Extract the measured bandwidth from the benchmark output.'''
        return sn.extractsingle(r'bandwidth: (?P<bw>\S+) GB/s',
                                self.stdout, 'bw', float)

    @run_before('run')
    def set_exec_opts(self):
        '''Pass array size, stride (in 8-byte words) and CPU count.'''
        proc = self.current_partition.processor
        stride_words = self.stride_bytes // 8
        self.executable_opts = ['100000000', str(stride_words),
                                str(proc.num_cpus)]

    @run_before('performance')
    def set_reference(self):
        '''Select the reference matching the current processor arch.

        Architectures without an entry in ``reference_bw`` simply run
        without a performance reference.
        '''
        arch = self.current_partition.processor.arch
        ref = self.reference_bw[self.stride_bytes].get(arch)
        if ref is not None:
            self.reference = {'*': {'bandwidth': ref}}

docs/config_reference.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ System Partition Configuration
218218
If not, you should consider using the ``squeue`` backend below.
219219
- ``squeue``: Jobs will be launched using the `Slurm <https://www.schedmd.com/>`__ scheduler.
220220
This backend does not rely on job accounting to retrieve job statuses, but ReFrame does its best to query the job state as reliably as possible.
221+
- ``lsf``: Jobs will be launched using the `LSF <https://www.ibm.com/docs/en/spectrum-lsf/>`__ scheduler.
221222

222223
.. versionadded:: 3.7.2
223224
Support for the SGE scheduler is added.
@@ -276,6 +277,8 @@ System Partition Configuration
276277
This is a custom parallel program launcher used at `TACC <https://portal.tacc.utexas.edu/user-guides/stampede2>`__.
277278
- ``local``: No parallel program launcher will be used.
278279
The program will be launched locally.
280+
- ``lrun``: Parallel programs will be launched using `LC Launcher <https://hpc.llnl.gov/training/tutorials/using-lcs-sierra-system#lrun>`__'s ``lrun`` command.
281+
- ``lrun-gpu``: Parallel programs will be launched using `LC Launcher <https://hpc.llnl.gov/training/tutorials/using-lcs-sierra-system#lrun>`__'s ``lrun -M "-gpu"`` command that enables the CUDA-aware Spectrum MPI.
279282
- ``mpirun``: Parallel programs will be launched using the ``mpirun`` command.
280283
- ``mpiexec``: Parallel programs will be launched using the ``mpiexec`` command.
281284
- ``srun``: Parallel programs will be launched using `Slurm <https://slurm.schedmd.com/srun.html>`__'s ``srun`` command.

reframe/core/backends.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
'reframe.core.schedulers.local',
2121
'reframe.core.schedulers.slurm',
2222
'reframe.core.schedulers.pbs',
23-
'reframe.core.schedulers.sge'
23+
'reframe.core.schedulers.sge',
24+
'reframe.core.schedulers.lsf'
2425
]
2526
_schedulers = {}
2627

reframe/core/launchers/mpi.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,3 +124,22 @@ def command(self, job):
124124
ret.append(opt)
125125

126126
return ret
127+
128+
129+
@register_launcher('lrun')
class LrunLauncher(JobLauncher):
    '''LLNL's custom parallel job launcher (``lrun``).'''

    def command(self, job):
        '''Build the ``lrun`` command line for *job*.

        Derives the node count from the total number of tasks and the
        tasks per node (defaulting to one task per node).
        '''
        tasks_per_node = job.num_tasks_per_node or 1
        num_nodes = job.num_tasks // tasks_per_node
        return ['lrun', '-N', str(num_nodes), '-T', str(tasks_per_node)]


@register_launcher('lrun-gpu')
class LrungpuLauncher(LrunLauncher):
    '''``lrun`` variant enabling the CUDA-aware Spectrum MPI.

    Appends ``-M "-gpu"`` to the plain ``lrun`` command line.
    '''

    def command(self, job):
        return super().command(job) + ['-M "-gpu"']

reframe/core/schedulers/lsf.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
# ReFrame Project Developers. See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: BSD-3-Clause

#
# LSF backend
#
# - Initial version submitted by Ryan Goodner, UNM (based on PBS backend)
#

import functools
import re
import time

import reframe.core.runtime as rt
import reframe.utility.osext as osext
from reframe.core.backends import register_scheduler
from reframe.core.exceptions import JobSchedulerError
from reframe.core.schedulers.pbs import PbsJobScheduler

_run_strict = functools.partial(osext.run_command, check=True)


@register_scheduler('lsf')
class LsfJobScheduler(PbsJobScheduler):
    '''Job scheduler backend for IBM Spectrum LSF.

    Jobs are submitted with ``bsub`` and polled with ``bjobs``; the
    generated submission script uses ``#BSUB`` directives.
    '''

    # Matches one `bjobs -noheader` output line: jobid, user, status,
    # queue.  Compiled once here instead of per polled line.
    _bjobs_line = re.compile(r'(?P<jobid>\d+)\s+'
                             r'(?P<user>\S+)\s+'
                             r'(?P<status>\S+)\s+'
                             r'(?P<queue>\S+)')

    def __init__(self):
        # NOTE(review): does not call super().__init__(); assumes the
        # PBS base initializer sets nothing else we rely on -- confirm.
        self._prefix = '#BSUB'
        self._submit_timeout = rt.runtime().get_option(
            f'schedulers/@{self.registered_name}/job_submit_timeout'
        )

    def emit_preamble(self, job):
        '''Return the ``#BSUB`` preamble lines for *job*'s script.'''
        num_tasks_per_node = job.num_tasks_per_node or 1
        num_nodes = job.num_tasks // num_tasks_per_node

        preamble = [
            self._format_option(f'-J {job.name}'),
            self._format_option(f'-o {job.stdout}'),
            self._format_option(f'-e {job.stderr}'),
            self._format_option(f'-nnodes {num_nodes}')
        ]

        # LSF expects the time limit in minutes
        if job.time_limit is not None:
            preamble.append(
                self._format_option(f'-W {int(job.time_limit // 60)}')
            )

        # Emit the rest of the options; anything already starting with
        # '#' is passed through verbatim.
        options = job.options + job.cli_options
        for opt in options:
            if opt.startswith('#'):
                preamble.append(opt)
            else:
                preamble.append(self._format_option(opt))

        # Change to the working directory before running the job
        preamble.append(f'cd {job.workdir}')

        return preamble

    def submit(self, job):
        '''Submit *job* with ``bsub`` and record its job id.

        :raises JobSchedulerError: if the job id cannot be parsed from
            the ``bsub`` output.
        '''
        cmd = f'bsub {job.script_filename}'
        completed = _run_strict(cmd, timeout=self._submit_timeout)
        jobid_match = re.search(r'^Job <(?P<jobid>\S+)> is submitted',
                                completed.stdout)
        if not jobid_match:
            raise JobSchedulerError('could not retrieve the job id '
                                    'of the submitted job')

        job._jobid = jobid_match.group('jobid')
        job._submit_time = time.time()

    def poll(self, *jobs):
        '''Update the state of *jobs* from a single ``bjobs`` query.'''
        if jobs:
            # filter out non-jobs
            jobs = [job for job in jobs if job is not None]

        if not jobs:
            return

        completed = _run_strict(
            f'bjobs -noheader {" ".join(job.jobid for job in jobs)}'
        )

        # Map each reported job id to its LSF status string
        job_status = {}
        for line in completed.stdout.split('\n'):
            job_match = self._bjobs_line.search(line)
            if job_match:
                job_status[job_match['jobid']] = job_match['status']

        for job in jobs:
            status = job_status.get(job.jobid)
            if status is None:
                # Job id no longer reported by the scheduler
                self.log(f'Job {job.jobid} not known to scheduler, '
                         f'assuming job completed')
                job._state = 'COMPLETED'
                job._completed = True
            elif status in ('DONE', 'EXIT'):
                job._state = 'COMPLETED'
                job._completed = True
            elif status == 'RUN':
                job._state = 'RUNNING'
            elif status == 'PEND':
                job._state = 'PENDING'
            elif status in ('PSUSP', 'SSUSP', 'USUSP'):
                job._state = 'SUSPENDED'
            else:
                # Unrecognized LSF status; treat conservatively as done.
                # (Message fixed: the original printed the raw status as
                # if it were a job name.)
                self.log(f'Job state {status!r} not known, '
                         f'assuming job completed')
                job._state = 'COMPLETED'
                job._completed = True

    def finished(self, job):
        '''Return ``True`` if *job* has reached a terminal state.'''
        if job.exception:
            raise job.exception

        return job.state == 'COMPLETED'

reframe/core/schedulers/registry.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,4 @@ def getscheduler(name):
4040
import reframe.core.schedulers.pbs # noqa: F401, F403
4141
import reframe.core.schedulers.sge # noqa: F401, F403
4242
import reframe.core.schedulers.torque # noqa: F401, F403
43+
import reframe.core.schedulers.lsf # noqa: F401, F403

0 commit comments

Comments
 (0)