Skip to content

Commit 5dffa31

Browse files
author
Vasileios Karakasis
authored
Merge pull request #2133 from rngoodner/lsf
[feat] Add support for the LSF scheduler and LLNL's `lrun` parallel launcher
2 parents 7922c5e + 2158c15 commit 5dffa31

File tree

8 files changed

+186
-7
lines changed

8 files changed

+186
-7
lines changed

docs/config_reference.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ System Partition Configuration
218218
If not, you should consider using the ``squeue`` backend below.
219219
- ``squeue``: Jobs will be launched using the `Slurm <https://www.schedmd.com/>`__ scheduler.
220220
This backend does not rely on job accounting to retrieve job statuses, but ReFrame does its best to query the job state as reliably as possible.
221+
- ``lsf``: Jobs will be launched using the `LSF <https://www.ibm.com/docs/en/spectrum-lsf/>`__ scheduler.
221222

222223
.. versionadded:: 3.7.2
223224
Support for the SGE scheduler is added.
@@ -276,6 +277,8 @@ System Partition Configuration
276277
This is a custom parallel program launcher used at `TACC <https://portal.tacc.utexas.edu/user-guides/stampede2>`__.
277278
- ``local``: No parallel program launcher will be used.
278279
The program will be launched locally.
280+
- ``lrun``: Parallel programs will be launched using `LC Launcher <https://hpc.llnl.gov/training/tutorials/using-lcs-sierra-system#lrun>`__'s ``lrun`` command.
281+
- ``lrun-gpu``: Parallel programs will be launched using `LC Launcher <https://hpc.llnl.gov/training/tutorials/using-lcs-sierra-system#lrun>`__'s ``lrun -M "-gpu"`` command that enables the CUDA-aware Spectrum MPI.
279282
- ``mpirun``: Parallel programs will be launched using the ``mpirun`` command.
280283
- ``mpiexec``: Parallel programs will be launched using the ``mpiexec`` command.
281284
- ``srun``: Parallel programs will be launched using `Slurm <https://slurm.schedmd.com/srun.html>`__'s ``srun`` command.

reframe/core/backends.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
'reframe.core.schedulers.local',
2121
'reframe.core.schedulers.slurm',
2222
'reframe.core.schedulers.pbs',
23-
'reframe.core.schedulers.sge'
23+
'reframe.core.schedulers.sge',
24+
'reframe.core.schedulers.lsf'
2425
]
2526
_schedulers = {}
2627

reframe/core/launchers/mpi.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,3 +124,22 @@ def command(self, job):
124124
ret.append(opt)
125125

126126
return ret
127+
128+
129+
@register_launcher('lrun')
class LrunLauncher(JobLauncher):
    '''LLNL's custom parallel job launcher (``lrun``).

    Emits ``lrun -N <num_nodes> -T <tasks_per_node>`` derived from the
    job's task geometry.
    '''

    def command(self, job):
        # Default to one task per node when the job does not specify it.
        num_tasks_per_node = job.num_tasks_per_node or 1

        # Ceiling division: a task count that is not an exact multiple of
        # the tasks per node must still get enough nodes to place every
        # task (plain floor division would under-allocate, dropping tasks).
        num_nodes = -(-job.num_tasks // num_tasks_per_node)
        return ['lrun', '-N', str(num_nodes),
                '-T', str(num_tasks_per_node)]
138+
139+
140+
@register_launcher('lrun-gpu')
class LrungpuLauncher(LrunLauncher):
    '''LLNL's ``lrun`` launcher with CUDA-aware Spectrum MPI enabled.

    Identical to :class:`LrunLauncher`, but appends ``-M "-gpu"``, which
    turns on the CUDA-aware support of IBM Spectrum MPI.
    '''

    def command(self, job):
        # Take the standard lrun node/task options and add the Spectrum
        # MPI pass-through flag at the end.
        base_cmd = super().command(job)
        return [*base_cmd, '-M "-gpu"']

reframe/core/schedulers/lsf.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
2+
# ReFrame Project Developers. See the top-level LICENSE file for details.
3+
#
4+
# SPDX-License-Identifier: BSD-3-Clause
5+
6+
#
7+
# LSF backend
8+
#
9+
# - Initial version submitted by Ryan Goodner, UNM (based on PBS backend)
10+
#
11+
12+
import functools
13+
import re
14+
import time
15+
16+
import reframe.core.runtime as rt
17+
import reframe.utility.osext as osext
18+
from reframe.core.backends import register_scheduler
19+
from reframe.core.exceptions import JobSchedulerError
20+
from reframe.core.schedulers.pbs import PbsJobScheduler
21+
22+
_run_strict = functools.partial(osext.run_command, check=True)
23+
24+
25+
@register_scheduler('lsf')
class LsfJobScheduler(PbsJobScheduler):
    '''Scheduler backend for IBM Spectrum LSF.

    Reuses the PBS backend's overall job management, but overrides the
    preamble generation, submission, polling and completion check to use
    LSF's ``#BSUB`` directives and the ``bsub``/``bjobs`` commands.
    '''

    def __init__(self):
        # Directive prefix emitted before every scheduler option in the
        # generated job script (e.g. '#BSUB -J name').
        self._prefix = '#BSUB'

        # NOTE(review): PbsJobScheduler.__init__ is not called here; only
        # the attributes this backend needs are set up — confirm this
        # stays in sync with the base class's requirements.
        self._submit_timeout = rt.runtime().get_option(
            f'schedulers/@{self.registered_name}/job_submit_timeout'
        )

    def emit_preamble(self, job):
        '''Return the list of job-script preamble lines for *job*.

        Emits the ``#BSUB`` directives for job name, stdout/stderr files,
        node count, optional time limit and any extra options, followed by
        a ``cd`` into the job's working directory.
        '''
        # Derive the node count from the task geometry; floor division
        # assumes num_tasks is a multiple of num_tasks_per_node.
        num_tasks_per_node = job.num_tasks_per_node or 1
        num_nodes = job.num_tasks // num_tasks_per_node
        preamble = [
            self._format_option(f'-J {job.name}'),
            self._format_option(f'-o {job.stdout}'),
            self._format_option(f'-e {job.stderr}'),
            self._format_option(f'-nnodes {num_nodes}')
        ]

        # Add the job time limit in minutes (LSF's -W unit); time_limit is
        # presumably in seconds — TODO confirm against the Job interface.
        if job.time_limit is not None:
            preamble.append(
                self._format_option(f'-W {int(job.time_limit // 60)}')
            )

        # Emit the rest of the user-supplied options: lines that already
        # start with '#' (e.g. '#DW ...' DataWarp directives) are passed
        # through verbatim, everything else gets the '#BSUB ' prefix.
        options = job.options + job.cli_options
        for opt in options:
            if opt.startswith('#'):
                preamble.append(opt)
            else:
                preamble.append(self._format_option(opt))

        # Change to the working directory with a plain `cd`.
        preamble.append(f'cd {job.workdir}')

        return preamble

    def submit(self, job):
        '''Submit *job* with ``bsub`` and record its job id.

        :raises JobSchedulerError: if the job id cannot be parsed from the
            ``bsub`` output.
        '''
        cmd = f'bsub {job.script_filename}'
        completed = _run_strict(cmd, timeout=self._submit_timeout)

        # bsub reports: 'Job <id> is submitted to queue <q>.'
        jobid_match = re.search(r'^Job <(?P<jobid>\S+)> is submitted',
                                completed.stdout)
        if not jobid_match:
            raise JobSchedulerError('could not retrieve the job id '
                                    'of the submitted job')

        job._jobid = jobid_match.group('jobid')
        job._submit_time = time.time()

    def poll(self, *jobs):
        '''Query ``bjobs`` and update the state of each job in *jobs*.'''

        if jobs:
            # Filter out non-jobs.
            jobs = [job for job in jobs if job is not None]

        if not jobs:
            return

        # Single bjobs call covering all job ids at once.
        completed = _run_strict(
            f'bjobs -noheader {" ".join(job.jobid for job in jobs)}'
        )
        job_status = {}
        job_status_lines = completed.stdout.split('\n')

        # Parse the whitespace-separated bjobs columns:
        # JOBID USER STAT QUEUE ...; keep a jobid -> status map.
        for line in job_status_lines:
            job_regex = (r'(?P<jobid>\d+)\s+'
                         r'(?P<user>\S+)\s+'
                         r'(?P<status>\S+)\s+'
                         r'(?P<queue>\S+)')
            job_match = re.search(job_regex, line)
            if job_match:
                job_status[job_match['jobid']] = job_match['status']

        # Translate each LSF status into the scheduler-neutral job state.
        for job in jobs:
            if job.jobid not in job_status:
                # Job id no longer reported by bjobs: treat as finished.
                self.log(f'Job {job.jobid} not known to scheduler, '
                         f'assuming job completed')
                job._state = 'COMPLETED'
                job._completed = True
            elif job_status[job.jobid] in ('DONE', 'EXIT'):
                # Job finished (successfully or not).
                job._state = 'COMPLETED'
                job._completed = True
            elif job_status[job.jobid] == 'RUN':
                # Job running.
                job._state = 'RUNNING'
            elif job_status[job.jobid] == 'PEND':
                # Job pending in a queue.
                job._state = 'PENDING'
            elif job_status[job.jobid] in ['PSUSP', 'SSUSP', 'USUSP']:
                # Job suspended (by user, system or while pending).
                job._state = 'SUSPENDED'
            else:
                # Unrecognized status string: assume the job is done.
                self.log(f'Job {job_status[job.jobid]} not known, '
                         f'assuming job completed')
                job._state = 'COMPLETED'
                job._completed = True

    def finished(self, job):
        '''Return :obj:`True` if *job* has reached the COMPLETED state.

        Re-raises any exception previously recorded on the job.
        '''
        if job.exception:
            raise job.exception

        return job.state == 'COMPLETED'

reframe/core/schedulers/registry.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,4 @@ def getscheduler(name):
4040
import reframe.core.schedulers.pbs # noqa: F401, F403
4141
import reframe.core.schedulers.sge # noqa: F401, F403
4242
import reframe.core.schedulers.torque # noqa: F401, F403
43+
import reframe.core.schedulers.lsf # noqa: F401, F403

reframe/schemas/config.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -248,15 +248,15 @@
248248
"type": "string",
249249
"enum": [
250250
"local", "pbs", "slurm",
251-
"sge", "squeue", "torque"
251+
"sge", "squeue", "torque", "lsf"
252252
]
253253
},
254254
"launcher": {
255255
"type": "string",
256256
"enum": [
257257
"alps", "ibrun", "local", "mpirun",
258258
"mpiexec", "srun", "srunalloc", "ssh",
259-
"upcrun", "upcxx-run"
259+
"upcrun", "upcxx-run", "lrun", "lrun-gpu"
260260
]
261261
},
262262
"access": {
@@ -370,7 +370,7 @@
370370
"properties": {
371371
"name": {
372372
"type": "string",
373-
"enum": ["local", "pbs", "sge", "slurm", "squeue", "torque"]
373+
"enum": ["local", "pbs", "sge", "slurm", "squeue", "torque", "lsf"]
374374
},
375375
"ignore_reqnodenotavail": {"type": "boolean"},
376376
"resubmit_on_errors": {

unittests/test_launchers.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111

1212

1313
@pytest.fixture(params=[
14-
'alps', 'launcherwrapper', 'local', 'mpiexec',
15-
'mpirun', 'srun', 'srunalloc', 'ssh', 'upcrun', 'upcxx-run'
14+
'alps', 'launcherwrapper', 'local', 'mpiexec', 'mpirun',
15+
'srun', 'srunalloc', 'ssh', 'upcrun', 'upcxx-run', 'lrun', 'lrun-gpu'
1616
])
1717
def launcher(request):
1818
if request.param == 'launcherwrapper':
@@ -139,6 +139,10 @@ def test_run_command(job):
139139
assert command == 'upcrun -N 2 -n 4 --foo'
140140
elif launcher_name == 'upcxx-run':
141141
assert command == 'upcxx-run -N 2 -n 4 --foo'
142+
elif launcher_name == 'lrun':
143+
assert command == 'lrun -N 2 -T 2 --foo'
144+
elif launcher_name == 'lrun-gpu':
145+
assert command == 'lrun -N 2 -T 2 -M "-gpu" --foo'
142146

143147

144148
def test_run_command_minimal(minimal_job):
@@ -169,3 +173,7 @@ def test_run_command_minimal(minimal_job):
169173
assert command == 'upcrun -n 1 --foo'
170174
elif launcher_name == 'upcxx-run':
171175
assert command == 'upcxx-run -n 1 --foo'
176+
elif launcher_name == 'lrun':
177+
assert command == 'lrun -N 1 -T 1 --foo'
178+
elif launcher_name == 'lrun-gpu':
179+
assert command == 'lrun -N 1 -T 1 -M "-gpu" --foo'

unittests/test_schedulers.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ def launcher():
2626
return getlauncher('local')
2727

2828

29-
@pytest.fixture(params=['sge', 'slurm', 'squeue', 'local', 'pbs', 'torque'])
29+
@pytest.fixture(params=['sge', 'slurm', 'squeue', 'local',
30+
'pbs', 'torque', 'lsf'])
3031
def scheduler(request):
3132
return getscheduler(request.param)
3233

@@ -132,6 +133,23 @@ def assert_job_script_sanity(job):
132133
'echo postrun'] == matches
133134

134135

136+
def _expected_lsf_directives(job):
137+
num_tasks = job.num_tasks or 1
138+
num_tasks_per_node = job.num_tasks_per_node or 1
139+
num_nodes = int(num_tasks // num_tasks_per_node)
140+
return set([
141+
f'#BSUB -J testjob',
142+
f'#BSUB -o {job.stdout}',
143+
f'#BSUB -e {job.stderr}',
144+
f'#BSUB -nnodes {num_nodes}',
145+
f'#BSUB -W {int(job.time_limit // 60)}',
146+
f'#BSUB --account=spam',
147+
f'#BSUB --gres=gpu:4',
148+
f'#DW jobdw capacity=100GB',
149+
f'#DW stage_in source=/foo',
150+
])
151+
152+
135153
def _expected_sge_directives(job):
136154
num_nodes = job.num_tasks // job.num_tasks_per_node
137155
num_cpus_per_node = job.num_cpus_per_task * job.num_tasks_per_node

0 commit comments

Comments
 (0)