Skip to content

Commit 792939c

Browse files
author
Vasileios Karakasis
authored
Merge pull request #2129 from mahendrapaipuri/feature/oar-scheduler
[feat] Add support for the OAR scheduler
2 parents aacf20f + ca1a9af commit 792939c

File tree

7 files changed

+226
-14
lines changed

7 files changed

+226
-14
lines changed

docs/config_reference.rst

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -210,19 +210,22 @@ System Partition Configuration
210210
Supported schedulers are the following:
211211

212212
- ``local``: Jobs will be launched locally without using any job scheduler.
213+
- ``oar``: Jobs will be launched using the `OAR <https://oar.imag.fr/>`__ scheduler.
213214
- ``pbs``: Jobs will be launched using the `PBS Pro <https://en.wikipedia.org/wiki/Portable_Batch_System>`__ scheduler.
214-
- ``torque``: Jobs will be launched using the `Torque <https://en.wikipedia.org/wiki/TORQUE>`__ scheduler.
215215
- ``sge``: Jobs will be launched using the `Sun Grid Engine <https://arc.liv.ac.uk/SGE/htmlman/manuals.html>`__ scheduler.
216216
- ``slurm``: Jobs will be launched using the `Slurm <https://www.schedmd.com/>`__ scheduler.
217217
This backend requires job accounting to be enabled in the target system.
218218
If not, you should consider using the ``squeue`` backend below.
219219
- ``squeue``: Jobs will be launched using the `Slurm <https://www.schedmd.com/>`__ scheduler.
220220
This backend does not rely on job accounting to retrieve job statuses, but ReFrame does its best to query the job state as reliably as possible.
221-
- ``lsf``: Jobs will be launched using the `LSF <https://www.ibm.com/docs/en/spectrum-lsf/>`__ scheduler.
221+
- ``torque``: Jobs will be launched using the `Torque <https://en.wikipedia.org/wiki/TORQUE>`__ scheduler.
222222

223223
.. versionadded:: 3.7.2
224224
Support for the SGE scheduler is added.
225225

226+
.. versionadded:: 3.8.2
227+
Support for the OAR scheduler is added.
228+
226229
.. note::
227230

228231
The way that multiple node jobs are submitted using the SGE scheduler can be very site-specific.

docs/tutorial_tips_tricks.rst

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -552,8 +552,10 @@ The following is an example of ``.gitlab-ci.yml`` file that does exactly that:
552552
It defines two stages.
553553
The first one, called ``generate``, will call ReFrame to generate the pipeline specification for the desired tests.
554554
All the usual `test selection options <manpage.html#test-filtering>`__ can be used to select specific tests.
555-
ReFrame will process them as usual, but instead of running the selected tests, it will generate the correct steps for running each test individually as a Gitlab job.
556-
We then pass the generated CI pipeline file to second phase as an artifact and we are done!
555+
ReFrame will process them as usual, but instead of running the selected tests, it will generate the correct steps
556+
for running each test individually as a Gitlab job. We then pass the generated CI pipeline file to the second phase as
557+
an artifact and we are done! If the ``image`` keyword is defined in ``.gitlab-ci.yml``, the emitted pipeline will use
558+
the same image as the one defined in the parent pipeline.
557559

558560
The following figure shows one part of the automatically generated pipeline for the test graph depicted `above <#fig-deps-complex>`__.
559561

reframe/core/backends.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,11 @@
1818
_launchers = {}
1919
_scheduler_backend_modules = [
2020
'reframe.core.schedulers.local',
21-
'reframe.core.schedulers.slurm',
21+
'reframe.core.schedulers.lsf',
2222
'reframe.core.schedulers.pbs',
23+
'reframe.core.schedulers.oar',
2324
'reframe.core.schedulers.sge',
24-
'reframe.core.schedulers.lsf'
25+
'reframe.core.schedulers.slurm'
2526
]
2627
_schedulers = {}
2728

reframe/core/schedulers/oar.py

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
2+
# ReFrame Project Developers. See the top-level LICENSE file for details.
3+
#
4+
# SPDX-License-Identifier: BSD-3-Clause
5+
6+
#
7+
# OAR backend
8+
#
9+
# - Initial version submitted by Mahendra Paipuri, INRIA
10+
#
11+
12+
import functools
13+
import os
14+
import re
15+
import time
16+
17+
import reframe.core.runtime as rt
18+
import reframe.utility.osext as osext
19+
from reframe.core.backends import register_scheduler
20+
from reframe.core.exceptions import JobError, JobSchedulerError
21+
from reframe.core.schedulers.pbs import PbsJobScheduler
22+
from reframe.utility import seconds_to_hms
23+
24+
25+
# States can be found here:
26+
# https://github.com/oar-team/oar/blob/0fccc4fc3bb86ee935ce58effc5aec514a3e155d/sources/core/qfunctions/oarstat#L293
27+
def oar_state_completed(state):
    '''Return ``True`` iff every comma-separated state in ``state`` is a
    terminal OAR state (``Error`` or ``Terminated``).

    A falsy ``state`` (``None`` or empty string) yields ``False``.
    '''
    if not state:
        return False

    terminal_states = {'Error', 'Terminated'}
    return set(state.split(',')) <= terminal_states
36+
37+
38+
def oar_state_pending(state):
    '''Return ``True`` if any comma-separated state in ``state`` is a
    non-terminal (queued or active) OAR state.

    A falsy ``state`` (``None`` or empty string) yields ``False``.
    '''
    if not state:
        return False

    active_states = {
        'Waiting', 'toLaunch', 'Launching', 'Hold', 'Running',
        'toError', 'Finishing', 'Suspended', 'Resuming',
    }
    return not active_states.isdisjoint(state.split(','))
54+
55+
56+
# Shorthand for running a command with ``check=True``, so that a failing
# command raises instead of being silently ignored (exact exception type
# is defined by ``osext.run_command``).
_run_strict = functools.partial(osext.run_command, check=True)
57+
58+
59+
@register_scheduler('oar')
class OarJobScheduler(PbsJobScheduler):
    '''Job scheduler backend for the OAR scheduler.

    Jobs are submitted in batch mode with ``oarsub -S``, cancelled with
    ``oardel`` and polled with ``oarstat -fj``; the generated batch script
    uses ``#OAR`` directives.
    '''

    # In OAR's resource hierarchy, `host` is de-facto nodes and `core` is
    # the number of cores requested per node; the number of sockets can
    # also be specified using cpu={num_sockets}.
    TASKS_OPT = '-l /host={num_nodes}/core={num_tasks_per_node}'

    def __init__(self):
        self._prefix = '#OAR'
        self._submit_timeout = rt.runtime().get_option(
            f'schedulers/@{self.registered_name}/job_submit_timeout'
        )

    def emit_preamble(self, job):
        '''Return the list of ``#OAR`` directives for ``job``'s script.'''

        # Same reason as in submit(): give the full path to the output and
        # error files to avoid OAR writing them in its launch directory
        # instead of the stage directory.
        preamble = [
            self._format_option(f'-n "{job.name}"'),
            self._format_option(f'-O {os.path.join(job.workdir, job.stdout)}'),
            self._format_option(f'-E {os.path.join(job.workdir, job.stderr)}'),
        ]

        # Build the resource reservation option in a local variable;
        # augmenting `self.TASKS_OPT` in place (as a previous version did)
        # accumulates a `,walltime=...` suffix on every call, corrupting
        # the directive when emit_preamble() runs more than once.
        tasks_opt = self.TASKS_OPT
        if job.time_limit is not None:
            h, m, s = seconds_to_hms(job.time_limit)
            tasks_opt += ',walltime=%d:%d:%d' % (h, m, s)

        # Get number of nodes in the reservation
        num_tasks_per_node = job.num_tasks_per_node or 1
        num_nodes = job.num_tasks // num_tasks_per_node

        # Emit main resource reservation option
        options = [
            tasks_opt.format(
                num_nodes=num_nodes, num_tasks_per_node=num_tasks_per_node
            )
        ]

        # Emit the rest of the options; options that already look like
        # raw directives (starting with '#') are passed through verbatim.
        options += job.sched_access + job.options + job.cli_options
        for opt in options:
            if opt.startswith('#'):
                preamble.append(opt)
            else:
                preamble.append(self._format_option(opt))

        # OAR starts the job in the home directory by default
        preamble.append(f'cd {job.workdir}')
        return preamble

    def submit(self, job):
        '''Submit ``job`` with ``oarsub`` and record its job id.

        :raises JobSchedulerError: if the job id cannot be parsed from the
            ``oarsub`` output.
        '''

        # For some reason the OAR job manager takes the job launching dir
        # to be the working dir of the repo and not the stage dir; a
        # workaround is to give the full path of the script to oarsub.
        job_script_fullpath = os.path.join(job.workdir, job.script_filename)

        # OAR needs -S to submit the job in batch mode
        cmd = f'oarsub -S {job_script_fullpath}'
        completed = _run_strict(cmd, timeout=self._submit_timeout)
        jobid_match = re.search(r'.*OAR_JOB_ID=(?P<jobid>\S+)',
                                completed.stdout)
        if not jobid_match:
            raise JobSchedulerError('could not retrieve the job id '
                                    'of the submitted job')

        job._jobid = jobid_match.group('jobid')
        job._submit_time = time.time()

    def cancel(self, job):
        '''Cancel ``job`` with ``oardel``.'''
        _run_strict(f'oardel {job.jobid}', timeout=self._submit_timeout)
        job._cancelled = True

    def poll(self, *jobs):
        '''Update the state of ``jobs`` by querying ``oarstat``.'''
        if jobs:
            # Filter out non-jobs
            jobs = [job for job in jobs if job is not None]

        if not jobs:
            return

        for job in jobs:
            # NOTE(review): `oarstat` may exit non-zero for an unknown job
            # id, in which case _run_strict raises before the "job not
            # known" fallback below is reached — confirm on a live system.
            completed = _run_strict(f'oarstat -fj {job.jobid}')

            # Store information for each job separately
            jobinfo = {}

            # Typical oarstat -fj <job_id> output:
            # https://github.com/oar-team/oar/blob/0fccc4fc3bb86ee935ce58effc5aec514a3e155d/sources/core/qfunctions/oarstat#L310
            job_raw_info = completed.stdout
            jobid_match = re.search(
                r'^Job_Id:\s*(?P<jobid>\S+)', completed.stdout, re.MULTILINE
            )
            if jobid_match:
                jobid = jobid_match.group('jobid')
                jobinfo[jobid] = job_raw_info

            if job.jobid not in jobinfo:
                self.log(f'Job {job.jobid} not known to scheduler, '
                         f'assuming job completed')
                job._state = 'Terminated'
                job._completed = True
                continue

            info = jobinfo[job.jobid]

            # Match any non-space state token: OAR states such as
            # 'toLaunch' and 'toError' (see oar_state_pending) start with
            # a lowercase letter, so requiring a leading [A-Z] here would
            # never match them and their pending-time handling below
            # could never trigger.
            state_match = re.search(
                r'^\s*state = (?P<state>\S+)', info, re.MULTILINE
            )
            if not state_match:
                self.log(f'Job state not found (job info follows):\n{info}')
                continue

            job._state = state_match.group('state')
            if oar_state_completed(job.state):
                exitcode_match = re.search(
                    r'^\s*exit_code = (?P<code>\d+)',
                    info, re.MULTILINE,
                )
                if exitcode_match:
                    job._exitcode = int(exitcode_match.group('code'))

                # We report a job as finished only when its stdout/stderr
                # are written back to the working directory
                stdout = os.path.join(job.workdir, job.stdout)
                stderr = os.path.join(job.workdir, job.stderr)
                out_ready = os.path.exists(stdout) and os.path.exists(stderr)
                if job.cancelled or out_ready:
                    job._completed = True
            elif oar_state_pending(job.state) and job.max_pending_time:
                if time.time() - job.submit_time >= job.max_pending_time:
                    self.cancel(job)
                    job._exception = JobError('maximum pending time exceeded',
                                              job.jobid)

reframe/core/schedulers/registry.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ def getscheduler(name):
3636

3737
# Import the schedulers modules to trigger their registration
3838
import reframe.core.schedulers.local # noqa: F401, F403
39-
import reframe.core.schedulers.slurm # noqa: F401, F403
39+
import reframe.core.schedulers.lsf # noqa: F401, F403
40+
import reframe.core.schedulers.oar # noqa: F401, F403
4041
import reframe.core.schedulers.pbs # noqa: F401, F403
4142
import reframe.core.schedulers.sge # noqa: F401, F403
42-
import reframe.core.schedulers.torque # noqa: F401, F403
43-
import reframe.core.schedulers.lsf # noqa: F401, F403
43+
import reframe.core.schedulers.slurm # noqa: F401, F403

reframe/schemas/config.json

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -247,8 +247,8 @@
247247
"scheduler": {
248248
"type": "string",
249249
"enum": [
250-
"local", "pbs", "slurm",
251-
"sge", "squeue", "torque", "lsf"
250+
"local", "lsf", "oar", "pbs",
251+
"sge", "slurm", "squeue", "torque"
252252
]
253253
},
254254
"launcher": {
@@ -370,7 +370,8 @@
370370
"properties": {
371371
"name": {
372372
"type": "string",
373-
"enum": ["local", "pbs", "sge", "slurm", "squeue", "torque", "lsf"]
373+
"enum": ["local", "lsf", "oar", "pbs",
374+
"sge", "slurm", "squeue", "torque"]
374375
},
375376
"ignore_reqnodenotavail": {"type": "boolean"},
376377
"resubmit_on_errors": {

unittests/test_schedulers.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ def launcher():
2626
return getlauncher('local')
2727

2828

29-
@pytest.fixture(params=['sge', 'slurm', 'squeue', 'local',
30-
'pbs', 'torque', 'lsf'])
29+
@pytest.fixture(params=['local', 'lsf', 'oar', 'pbs',
30+
'sge', 'slurm', 'squeue', 'torque'])
3131
def scheduler(request):
3232
return getscheduler(request.param)
3333

@@ -226,6 +226,21 @@ def _expected_torque_directives(job):
226226
])
227227

228228

229+
def _expected_oar_directives(job):
230+
num_nodes = job.num_tasks // job.num_tasks_per_node
231+
num_tasks_per_node = job.num_tasks_per_node
232+
return set([
233+
f'#OAR -n "testjob"',
234+
f'#OAR -O {job.stdout}',
235+
f'#OAR -E {job.stderr}',
236+
f'#OAR -l /host={num_nodes}/core={num_tasks_per_node},walltime=0:5:0',
237+
f'#OAR --account=spam',
238+
f'#OAR --gres=gpu:4',
239+
f'#DW jobdw capacity=100GB',
240+
f'#DW stage_in source=/foo'
241+
])
242+
243+
229244
def _expected_local_directives(job):
230245
return set()
231246

0 commit comments

Comments
 (0)