Skip to content

Commit 38e94a2

Browse files
author
Vasileios Karakasis
authored
Merge pull request #1172 from smoors/torque
[feat] Add support for the Torque scheduler
2 parents 4d8190e + 85203ad commit 38e94a2

File tree

7 files changed

+327
-46
lines changed

7 files changed

+327
-46
lines changed

ci-scripts/ci-runner.bash

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -187,14 +187,15 @@ else
187187

188188
checked_exec ./test_reframe.py --rfm-user-config=config/cscs-ci.py
189189

190-
echo "==================================="
191-
echo "Running unit tests with PBS backend"
192-
echo "==================================="
193-
194190
if [[ $(hostname) =~ dom ]]; then
195191
PATH_save=$PATH
196-
export PATH=/apps/dom/UES/karakasv/slurm-wrappers/bin:$PATH
197-
checked_exec ./test_reframe.py --rfm-user-config=config/cscs-pbs.py
192+
for backend in pbs torque; do
193+
echo "=================================="
194+
echo "Running unit tests with ${backend}"
195+
echo "=================================="
196+
export PATH=/apps/dom/UES/karakasv/slurm-wrappers/bin:$PATH
197+
checked_exec ./test_reframe.py --rfm-user-config=config/cscs-${backend}.py
198+
done
198199
export PATH=$PATH_save
199200
fi
200201

config/cscs-torque.py

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
# Copyright 2016-2020 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
2+
# ReFrame Project Developers. See the top-level LICENSE file for details.
3+
#
4+
# SPDX-License-Identifier: BSD-3-Clause
5+
6+
#
7+
# Minimal CSCS configuration for testing the Torque backend
8+
#
9+
10+
11+
class ReframeSettings:
    """Minimal CSCS site settings for testing the Torque scheduler backend.

    The ``dom`` partitions are scheduled through ``torque+mpiexec``; a
    ``generic`` system is kept as a fallback with a purely local scheduler.
    """

    # Seconds to wait between successive job-state polls, and the
    # maximum time to wait for a job submission to be accepted.
    job_poll_intervals = [1, 2, 3]
    job_submit_timeout = 60

    # Where regression checks are looked up (searched recursively).
    checks_path = ['checks/']
    checks_path_recurse = True

    site_configuration = {
        'systems': {
            'dom': {
                'descr': 'Dom TDS',
                'hostnames': ['dom'],
                'modules_system': 'tmod',
                'resourcesdir': '/apps/common/UES/reframe/resources',
                'partitions': {
                    'login': {
                        'scheduler': 'local',
                        'modules': [],
                        'access': [],
                        'environs': ['PrgEnv-cray', 'PrgEnv-gnu',
                                     'PrgEnv-intel', 'PrgEnv-pgi'],
                        'descr': 'Login nodes',
                        'max_jobs': 4
                    },

                    'gpu': {
                        # Submit through Torque, launch tasks with mpiexec.
                        'scheduler': 'torque+mpiexec',
                        'modules': ['daint-gpu'],
                        'access': ['-l proc=gpu'],
                        'environs': ['PrgEnv-cray', 'PrgEnv-gnu',
                                     'PrgEnv-intel', 'PrgEnv-pgi'],
                        'descr': 'Hybrid nodes (Haswell/P100)',
                        'max_jobs': 100,
                    },

                    'mc': {
                        'scheduler': 'torque+mpiexec',
                        'modules': ['daint-mc'],
                        'access': ['-l proc=mc'],
                        'environs': ['PrgEnv-cray', 'PrgEnv-gnu',
                                     'PrgEnv-intel', 'PrgEnv-pgi'],
                        'descr': 'Multicore nodes (Broadwell)',
                        'max_jobs': 100,
                    },
                }
            },

            # Fallback system definition so ReFrame can still run on
            # hosts that do not match the 'dom' hostname pattern.
            'generic': {
                'descr': 'Generic example system',
                'partitions': {
                    'login': {
                        'scheduler': 'local',
                        'modules': [],
                        'access': [],
                        'environs': ['builtin-gcc'],
                        'descr': 'Login nodes'
                    }
                }
            }
        },

        'environments': {
            # The '*' key makes these environments available on all systems.
            '*': {
                'PrgEnv-cray': {
                    'modules': ['PrgEnv-cray'],
                },

                'PrgEnv-gnu': {
                    'modules': ['PrgEnv-gnu'],
                },

                'PrgEnv-intel': {
                    'modules': ['PrgEnv-intel'],
                },

                'PrgEnv-pgi': {
                    'modules': ['PrgEnv-pgi'],
                },

                'builtin': {
                    'cc': 'cc',
                    'cxx': '',
                    'ftn': '',
                },

                'builtin-gcc': {
                    'cc': 'gcc',
                    'cxx': 'g++',
                    'ftn': 'gfortran',
                }
            }
        },
    }

    logging_config = {
        'level': 'DEBUG',
        'handlers': [
            # Full debug log on disk, truncated at the start of each run.
            {
                'type': 'file',
                'name': 'reframe.log',
                'level': 'DEBUG',
                'format': '[%(asctime)s] %(levelname)s: '
                          '%(check_info)s: %(message)s',
                'append': False,
            },

            # Output handling
            {
                'type': 'stream',
                'name': 'stdout',
                'level': 'INFO',
                'format': '%(message)s'
            },
            {
                'type': 'file',
                'name': 'reframe.out',
                'level': 'INFO',
                'format': '%(message)s',
                'append': False,
            }
        ]
    }

    # Performance-data logging: one '|'-separated record per performance
    # variable, appended under a per-system/partition file prefix.
    perf_logging_config = {
        'level': 'DEBUG',
        'handlers': [
            {
                'type': 'filelog',
                'prefix': '%(check_system)s/%(check_partition)s',
                'level': 'INFO',
                'format': (
                    '%(check_job_completion_time)s|reframe %(version)s|'
                    '%(check_info)s|jobid=%(check_jobid)s|'
                    'num_tasks=%(check_num_tasks)s|'
                    '%(check_perf_var)s=%(check_perf_value)s|'
                    'ref=%(check_perf_ref)s '
                    '(l=%(check_perf_lower_thres)s, '
                    'u=%(check_perf_upper_thres)s)'
                ),
                'datefmt': '%FT%T%:z',
                'append': True
            }
        ]
    }


# Module-level settings instance picked up by ReFrame.
settings = ReframeSettings()

docs/configure.rst

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ The available partition attributes are the following:
248248
}
249249
250250
.. note::
251-
For the `PBS <#supported-scheduler-backends>`__ backend, options accepted in the ``access`` and ``resources`` attributes may either refer to actual ``qsub`` options or be just resources specifications to be passed to the ``-l select`` option.
251+
For the `PBS <#supported-scheduler-backends>`__ and `Torque <#supported-scheduler-backends>`__ backends, options accepted in the ``access`` and ``resources`` attributes may either refer to actual ``qsub`` options or be plain resource specifications to be passed to the ``-l`` option.
252252
The backend assumes a ``qsub`` option, if the options passed in these attributes start with a ``-``.
253253

254254
.. note::
@@ -280,7 +280,8 @@ ReFrame supports the following job schedulers:
280280
The job status is obtained using the ``squeue`` command.
281281
This scheduler is less reliable than the one based on the ``sacct`` command, but the framework does its best to query the job state as reliably as possible.
282282

283-
* ``pbs``: *[new in 2.13]* Jobs on the configured partition will be launched using a `PBS-based <https://en.wikipedia.org/wiki/Portable_Batch_System>`__ scheduler.
283+
* ``pbs``: *[new in 2.13]* Jobs on the configured partition will be launched using the `PBS Pro <https://en.wikipedia.org/wiki/Portable_Batch_System>`__ scheduler.
284+
* ``torque``: *[new in 3.0]* Jobs on the configured partition will be launched using the `Torque <https://en.wikipedia.org/wiki/TORQUE>`__ scheduler.
284285
* ``local``: Jobs on the configured partition will be launched locally as OS processes.
285286

286287

reframe/core/schedulers/pbs.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@
3535

3636
@register_scheduler('pbs')
3737
class PbsJobScheduler(sched.JobScheduler):
38+
TASKS_OPT = ('-l select={num_nodes}:mpiprocs={num_tasks_per_node}'
39+
':ncpus={num_cpus_per_node}')
40+
3841
def __init__(self):
3942
self._prefix = '#PBS'
4043
self._time_finished = None
@@ -50,9 +53,11 @@ def _emit_lselect_option(self, job):
5053
num_cpus_per_task = job.num_cpus_per_task or 1
5154
num_nodes = job.num_tasks // num_tasks_per_node
5255
num_cpus_per_node = num_tasks_per_node * num_cpus_per_task
53-
select_opt = '-l select=%s:mpiprocs=%s:ncpus=%s' % (num_nodes,
54-
num_tasks_per_node,
55-
num_cpus_per_node)
56+
select_opt = self.TASKS_OPT.format(
57+
num_nodes=num_nodes,
58+
num_tasks_per_node=num_tasks_per_node,
59+
num_cpus_per_node=num_cpus_per_node
60+
)
5661

5762
# Options starting with `-` are emitted in separate lines
5863
rem_opts = []
@@ -112,7 +117,7 @@ def submit(self, job):
112117
raise JobError('could not retrieve the job id '
113118
'of the submitted job')
114119

115-
jobid, *info = jobid_match.group('jobid').split('.', maxsplit=2)
120+
jobid, *info = jobid_match.group('jobid').split('.', maxsplit=1)
116121
job.jobid = int(jobid)
117122
if info:
118123
self._pbs_server = info[0]

reframe/core/schedulers/registry.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def getscheduler(name):
3535

3636

3737
# Import the schedulers modules to trigger their registration
38-
import reframe.core.schedulers.local # noqa: F401, F403
39-
import reframe.core.schedulers.slurm # noqa: F401, F403
40-
import reframe.core.schedulers.pbs # noqa: F401, F403
38+
import reframe.core.schedulers.local # noqa: F401, F403
39+
import reframe.core.schedulers.slurm # noqa: F401, F403
40+
import reframe.core.schedulers.pbs # noqa: F401, F403
41+
import reframe.core.schedulers.torque # noqa: F401, F403

reframe/core/schedulers/torque.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
# Copyright 2016-2020 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
2+
# ReFrame Project Developers. See the top-level LICENSE file for details.
3+
#
4+
# SPDX-License-Identifier: BSD-3-Clause
5+
#
6+
# Torque backend
7+
#
8+
# - Initial version submitted by Samuel Moors, Vrije Universiteit Brussel (VUB)
9+
#
10+
11+
import re
12+
13+
import reframe.utility.os_ext as os_ext
14+
from reframe.core.config import settings
15+
from reframe.core.exceptions import JobError
16+
from reframe.core.logging import getlogger
17+
from reframe.core.schedulers.pbs import PbsJobScheduler, _run_strict
18+
from reframe.core.schedulers.registry import register_scheduler
19+
20+
21+
# Mapping of the single-letter `job_state` codes parsed from `qstat -f`
# output to the state names stored on the job object; 'COMPLETED' is the
# value that finished() checks for.
JOB_STATES = {
    'Q': 'QUEUED',
    'H': 'HELD',
    'R': 'RUNNING',
    'E': 'EXITING',
    'T': 'MOVED',
    'W': 'WAITING',
    'S': 'SUSPENDED',
    'C': 'COMPLETED',
}
31+
32+
33+
@register_scheduler('torque')
class TorqueJobScheduler(PbsJobScheduler):
    """Torque backend built on top of the PBS scheduler backend.

    Differs from PBS in how task placement is requested (``nodes``/``ppn``
    instead of a ``select`` statement) and in how job state is polled.
    """

    # Template used by the PBS base class when emitting the task options.
    TASKS_OPT = '-l nodes={num_nodes}:ppn={num_cpus_per_node}'

    def _update_state(self, job):
        '''Check the status of the job.'''

        ret = os_ext.run_command('qstat -f %s' % job.jobid)

        # Depending on the configuration, completed jobs will remain on the job
        # list for a limited time, or be removed upon completion.
        # If qstat cannot find the jobid, it returns code 153.
        if ret.returncode == 153:
            getlogger().debug(
                'jobid not known by scheduler, assuming job completed'
            )
            job.state = 'COMPLETED'
            return

        if ret.returncode != 0:
            raise JobError('qstat failed: %s' % ret.stderr, job.jobid)

        state_m = re.search(
            r'^\s*job_state = (?P<state>[A-Z])', ret.stdout, re.MULTILINE
        )
        if state_m is None:
            # Leave job.state untouched; we could not parse the output.
            getlogger().debug(
                'job state not found (stdout follows)\n%s' % ret.stdout
            )
            return

        job.state = JOB_STATES[state_m.group('state')]
        if job.state != 'COMPLETED':
            return

        # For completed jobs, also try to pick up the exit code.
        exit_m = re.search(
            r'^\s*exit_status = (?P<code>\d+)', ret.stdout, re.MULTILINE
        )
        if exit_m is not None:
            job.exitcode = int(exit_m.group('code'))

    def finished(self, job):
        """Return True once the job has reached the COMPLETED state."""
        try:
            self._update_state(job)
        except JobError as e:
            # We ignore these exceptions at this point and we simply mark the
            # job as unfinished.
            getlogger().debug('ignoring error during polling: %s' % e)
            return False

        return job.state == 'COMPLETED'

0 commit comments

Comments
 (0)