Skip to content

Commit 0da4266

Browse files
author
Vasileios Karakasis
authored
Merge branch 'master' into test/slurm-constraint-error
2 parents ee362e3 + 3cc72fc commit 0da4266

File tree

10 files changed

+116
-14
lines changed

10 files changed

+116
-14
lines changed

docs/running.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -846,6 +846,14 @@ All handlers accept the following set of attributes (keys) in their configuratio
846846
It can be configured on a per test basis by overriding the :func:`info <reframe.core.pipeline.RegressionTest.info>` method of a specific regression test.
847847
- ``check_jobid``: Prints the job or process id of the job or process associated with the currently executing regression test.
848848
If a job or process is not yet created, ``-1`` will be printed.
849+
- ``check_job_completion_time``: *[new in 2.21]* The completion time of the job spawned by this regression test.
850+
This timestamp will be formatted according to ``datefmt`` (see below).
851+
The accuracy of the timestamp depends on the backend scheduler.
852+
The ``slurm`` scheduler backend relies on job accounting and returns the actual termination time of the job.
853+
The rest of the backends report as completion time the moment when the framework realizes that the spawned job has finished.
854+
In this case, the accuracy depends on the execution policy used.
855+
If tests are executed with the serial execution policy, this is close to the real completion time, but if the asynchronous execution policy is used, it can differ significantly.
856+
If the job completion time cannot be retrieved, ``None`` will be printed.
849857
- ``check_name``: Prints the name of the regression test on behalf of which ReFrame is currently executing.
850858
If ReFrame is not in the context of regression test, ``reframe`` will be printed.
851859
- ``check_num_tasks``: The number of tasks assigned to the regression test.

reframe/core/environments.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,22 @@ def emit_load_commands(*environs):
144144
return commands
145145

146146

147+
class temp_environment:
148+
'''Context manager to temporarily change the environment.'''
149+
150+
def __init__(self, modules=[], variables=[]):
151+
self._modules = modules
152+
self._variables = variables
153+
154+
def __enter__(self):
155+
new_env = Environment('_rfm_temp_env', self._modules, self._variables)
156+
self._environ_save, _ = load(new_env)
157+
return new_env
158+
159+
def __exit__(self, exc_type, exc_value, traceback):
160+
self._environ_save.restore()
161+
162+
147163
class ProgEnvironment(Environment):
148164
'''A class representing a programming environment.
149165

reframe/core/logging.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,7 @@ def __init__(self, logger=None, check=None):
367367
{
368368
'check_name': 'reframe',
369369
'check_jobid': '-1',
370+
'check_job_completion_time': None,
370371
'check_info': 'reframe',
371372
'check_system': None,
372373
'check_partition': None,
@@ -426,6 +427,13 @@ def _update_check_extras(self):
426427

427428
if self.check.job:
428429
self.extra['check_jobid'] = self.check.job.jobid
430+
if self.check.job.completion_time:
431+
# Use the logging handlers' date format to format
432+
# completion_time
433+
# NOTE: All handlers use the same date format
434+
fmt = self.logger.handlers[0].formatter.datefmt
435+
ct = self.check.job.completion_time.strftime(fmt)
436+
self.extra['check_job_completion_time'] = ct
429437

430438
def log_performance(self, level, tag, value, ref,
431439
low_thres, upper_thres, unit=None, *, msg=None):

reframe/core/schedulers/__init__.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#
44

55
import abc
6+
from datetime import datetime
67

78
import reframe.core.environments as env
89
import reframe.core.fields as fields
@@ -14,6 +15,10 @@
1415

1516

1617
class JobScheduler(abc.ABC):
18+
@abc.abstractmethod
19+
def completion_time(self, job):
20+
pass
21+
1722
@abc.abstractmethod
1823
def emit_preamble(self, job):
1924
pass
@@ -189,6 +194,7 @@ def __init__(self,
189194
self._script_filename = script_filename or '%s.sh' % name
190195
self._stdout = stdout or '%s.out' % name
191196
self._stderr = stderr or '%s.err' % name
197+
self._completion_time = None
192198

193199
# Backend scheduler related information
194200
self._sched_flex_alloc_nodes = sched_flex_alloc_nodes
@@ -259,6 +265,10 @@ def sched_account(self):
259265
def sched_exclusive_access(self):
260266
return self._sched_exclusive_access
261267

268+
@property
269+
def completion_time(self):
270+
return self.scheduler.completion_time(self) or self._completion_time
271+
262272
def prepare(self, commands, environs=None, **gen_opts):
263273
environs = environs or []
264274
if self.num_tasks <= 0:
@@ -321,7 +331,8 @@ def wait(self):
321331
if self.jobid is None:
322332
raise JobNotStartedError('cannot wait an unstarted job')
323333

324-
return self.scheduler.wait(self)
334+
self.scheduler.wait(self)
335+
self._completion_time = self._completion_time or datetime.now()
325336

326337
def cancel(self):
327338
if self.jobid is None:
@@ -333,7 +344,11 @@ def finished(self):
333344
if self.jobid is None:
334345
raise JobNotStartedError('cannot poll an unstarted job')
335346

336-
return self.scheduler.finished(self)
347+
done = self.scheduler.finished(self)
348+
if done:
349+
self._completion_time = self._completion_time or datetime.now()
350+
351+
return done
337352

338353

339354
class Node(abc.ABC):

reframe/core/schedulers/local.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ def __init__(self):
3030
self._f_stdout = None
3131
self._f_stderr = None
3232

33+
def completion_time(self, job):
34+
return None
35+
3336
def submit(self, job):
3437
# `chmod +x' first, because we will execute the script locally
3538
os.chmod(job.script_filename,

reframe/core/schedulers/pbs.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ def __init__(self):
3636
# Optional part of the job id refering to the PBS server
3737
self._pbs_server = None
3838

39+
def completion_time(self, job):
40+
return None
41+
3942
def _emit_lselect_option(self, job):
4043
num_tasks_per_node = job.num_tasks_per_node or 1
4144
num_cpus_per_task = job.num_cpus_per_task or 1

reframe/core/schedulers/slurm.py

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from contextlib import suppress
88
from datetime import datetime
99

10+
import reframe.core.environments as env
1011
import reframe.core.schedulers as sched
1112
import reframe.utility.os_ext as os_ext
1213
from reframe.core.config import settings
@@ -69,6 +70,13 @@ class SlurmJobScheduler(sched.JobScheduler):
6970
# standard job state polling using sacct.
7071
SACCT_SQUEUE_RATIO = 10
7172

73+
# This matches the format for both normal jobs as well as job arrays.
74+
# For job arrays the job_id has one of the following formats:
75+
# * <job_id>_<array_task_id>
76+
# * <job_id>_[<array_task_id_start>-<array_task_id_end>]
77+
# See (`Job Array Support<https://slurm.schedmd.com/job_array.html`__)
78+
_state_patt = r'\d+(?:_\d+|_\[\d+-\d+\])?'
79+
7280
def __init__(self):
7381
self._prefix = '#SBATCH'
7482

@@ -88,6 +96,32 @@ def __init__(self):
8896
self._is_cancelling = False
8997
self._is_job_array = None
9098
self._update_state_count = 0
99+
self._completion_time = None
100+
101+
def completion_time(self, job):
102+
if (self._completion_time or
103+
not slurm_state_completed(job.state)):
104+
return self._completion_time
105+
106+
with env.temp_environment(variables={'SLURM_TIME_FORMAT': 'standard'}):
107+
completed = os_ext.run_command(
108+
'sacct -S %s -P -j %s -o jobid,end' %
109+
(datetime.now().strftime('%F'), job.jobid),
110+
log=False
111+
)
112+
113+
state_match = list(re.finditer(
114+
r'^(?P<jobid>%s)\|(?P<end>\S+)' % self._state_patt,
115+
completed.stdout, re.MULTILINE))
116+
if not state_match:
117+
return None
118+
119+
self._completion_time = max(
120+
datetime.strptime(s.group('end'), '%Y-%m-%dT%H:%M:%S')
121+
for s in state_match
122+
)
123+
124+
return self._completion_time
91125

92126
def _format_option(self, var, option):
93127
if var is not None:
@@ -296,14 +330,9 @@ def _update_state(self, job):
296330
)
297331
self._update_state_count += 1
298332

299-
# This matches the format for both normal jobs as well as job arrays.
300-
# For job arrays the job_id has one of the following formats:
301-
# * <job_id>_<array_task_id>
302-
# * <job_id>_[<array_task_id_start>-<array_task_id_end>]
303-
# See (`Job Array Support<https://slurm.schedmd.com/job_array.html`__)
304333
state_match = list(re.finditer(
305-
r'^(?P<jobid>\d+(?:_\d+|_\[\d+-\d+\])?)\|(?P<state>\S+)([^\|]*)\|'
306-
r'(?P<exitcode>\d+)\:(?P<signal>\d+)\|(?P<nodespec>.*)',
334+
r'^(?P<jobid>%s)\|(?P<state>\S+)([^\|]*)\|(?P<exitcode>\d+)\:'
335+
r'(?P<signal>\d+)\|(?P<nodespec>.*)' % self._state_patt,
307336
completed.stdout, re.MULTILINE))
308337
if not state_match:
309338
getlogger().debug('job state not matched (stdout follows)\n%s' %
@@ -312,7 +341,7 @@ def _update_state(self, job):
312341

313342
# Join the states with ',' in case of job arrays
314343
job.state = ','.join(s.group('state') for s in state_match)
315-
if not self._update_state_count % SlurmJobScheduler.SACCT_SQUEUE_RATIO:
344+
if not self._update_state_count % self.SACCT_SQUEUE_RATIO:
316345
self._cancel_if_blocked(job)
317346

318347
if slurm_state_completed(job.state):
@@ -438,6 +467,9 @@ def __init__(self):
438467
self._squeue_delay = 2
439468
self._cancelled = False
440469

470+
def completion_time(self, job):
471+
return None
472+
441473
def submit(self, job):
442474
super().submit(job)
443475
self._submit_time = datetime.now()

reframe/utility/os_ext.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,10 @@
1919
SpawnedProcessTimeout)
2020

2121

22-
def run_command(cmd, check=False, timeout=None, shell=False):
22+
def run_command(cmd, check=False, timeout=None, shell=False, log=True):
2323
try:
24-
proc = run_command_async(cmd, shell=shell, start_new_session=True)
24+
proc = run_command_async(cmd, shell=shell, start_new_session=True,
25+
log=log)
2526
proc_stdout, proc_stderr = proc.communicate(timeout=timeout)
2627
except subprocess.TimeoutExpired as e:
2728
os.killpg(proc.pid, signal.SIGKILL)
@@ -65,11 +66,13 @@ def run_command_async(cmd,
6566
stdout=subprocess.PIPE,
6667
stderr=subprocess.PIPE,
6768
shell=False,
69+
log=True,
6870
**popen_args):
6971
# Import logger here to avoid unnecessary circular dependencies
70-
from reframe.core.logging import getlogger
72+
if log:
73+
from reframe.core.logging import getlogger
74+
getlogger().debug('executing OS command: ' + cmd)
7175

72-
getlogger().debug('executing OS command: ' + cmd)
7376
if not shell:
7477
cmd = shlex.split(cmd)
7578

unittests/test_environments.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,16 @@ def test_load_restore(self):
8888

8989
assert not self.environ.is_loaded
9090

91+
@fixtures.switch_to_user_runtime
92+
def test_temp_environment(self):
93+
self.setup_modules_system()
94+
with env.temp_environment(
95+
['testmod_foo'], {'_var0': 'val2', '_var3': 'val3'}
96+
) as environ:
97+
assert environ.is_loaded
98+
99+
assert not environ.is_loaded
100+
91101
@fixtures.switch_to_user_runtime
92102
def test_load_already_present(self):
93103
self.setup_modules_system()

unittests/test_launchers.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77

88

99
class FakeJobScheduler(JobScheduler):
10+
@property
11+
def completion_time(self, job):
12+
pass
13+
1014
def emit_preamble(self, job):
1115
pass
1216

0 commit comments

Comments
 (0)