|
| 1 | +# Copyright 2016-2022 Swiss National Supercomputing Centre (CSCS/ETH Zurich) |
| 2 | +# ReFrame Project Developers. See the top-level LICENSE file for details. |
| 3 | +# |
| 4 | +# SPDX-License-Identifier: BSD-3-Clause |
| 5 | + |
| 6 | +# |
| 7 | +# Flux-Framework backend |
| 8 | +# |
| 9 | +# - Initial version submitted by Vanessa Sochat, |
| 10 | +# Lawrence Livermore National Lab |
| 11 | +# |
| 12 | + |
| 13 | +import itertools |
| 14 | +import os |
| 15 | +import time |
| 16 | + |
| 17 | +import reframe.core.runtime as rt |
| 18 | +from reframe.core.backends import register_scheduler |
| 19 | +from reframe.core.exceptions import JobError |
| 20 | +from reframe.core.schedulers import JobScheduler, Job |
| 21 | + |
# Load the Flux Python bindings once, at module import time. A missing
# installation must not break importing this module: the reason is recorded
# in `error`, which is handed to `register_scheduler` further down.
error = None
try:
    import flux
    import flux.job
    from flux.job import JobspecV1
except ImportError:
    error = 'no flux Python bindings found'

# Job states treated as "not started yet" when the poller enforces a job's
# maximum pending time
WAITING_STATES = ('QUEUED', 'HELD', 'WAITING', 'PENDING')
| 33 | + |
| 34 | + |
class _FluxJob(Job):
    '''Job that carries the Flux jobspec used to run its script.'''

    def __init__(self, *args, **kwargs):
        '''Initialize the base job and build the matching Flux jobspec.

        All positional and keyword arguments are forwarded unchanged to the
        base ``Job`` constructor.
        '''

        super().__init__(*args, **kwargs)

        # The job script is always executed through bash.
        # NOTE(review): the task count is taken from `num_tasks_per_core`;
        # confirm this is intended rather than the job's overall task count.
        jobspec = JobspecV1.from_command(
            command=['/bin/bash', self.script_filename],
            num_tasks=self.num_tasks_per_core or 1,
            cores_per_task=self.num_cpus_per_task or 1,
        )

        # Flux must be given absolute paths, so resolve the working
        # directory once and anchor the output files to it
        abs_workdir = os.path.abspath(self.workdir)
        jobspec.cwd = abs_workdir
        jobspec.stdout = os.path.join(abs_workdir, self.stdout)
        jobspec.stderr = os.path.join(abs_workdir, self.stderr)

        # No time limit falls back to a duration of zero, which (being the
        # default) means unlimited
        jobspec.duration = self.time_limit or 0

        # The job gets a snapshot of the current process environment
        jobspec.environment = dict(os.environ)

        self.fluxjob = jobspec

        # Set to True by the scheduler once the job's future has resolved
        self._completed = False

    @property
    def completed(self):
        '''Whether the scheduler has marked this job as completed.'''
        return self._completed
| 62 | + |
| 63 | + |
@register_scheduler('flux', error=error)
class FluxJobScheduler(JobScheduler):
    '''Scheduler backend that submits and tracks jobs with a Flux executor.

    Each submitted job gets a `_flux_future` attribute, which is the handle
    that `cancel()` and `poll()` operate on.
    '''

    def __init__(self):
        '''Create the Flux executor and read the submit timeout option.'''

        self._fexecutor = flux.job.FluxExecutor()

        # NOTE(review): stored here but not used by any method of this class
        self._submit_timeout = rt.runtime().get_option(
            f'schedulers/@{self.registered_name}/job_submit_timeout'
        )

    def emit_preamble(self, job):
        '''Return the scheduler directives for the job script.

        Jobs are submitted as jobspec objects rather than through a script
        file, so there is nothing to emit and the list is always empty.
        '''

        return []

    def make_job(self, *args, **kwargs):
        '''Create a `_FluxJob`, forwarding all arguments to it.'''

        return _FluxJob(*args, **kwargs)

    def submit(self, job):
        '''Submit a job to the flux executor.

        Records the job id (as a string), the submission time and the
        executor future on the job.
        '''

        future = self._fexecutor.submit(job.fluxjob)
        job._jobid = str(future.jobid())
        job._submit_time = time.time()
        job._flux_future = future

    def cancel(self, job):
        '''Cancel a Flux job.

        The future is asked to cancel first. Per the original note, a future
        cannot be cancelled once the job is running or completed; in that
        case the cancellation is requested from Flux directly.
        '''

        if job._flux_future.cancel():
            return

        # This will raise JobException with event=cancel (on poll)
        flux.job.cancel(flux.Flux(), job._flux_future.jobid())

    def poll(self, *jobs):
        '''Poll Flux jobs and update their states.

        `None` entries in `jobs` are ignored. A job whose future is done is
        marked completed with state 'COMPLETED', 'CANCELLED' or 'SUSPENDED'.
        A job that is still waiting past its maximum pending time is
        cancelled and gets a `JobError` attached, which `finished()` raises.
        Any other job is marked 'RUNNING'.
        '''

        for job in (j for j in jobs if j is not None):
            if not job._flux_future.done():
                # NOTE(review): nothing in this class assigns one of the
                # WAITING_STATES, so this branch relies on the state being
                # set elsewhere; confirm.
                if job.state in WAITING_STATES and job.max_pending_time:
                    pending_for = time.time() - job.submit_time
                    if pending_for >= job.max_pending_time:
                        self.cancel(job)
                        job._exception = JobError(
                            'maximum pending time exceeded', job.jobid
                        )
                else:
                    # Otherwise, we are still running
                    job._state = 'RUNNING'

                continue

            try:
                # The future is already done, so a zero timeout is enough;
                # the exit code tells whether the job was successful
                exit_code = job._flux_future.result(0)
            except flux.job.JobException:
                # Currently the only state we see is cancelled here
                self.log(f'Job {job.jobid} was likely cancelled.')
                job._state = 'CANCELLED'
            except RuntimeError:
                # Assume some runtime issue (suspended)
                self.log(f'Job {job.jobid} was likely suspended.')
                job._state = 'SUSPENDED'
            else:
                # The job finished, but possibly with a nonzero exit code
                job._state = 'COMPLETED'
                if exit_code != 0:
                    self.log(
                        f'Job {job.jobid} did not finish successfully'
                    )

            # Whatever the outcome, the future has resolved
            job._completed = True

    def allnodes(self):
        '''Not supported by this backend; always raises.'''

        raise NotImplementedError('flux backend does not support node listing')

    def filternodes(self, job, nodes):
        '''Not supported by this backend; always raises.'''

        raise NotImplementedError(
            'flux backend does not support node filtering'
        )

    def wait(self, job):
        '''Wait until a job is finished.

        Polls the job, sleeping 1, 2 and 3 seconds cyclically between polls.
        Raises whatever `finished()` raises for this job.
        '''

        delays = itertools.cycle([1, 2, 3])
        while not self.finished(job):
            self.poll(job)

            # Re-check right after polling so that we do not sleep once the
            # job is already known to be finished (or to have failed)
            if self.finished(job):
                break

            time.sleep(next(delays))

    def finished(self, job):
        '''Return whether the job has completed.

        Raises the exception attached to the job, if there is one.
        '''

        failure = job.exception
        if failure:
            raise failure

        return job.completed
0 commit comments