|
20 | 20 | import reframe.utility.osext as osext |
21 | 21 | from reframe.core.backends import register_scheduler |
22 | 22 | from reframe.core.config import settings |
23 | | -from reframe.core.exceptions import JobSchedulerError |
| 23 | +from reframe.core.exceptions import (JobError, JobSchedulerError) |
24 | 24 | from reframe.utility import seconds_to_hms |
25 | 25 |
|
26 | 26 |
|
|
40 | 40 | _run_strict = functools.partial(osext.run_command, check=True) |
41 | 41 |
|
42 | 42 |
|
#: Mapping of single-letter PBS/Torque job state codes (as reported in the
#: ``job_state`` field of ``qstat -f``) to ReFrame's symbolic state names.
JOB_STATES = dict(
    Q='QUEUED',
    H='HELD',
    R='RUNNING',
    E='EXITING',
    T='MOVED',
    W='WAITING',
    S='SUSPENDED',
    C='COMPLETED',
)
| 53 | + |
| 54 | + |
43 | 55 | class _PbsJob(sched.Job): |
44 | 56 | def __init__(self, *args, **kwargs): |
45 | 57 | super().__init__(*args, **kwargs) |
@@ -156,24 +168,104 @@ def finished(self, job): |
156 | 168 |
|
157 | 169 | return job.completed |
158 | 170 |
|
159 | | - def _poll_job(self, job): |
160 | | - if job is None: |
| 171 | + def _update_nodelist(self, job, nodespec): |
| 172 | + if job.nodelist is not None: |
161 | 173 | return |
162 | 174 |
|
163 | | - with osext.change_dir(job.workdir): |
164 | | - output_ready = (os.path.exists(job.stdout) and |
165 | | - os.path.exists(job.stderr)) |
| 175 | + job._nodelist = [x.split('/')[0] for x in nodespec.split('+')] |
| 176 | + job._nodelist.sort() |
166 | 177 |
|
167 | | - done = job.cancelled or output_ready |
168 | | - if done: |
169 | | - t_now = time.time() |
170 | | - if job.completion_time is None: |
171 | | - job._completion_time = t_now |
| 178 | + def poll(self, *jobs): |
| 179 | + if jobs: |
| 180 | + # Filter out non-jobs |
| 181 | + jobs = [job for job in jobs if job is not None] |
172 | 182 |
|
173 | | - time_from_finish = t_now - job.completion_time |
174 | | - if time_from_finish > PBS_OUTPUT_WRITEBACK_WAIT: |
175 | | - job._completed = True |
| 183 | + if not jobs: |
| 184 | + return |
| 185 | + |
| 186 | + completed = osext.run_command( |
| 187 | + f'qstat -f {" ".join(job.jobid for job in jobs)}' |
| 188 | + ) |
| 189 | + |
| 190 | + # Depending on the configuration, completed jobs will remain on the job |
| 191 | + # list for a limited time, or be removed upon completion. |
| 192 | + # If qstat cannot find any of the job IDs, it will return 153. |
| 193 | + # Otherwise, it will return with return code 0 and print information |
| 194 | + # only for the jobs it could find. |
| 195 | + if completed.returncode == 153: |
| 196 | + self.log('Return code is 153: jobids not known by scheduler, ' |
| 197 | + 'assuming all jobs completed') |
| 198 | + for job in jobs: |
| 199 | + job._state = 'COMPLETED' |
| 200 | + |
| 201 | + return |
| 202 | + |
| 203 | + if completed.returncode != 0: |
| 204 | + raise JobSchedulerError( |
| 205 | + f'qstat failed with exit code {completed.returncode} ' |
| 206 | + f'(standard error follows):\n{completed.stderr}' |
| 207 | + ) |
| 208 | + |
| 209 | + # Store information for each job separately |
| 210 | + jobinfo = {} |
| 211 | + for job_raw_info in completed.stdout.split('\n\n'): |
| 212 | + jobid_match = re.search( |
| 213 | + r'^Job Id:\s*(?P<jobid>\S+)', job_raw_info, re.MULTILINE |
| 214 | + ) |
| 215 | + if jobid_match: |
| 216 | + jobid = jobid_match.group('jobid') |
| 217 | + jobinfo[jobid] = job_raw_info |
176 | 218 |
|
177 | | - def poll(self, *jobs): |
178 | 219 | for job in jobs: |
179 | | - self._poll_job(job) |
| 220 | + if job.jobid not in jobinfo: |
| 221 | + self.log(f'Job {job.jobid} not known to scheduler, ' |
| 222 | + f'assuming job completed') |
| 223 | + job._state = 'COMPLETED' |
| 224 | + job._completed = True |
| 225 | + continue |
| 226 | + |
| 227 | + info = jobinfo[job.jobid] |
| 228 | + state_match = re.search( |
| 229 | + r'^\s*job_state = (?P<state>[A-Z])', info, re.MULTILINE |
| 230 | + ) |
| 231 | + if not state_match: |
| 232 | + self.log(f'Job state not found (job info follows):\n{info}') |
| 233 | + continue |
| 234 | + |
| 235 | + state = state_match.group('state') |
| 236 | + job._state = JOB_STATES[state] |
| 237 | + nodelist_match = re.search( |
| 238 | + r'exec_host = (?P<nodespec>[\S\t\n]+)', |
| 239 | + info, re.MULTILINE |
| 240 | + ) |
| 241 | + if nodelist_match: |
| 242 | + nodespec = nodelist_match.group('nodespec') |
| 243 | + nodespec = re.sub(r'[\n\t]*', '', nodespec) |
| 244 | + self._update_nodelist(job, nodespec) |
| 245 | + |
| 246 | + if job.state == 'COMPLETED': |
| 247 | + exitcode_match = re.search( |
| 248 | + r'^\s*exit_status = (?P<code>\d+)', |
| 249 | + info, re.MULTILINE, |
| 250 | + ) |
| 251 | + if exitcode_match: |
| 252 | + job._exitcode = int(exitcode_match.group('code')) |
| 253 | + |
| 254 | + # We report a job as finished only when its stdout/stderr are |
| 255 | + # written back to the working directory |
| 256 | + stdout = os.path.join(job.workdir, job.stdout) |
| 257 | + stderr = os.path.join(job.workdir, job.stderr) |
| 258 | + out_ready = os.path.exists(stdout) and os.path.exists(stderr) |
| 259 | + done = job.cancelled or out_ready |
| 260 | + if done: |
| 261 | + job._completed = True |
| 262 | + elif (job.state in ['QUEUED', 'HELD', 'WAITING'] and |
| 263 | + job.max_pending_time): |
| 264 | + if (time.time() - job.submit_time >= job.max_pending_time): |
| 265 | + self.cancel(job) |
| 266 | + job._exception = JobError('maximum pending time exceeded') |
| 267 | + |
| 268 | + |
@register_scheduler('torque')
class TorqueJobScheduler(PbsJobScheduler):
    '''Torque-flavoured PBS scheduler backend.

    Inherits all behaviour from :class:`PbsJobScheduler`; only the option
    used to request nodes and tasks differs (Torque expects the
    ``-l nodes=N:ppn=M`` form).
    '''
    TASKS_OPT = '-l nodes={num_nodes}:ppn={num_cpus_per_node}'
0 commit comments