Skip to content

Commit 0a6decc

Browse files
author
Vasileios Karakasis
authored
Merge pull request #2519 from ekouts/bugfix/pbs_output_files
[bugfix] Check that PBS output is written back to working directory before setting the job as completed
2 parents 681440b + 135ed0d commit 0a6decc

File tree

1 file changed

+17
-10
lines changed

1 file changed

+17
-10
lines changed

reframe/core/schedulers/pbs.py

Lines changed: 17 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -181,6 +181,13 @@ def _update_nodelist(self, job, nodespec):
181181
job._nodelist.sort()
182182

183183
def poll(self, *jobs):
184+
def output_ready(job):
185+
# We report a job as finished only when its stdout/stderr are
186+
# written back to the working directory
187+
stdout = os.path.join(job.workdir, job.stdout)
188+
stderr = os.path.join(job.workdir, job.stderr)
189+
return os.path.exists(stdout) and os.path.exists(stderr)
190+
184191
if jobs:
185192
# Filter out non-jobs
186193
jobs = [job for job in jobs if job is not None]
@@ -198,11 +205,12 @@ def poll(self, *jobs):
198205
# Otherwise, it will return with return code 0 and print information
199206
# only for the jobs it could find.
200207
if completed.returncode in (153, 35):
201-
self.log(f'Return code is {completed.returncode}: '
202-
f'assuming all jobs completed')
208+
self.log(f'Return code is {completed.returncode}')
203209
for job in jobs:
204210
job._state = 'COMPLETED'
205-
job._completed = True
211+
if job.cancelled or output_ready(job):
212+
self.log(f'Assuming job {job.jobid} completed')
213+
job._completed = True
206214

207215
return
208216

@@ -224,10 +232,12 @@ def poll(self, *jobs):
224232

225233
for job in jobs:
226234
if job.jobid not in jobinfo:
227-
self.log(f'Job {job.jobid} not known to scheduler, '
228-
f'assuming job completed')
235+
self.log(f'Job {job.jobid} not known to scheduler')
229236
job._state = 'COMPLETED'
230-
job._completed = True
237+
if job.cancelled or output_ready(job):
238+
self.log(f'Assuming job {job.jobid} completed')
239+
job._completed = True
240+
231241
continue
232242

233243
info = jobinfo[job.jobid]
@@ -259,10 +269,7 @@ def poll(self, *jobs):
259269

260270
# We report a job as finished only when its stdout/stderr are
261271
# written back to the working directory
262-
stdout = os.path.join(job.workdir, job.stdout)
263-
stderr = os.path.join(job.workdir, job.stderr)
264-
out_ready = os.path.exists(stdout) and os.path.exists(stderr)
265-
done = job.cancelled or out_ready
272+
done = job.cancelled or output_ready(job)
266273
if done:
267274
job._completed = True
268275
elif (job.state in ['QUEUED', 'HELD', 'WAITING'] and

0 commit comments

Comments
 (0)