Skip to content

Commit bec91cf

Browse files
authored
Improved error handling in the PBSProProvider (#3853)
# Description

The `PBSProProvider` currently logs submit failures and returns `None` instead of raising an exception as expected. As a result, when the provider is misconfigured, Parsl repeatedly submits jobs while giving the user little feedback.

This PR addresses this issue by raising a `SubmitException` which captures the stdout/stderr from the `qsub` command. The relevant info trickles up via the `TooManyJobFailuresError`, as the examples below show.

Error raised when the `PBSProProvider` is misconfigured with a bad queue name:

```
parsl.jobs.errors.TooManyJobFailuresError: Error 1:
Failed to start block 0: Cannot launch job parsl.HighThroughputExecutor.block-0.1746723134.8832731: Submit command 'qsub -q debug5 -A AuroraGPT /home/yadunand/parsl/parsl/providers/pbspro/runinfo/001/submit_scripts/parsl.HighThroughputExecutor.block-0.1746723134.8832731' failed; recode=None, stdout=, stderr=qsub: Unknown queue
Error 2:
Failed to start block 1: Cannot launch job parsl.HighThroughputExecutor.block-1.1746723135.5695264: Submit command 'qsub -q debug5 -A AuroraGPT /home/yadunand/parsl/parsl/providers/pbspro/runinfo/001/submit_scripts/parsl.HighThroughputExecutor.block-1.1746723135.5695264' failed; recode=None, stdout=, stderr=qsub: Unknown queue
```

Error raised when the account is incorrect:

```
parsl.jobs.errors.TooManyJobFailuresError: Error 1:
Failed to start block 0: Cannot launch job parsl.HighThroughputExecutor.block-0.1746723277.4842422: Submit command 'qsub -q debug -A NonExistentAccount /home/yadunand/parsl/parsl/providers/pbspro/runinfo/002/submit_scripts/parsl.HighThroughputExecutor.block-0.1746723277.4842422' failed; recode=None, stdout=, stderr=qsub: Request rejected.
Reason: not found: Project NonExistentAccount
Error 2:
Failed to start block 1: Cannot launch job parsl.HighThroughputExecutor.block-1.1746723277.8245358: Submit command 'qsub -q debug -A NonExistentAccount /home/yadunand/parsl/parsl/providers/pbspro/runinfo/002/submit_scripts/parsl.HighThroughputExecutor.block-1.1746723277.8245358' failed; recode=None, stdout=, stderr=qsub: Request rejected.
Reason: not found: Project NonExistentAccount
```

# Changed Behaviour

The `PBSProProvider` will now raise an exception when it is misconfigured.

# Fixes

Fixes #3793

## Type of change

Choose which options apply, and delete the ones which do not apply.

- Bug fix
1 parent 6e2ebb9 commit bec91cf

File tree

1 file changed

+22
-4
lines changed

1 file changed

+22
-4
lines changed

parsl/providers/pbspro/pbspro.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from parsl.jobs.states import JobState, JobStatus
77
from parsl.launchers import SingleNodeLauncher
8+
from parsl.providers.errors import SubmitException
89
from parsl.providers.pbspro.template import template_string
910
from parsl.providers.torque.torque import TorqueProvider, translate_table
1011

@@ -97,6 +98,14 @@ def _status(self):
9798

9899
retcode, stdout, stderr = self.execute_wait("qstat -f -F json {0}".format(job_id_list))
99100

101+
# If qstat failed do not update job state
102+
if retcode != 0:
103+
logger.warning("qstat failed with retcode:%s STDOUT:%s STDERR:%s",
104+
retcode,
105+
stdout.strip(),
106+
stderr.strip())
107+
return
108+
100109
job_statuses = json.loads(stdout)
101110

102111
if 'Jobs' in job_statuses:
@@ -198,10 +207,19 @@ def submit(self, command, tasks_per_node, job_name="parsl"):
198207
'job_stderr_path': job_stderr_path,
199208
}
200209
else:
201-
message = "Command '{}' failed with return code {}".format(launch_cmd, retcode)
202-
if (stdout is not None) and (stderr is not None):
203-
message += "\nstderr:{}\nstdout{}".format(stderr.strip(), stdout.strip())
204-
logger.error(message)
210+
message = f"Submit command '{launch_cmd}' failed"
211+
logger.error(
212+
f"{message}\n"
213+
f" Return code: {retcode}\n"
214+
f" STDOUT: {stdout.strip()}\n"
215+
f" STDERR: {stderr.strip()}"
216+
)
217+
raise SubmitException(
218+
job_name=job_name,
219+
message=message,
220+
stdout=stdout,
221+
stderr=stderr,
222+
)
205223

206224
return job_id
207225

0 commit comments

Comments
 (0)