Skip to content

Commit 4e7678c

Browse files
committed
Simplified the error logic in launcher/batch scripts.
Since the addition of staging, errors can reasonably occur outside of the launcher as well as inside the launcher and pre- and post- launch scripts. So we treat the output of the whole script (including launcher) as potential indicator of an error.
1 parent 94a2c0e commit 4e7678c

File tree

19 files changed

+33
-118
lines changed

19 files changed

+33
-118
lines changed

src/psij/executors/batch/batch_scheduler_executor.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -573,9 +573,8 @@ def _read_aux_files(self, job: Job, status: JobStatus) -> None:
573573
# already present
574574
out = self._read_aux_file(job, '.out')
575575
if out:
576-
launcher = self._get_launcher_from_job(job)
577-
if launcher.is_launcher_failure(out):
578-
status.message = launcher.get_launcher_failure_message(out)
576+
if '_PSIJ_SCRIPT_DONE' not in out:
577+
status.message = out
579578
logger.debug('Output from launcher: %s', status.message)
580579
else:
581580
self._delete_aux_file(job, '.out')

src/psij/executors/batch/cobalt/cobalt.mustache

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,12 @@ only results in empty files that are not cleaned up}}
4646
#COBALT -e /dev/null
4747
#COBALT -o /dev/null
4848

49+
{{!redirect output here instead of through #COBALT directive since COBALT_JOB_ID is not available
50+
when the directives are evaluated; the reason for using the job id in the first place being the
51+
same as for the exit code file.}}
52+
exec &>> "{{psij.script_dir}}/$COBALT_JOBID.out"
53+
54+
4955
{{> batch_lib}}
5056

5157
{{!like PBS, this is also cheap and there is not need to check setting}}
@@ -55,11 +61,6 @@ export PSIJ_NODEFILE
5561
{{> stagein}}
5662
update_status ACTIVE
5763

58-
{{!redirect output here instead of through #COBALT directive since COBALT_JOB_ID is not available
59-
when the directives are evaluated; the reason for using the job id in the first place being the
60-
same as for the exit code file.}}
61-
exec &>> "{{psij.script_dir}}/$COBALT_JOBID.out"
62-
6364
{{#psij.launch_command}}{{.}} {{/psij.launch_command}}
6465
_PSIJ_JOB_EC=$?
6566

@@ -68,3 +69,4 @@ _PSIJ_JOB_EC=$?
6869

6970
{{!we redirect to a file tied to the native ID so that we can reach the file with attach().}}
7071
echo $_PSIJ_JOB_EC > "{{psij.script_dir}}/$COBALT_JOBID.ec"
72+
echo "_PSIJ_SCRIPT_DONE"

src/psij/executors/batch/lsf/lsf.mustache

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,12 @@ only results in empty files that are not cleaned up}}
7171
#BSUB -e /dev/null
7272
#BSUB -o /dev/null
7373

74+
{{!redirect output here instead of through #BSUB directive since LSB_JOBID is not available
75+
when the directives are evaluated; the reason for using the job id in the first place being the
76+
same as for the exit code file.}}
77+
exec &>> "{{psij.script_dir}}/$LSB_JOBID.out"
78+
79+
7480
{{> batch_lib}}
7581

7682
PSIJ_NODEFILE="$LSB_HOSTS"
@@ -79,11 +85,6 @@ export PSIJ_NODEFILE
7985
{{> stagein}}
8086
update_status ACTIVE
8187

82-
{{!redirect output here instead of through #BSUB directive since LSB_JOBID is not available
83-
when the directives are evaluated; the reason for using the job id in the first place being the
84-
same as for the exit code file.}}
85-
exec &>> "{{psij.script_dir}}/$LSB_JOBID.out"
86-
8788
{{#psij.launch_command}}{{.}} {{/psij.launch_command}}
8889
_PSIJ_JOB_EC=$?
8990

@@ -92,3 +93,4 @@ _PSIJ_JOB_EC=$?
9293

9394
{{!we redirect to a file tied to the native ID so that we can reach the file with attach().}}
9495
echo $_PSIJ_JOB_EC > "{{psij.script_dir}}/$LSB_JOBID.ec"
96+
echo "_PSIJ_SCRIPT_DONE"

src/psij/executors/batch/pbs/pbs_classic.mustache

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ only results in empty files that are not cleaned up}}
5555
#PBS -v {{name}}={{value}}
5656
{{/env}}
5757

58+
exec &>> "{{psij.script_dir}}/$PBS_JOBID.out"
59+
5860
{{> batch_lib}}
5961

6062
PSIJ_NODEFILE="$PBS_NODEFILE"
@@ -67,8 +69,6 @@ cd "{{.}}"
6769
{{> stagein}}
6870
update_status ACTIVE
6971

70-
exec &>> "{{psij.script_dir}}/$PBS_JOBID.out"
71-
7272
{{#psij.launch_command}}{{.}} {{/psij.launch_command}}
7373
_PSIJ_JOB_EC=$?
7474

@@ -77,3 +77,4 @@ _PSIJ_JOB_EC=$?
7777

7878
{{!we redirect to a file tied to the native ID so that we can reach the file with attach().}}
7979
echo $_PSIJ_JOB_EC > "{{psij.script_dir}}/$PBS_JOBID.ec"
80+
echo "_PSIJ_SCRIPT_DONE"

src/psij/executors/batch/pbs/pbspro.mustache

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ only results in empty files that are not cleaned up}}
5151
#PBS -e /dev/null
5252
#PBS -o /dev/null
5353

54+
exec &>> "{{psij.script_dir}}/$PBS_JOBID.out"
55+
5456
{{#job.spec.inherit_environment}}
5557
#PBS -V
5658
{{/job.spec.inherit_environment}}
@@ -70,8 +72,6 @@ cd "{{.}}"
7072
{{> stagein}}
7173
update_status ACTIVE
7274

73-
exec &>> "{{psij.script_dir}}/$PBS_JOBID.out"
74-
7575
{{#psij.launch_command}}{{.}} {{/psij.launch_command}}
7676
_PSIJ_JOB_EC=$?
7777

@@ -80,3 +80,4 @@ _PSIJ_JOB_EC=$?
8080

8181
{{!we redirect to a file tied to the native ID so that we can reach the file with attach().}}
8282
echo "$?" > "{{psij.script_dir}}/$PBS_JOBID.ec"
83+
echo "_PSIJ_SCRIPT_DONE"

src/psij/executors/batch/slurm/slurm.mustache

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,12 @@ _PSIJ_PPN={{.}}
8888
{{/processes_per_node}}
8989
{{/job.spec.resources}}
9090

91+
{{!redirect output here instead of through #SBATCH directive since SLURM_JOB_ID is not available
92+
when the directives are evaluated; the reason for using the job id in the first place being the
93+
same as for the exit code file.}}
94+
exec &>> "{{psij.script_dir}}/$SLURM_JOB_ID.out"
95+
96+
9197
{{> batch_lib}}
9298

9399
_PSIJ_NC=`scontrol show hostnames | wc -l`
@@ -111,11 +117,6 @@ export PSIJ_NODEFILE
111117
{{> stagein}}
112118
update_status ACTIVE
113119

114-
{{!redirect output here instead of through #SBATCH directive since SLURM_JOB_ID is not available
115-
when the directives are evaluated; the reason for using the job id in the first place being the
116-
same as for the exit code file.}}
117-
exec &>> "{{psij.script_dir}}/$SLURM_JOB_ID.out"
118-
119120
{{#psij.launch_command}}{{.}} {{/psij.launch_command}}
120121
_PSIJ_JOB_EC=$?
121122

@@ -124,3 +125,4 @@ _PSIJ_JOB_EC=$?
124125

125126
{{!we redirect to a file tied to the native ID so that we can reach the file with attach().}}
126127
echo $_PSIJ_JOB_EC > "{{psij.script_dir}}/$SLURM_JOB_ID.ec"
128+
echo "_PSIJ_SCRIPT_DONE"

src/psij/executors/local.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -335,8 +335,8 @@ def _process_done(self, p: _ProcessEntry) -> None:
335335
# the exit code of the launcher is the exit code of the job, we must use a different
336336
# mechanism to distinguish between job errors and launcher errors. So we delegate to
337337
# the launcher implementation to figure out if the error belongs to the job or not
338-
if p.launcher and p.out and p.launcher.is_launcher_failure(p.out):
339-
message = p.launcher.get_launcher_failure_message(p.out)
338+
if p.out and '_PSIJ_SCRIPT_DONE' not in p.out:
339+
message = p.out
340340
state = JobState.FAILED
341341

342342
# We need to ensure that the status updater has processed all updates that

src/psij/executors/local/local.mustache

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,5 @@ set -e
2222
{{> stageout}}
2323
{{> cleanup}}
2424

25+
echo "_PSIJ_SCRIPT_DONE"
2526
exit "$_PSIJ_JOB_EC"

src/psij/job_launcher.py

Lines changed: 0 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -34,43 +34,6 @@ def get_launch_command(self, job: Job) -> List[str]:
3434
"""
3535
pass
3636

37-
@abstractmethod
38-
def is_launcher_failure(self, output: str) -> bool:
39-
"""
40-
Determines whether the launcher invocation output contains a launcher failure or not.
41-
42-
Parameters
43-
----------
44-
output
45-
The output (combined stdout/stderr) from an invocation of the launcher command
46-
47-
Returns
48-
-------
49-
Returns `True` if the `output` parameter contains a string that represents a launncher
50-
failure.
51-
"""
52-
pass
53-
54-
@abstractmethod
55-
def get_launcher_failure_message(self, output: str) -> str:
56-
"""
57-
Extracts the launcher error message from the output of this launcher's invocation.
58-
59-
It is understood that the value of the `output` parameter is such that
60-
:meth:`is_launcher_failure` returns `True` on it.
61-
62-
Parameters
63-
----------
64-
output
65-
The output (combined stdout/stderr) from an invocation of the launcher command.
66-
67-
Returns
68-
-------
69-
A string representing the part of the launcher output that describes the launcher
70-
error.
71-
"""
72-
pass
73-
7437
@staticmethod
7538
def get_instance(name: str, version_constraint: Optional[str] = None,
7639
config: Optional[JobExecutorConfig] = None) -> 'Launcher':

src/psij/launcher.py

Lines changed: 0 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -36,44 +36,6 @@ def get_launch_command(self, job: Job) -> List[str]:
3636
"""
3737
pass
3838

39-
@abstractmethod
40-
def is_launcher_failure(self, output: str) -> bool:
41-
"""
42-
Determines whether the launcher invocation output contains a launcher failure or not.
43-
44-
Parameters
45-
----------
46-
output
47-
The output (combined stdout/stderr) from an invocation of the launcher command
48-
49-
Returns
50-
-------
51-
Returns `True` if the output of the launcher indicates that it has exited with a
52-
non-zero exit code due to an error occurring in the launcher.
53-
54-
"""
55-
pass
56-
57-
@abstractmethod
58-
def get_launcher_failure_message(self, output: str) -> str:
59-
"""
60-
Extracts the launcher error message from the output of this launcher's invocation.
61-
62-
It is understood that the output is such that
63-
:func:`~psij.launcher.Launcher.is_launcher_failure` returns `True` on it.
64-
65-
Parameters
66-
----------
67-
output
68-
The output (combined stdout/stderr) from an invocation of the launcher command.
69-
70-
Returns
71-
-------
72-
A string representing the part of the launcher output that describes the launcher
73-
error.
74-
"""
75-
pass
76-
7739
@staticmethod
7840
def get_instance(name: str, version_constraint: Optional[str] = None,
7941
config: Optional[JobExecutorConfig] = None) -> 'Launcher':

0 commit comments

Comments
 (0)