JobBase._execute() previously skipped some important post-job actions when exiting due to the kill switch. Those actions have been placed under a finally block so that they are executed by both the "switching" job and the "responding" jobs.

AlexTate · AlexTate · commit abc4c3fdead8 · 2024-08-03T07:11:53.000-07:00
However, some of these post-job actions added a lot of redundant and unhelpful terminal output when handling jobs killed due to the kill switch. The verbose output obscured the error's cause, which isn't helpful. Two new process statuses have been added in order to better handle the event:
- indeterminate: a default value for processStatus.
- killed: the job was killed due to the kill switch being set.

This approach also means that partial outputs aren't collected from jobs that have been killed.
diff --git a/cwltool/job.py b/cwltool/job.py
@@ -298,6 +298,7 @@ def _execute(
                     "{}".format(runtimeContext)
                 )
         outputs: CWLObjectType = {}
+        processStatus = "indeterminate"
         try:
             stdin_path = None
             if self.stdin is not None:
@@ -369,6 +370,7 @@ def stderr_stdout_log_path(
 
             if processStatus != "success":
                 if runtimeContext.kill_switch.is_set():
+                    processStatus = "killed"
                     return
                 elif rcode < 0:
                     _logger.warning(
@@ -412,62 +414,64 @@ def stderr_stdout_log_path(
             _logger.error("[job %s] Job error:\n%s", self.name, str(err))
             processStatus = "permanentFail"
         except WorkflowKillSwitch:
+            processStatus = "permanentFail"
             raise
         except Exception:
             _logger.exception("Exception while running job")
             processStatus = "permanentFail"
-        if (
-            runtimeContext.research_obj is not None
-            and self.prov_obj is not None
-            and runtimeContext.process_run_id is not None
-        ):
-            # creating entities for the outputs produced by each step (in the provenance document)
-            self.prov_obj.record_process_end(
-                str(self.name),
-                runtimeContext.process_run_id,
-                outputs,
-                datetime.datetime.now(),
-            )
-        if processStatus != "success":
-            _logger.warning("[job %s] completed %s", self.name, processStatus)
-        else:
-            _logger.info("[job %s] completed %s", self.name, processStatus)
-
-        if _logger.isEnabledFor(logging.DEBUG):
-            _logger.debug("[job %s] outputs %s", self.name, json_dumps(outputs, indent=4))
-
-        if self.generatemapper is not None and runtimeContext.secret_store is not None:
-            # Delete any runtime-generated files containing secrets.
-            for _, p in self.generatemapper.items():
-                if p.type == "CreateFile":
-                    if runtimeContext.secret_store.has_secret(p.resolved):
-                        host_outdir = self.outdir
-                        container_outdir = self.builder.outdir
-                        host_outdir_tgt = p.target
-                        if p.target.startswith(container_outdir + "/"):
-                            host_outdir_tgt = os.path.join(
-                                host_outdir, p.target[len(container_outdir) + 1 :]
-                            )
-                        os.remove(host_outdir_tgt)
-
-        if runtimeContext.workflow_eval_lock is None:
-            raise WorkflowException("runtimeContext.workflow_eval_lock must not be None")
-
-        if self.output_callback:
-            with runtimeContext.workflow_eval_lock:
-                self.output_callback(outputs, processStatus)
-
-        if runtimeContext.rm_tmpdir and self.stagedir is not None and os.path.exists(self.stagedir):
-            _logger.debug(
-                "[job %s] Removing input staging directory %s",
-                self.name,
-                self.stagedir,
-            )
-            shutil.rmtree(self.stagedir, True)
+        finally:
+            if (
+                runtimeContext.research_obj is not None
+                and self.prov_obj is not None
+                and runtimeContext.process_run_id is not None
+            ):
+                # creating entities for the outputs produced by each step (in the provenance document)
+                self.prov_obj.record_process_end(
+                    str(self.name),
+                    runtimeContext.process_run_id,
+                    outputs,
+                    datetime.datetime.now(),
+                )
+            if processStatus != "success":
+                _logger.warning("[job %s] completed %s", self.name, processStatus)
+            else:
+                _logger.info("[job %s] completed %s", self.name, processStatus)
+
+            if _logger.isEnabledFor(logging.DEBUG):
+                _logger.debug("[job %s] outputs %s", self.name, json_dumps(outputs, indent=4))
+
+            if self.generatemapper is not None and runtimeContext.secret_store is not None:
+                # Delete any runtime-generated files containing secrets.
+                for _, p in self.generatemapper.items():
+                    if p.type == "CreateFile":
+                        if runtimeContext.secret_store.has_secret(p.resolved):
+                            host_outdir = self.outdir
+                            container_outdir = self.builder.outdir
+                            host_outdir_tgt = p.target
+                            if p.target.startswith(container_outdir + "/"):
+                                host_outdir_tgt = os.path.join(
+                                    host_outdir, p.target[len(container_outdir) + 1 :]
+                                )
+                            os.remove(host_outdir_tgt)
+
+            if runtimeContext.workflow_eval_lock is None:
+                raise WorkflowException("runtimeContext.workflow_eval_lock must not be None")
+
+            if self.output_callback:
+                with runtimeContext.workflow_eval_lock:
+                    self.output_callback(outputs, processStatus)
+
+            if runtimeContext.rm_tmpdir and self.stagedir is not None and os.path.exists(self.stagedir):
+                _logger.debug(
+                    "[job %s] Removing input staging directory %s",
+                    self.name,
+                    self.stagedir,
+                )
+                shutil.rmtree(self.stagedir, True)
 
-        if runtimeContext.rm_tmpdir:
-            _logger.debug("[job %s] Removing temporary directory %s", self.name, self.tmpdir)
-            shutil.rmtree(self.tmpdir, True)
+            if runtimeContext.rm_tmpdir:
+                _logger.debug("[job %s] Removing temporary directory %s", self.name, self.tmpdir)
+                shutil.rmtree(self.tmpdir, True)
 
     @abstractmethod
     def _required_env(self) -> Dict[str, str]:
diff --git a/cwltool/workflow.py b/cwltool/workflow.py
@@ -409,12 +409,13 @@ def receive_output(
         processStatus: str,
     ) -> None:
         output = {}
-        for i in self.tool["outputs"]:
-            field = shortname(i["id"])
-            if field in jobout:
-                output[i["id"]] = jobout[field]
-            else:
-                processStatus = "permanentFail"
+        if processStatus != "killed":
+            for i in self.tool["outputs"]:
+                field = shortname(i["id"])
+                if field in jobout:
+                    output[i["id"]] = jobout[field]
+                else:
+                    processStatus = "permanentFail"
         output_callback(output, processStatus)
 
     def job(
diff --git a/cwltool/workflow_job.py b/cwltool/workflow_job.py
@@ -552,24 +552,25 @@ def receive_output(
         jobout: CWLObjectType,
         processStatus: str,
     ) -> None:
-        for i in outputparms:
-            if "id" in i:
-                iid = cast(str, i["id"])
-                if iid in jobout:
-                    self.state[iid] = WorkflowStateItem(i, jobout[iid], processStatus)
-                else:
-                    _logger.error("[%s] Output is missing expected field %s", step.name, iid)
-                    processStatus = "permanentFail"
         if _logger.isEnabledFor(logging.DEBUG):
             _logger.debug("[%s] produced output %s", step.name, json_dumps(jobout, indent=4))
+        if processStatus != "killed":
+            for i in outputparms:
+                if "id" in i:
+                    iid = cast(str, i["id"])
+                    if iid in jobout:
+                        self.state[iid] = WorkflowStateItem(i, jobout[iid], processStatus)
+                    else:
+                        _logger.error("[%s] Output is missing expected field %s", step.name, iid)
+                        processStatus = "permanentFail"
 
-        if processStatus not in ("success", "skipped"):
-            if self.processStatus != "permanentFail":
-                self.processStatus = processStatus
+            if processStatus not in ("success", "skipped"):
+                if self.processStatus != "permanentFail":
+                    self.processStatus = processStatus
 
-            _logger.warning("[%s] completed %s", step.name, processStatus)
-        else:
-            _logger.info("[%s] completed %s", step.name, processStatus)
+                _logger.warning("[%s] completed %s", step.name, processStatus)
+            else:
+                _logger.info("[%s] completed %s", step.name, processStatus)
 
         step.completed = True
         # Release the iterable related to this step to