Adding a unit test. Two outcomes are measured:

AlexTate · mr-c · commit a1ab910f41bf · 2024-11-13T18:06:25.000+01:00
1) Once a job has been terminated, all other parallel jobs should also terminate. In this test, the runtime of the workflow indicates whether the kill switch has been handled correctly. If the kill switch is successful then the workflow's runtime should be significantly shorter than sleep_time.

2) Outputs produced by a successful step should still be collected. In this case, the completed step is make_array.

To be frank, this test could be simplified by using a ToolTimeLimit requirement, rather than process_roulette.cwl, to make one of the parallel jobs fail.
diff --git a/tests/process_roulette.cwl b/tests/process_roulette.cwl
@@ -0,0 +1,35 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.2
+class: CommandLineTool
+
+
+doc: |
+  This tool selects a random process whose associated command matches 
+  search_str, terminates it, and reports the PID of the terminated process. 
+  The search_str supports regex. Example search_strs:
+  - "sleep"
+  - "sleep 33"
+  - "sleep [0-9]+"
+
+
+baseCommand: [ 'bash', '-c' ]  # the single argument below is the script handed to bash -c
+arguments:  # "\$(" is escaped so bash, not the CWL expression engine, sees the command substitution
+  - |
+    sleep $(inputs.delay)
+    pid=\$(ps -ef | grep '$(inputs.search_str)' | grep -v grep | awk '{print $2}' | shuf | head -n 1)
+    echo "$pid" | tee >(xargs kill -SIGTERM)
+inputs:
+  search_str:
+    type: string  # pattern given to grep to select candidate processes from the ps listing
+  delay:
+    type: int?  # passed to sleep before the process list is inspected
+    default: 3
+stdout: "pid.txt"  # tee's stdout copy of the chosen PID is captured in this file
+outputs:
+  pid:
+    type: string
+    outputBinding:
+      glob: pid.txt
+      loadContents: true
+      outputEval: $(self[0].contents)  # file contents as written, including echo's trailing newline
diff --git a/tests/test_parallel.py b/tests/test_parallel.py
@@ -1,9 +1,10 @@
 import json
+import time
 from pathlib import Path
 
 from cwltool.context import RuntimeContext
 from cwltool.executors import MultithreadedJobExecutor
-from cwltool.factory import Factory
+from cwltool.factory import Factory, WorkflowStatus
 
 from .util import get_data, needs_docker
 
@@ -29,3 +30,23 @@ def test_scattered_workflow() -> None:
     echo = factory.make(get_data(test_file))
     with open(get_data(job_file)) as job:
         assert echo(**json.load(job)) == {"out": ["foo one three", "foo two four"]}
+
+
+def test_on_error_kill() -> None:
+    """Check that on_error="kill" stops parallel jobs early but keeps finished outputs."""
+    runtime_context = RuntimeContext()
+    runtime_context.on_error = "kill"
+    factory = Factory(MultithreadedJobExecutor(), None, runtime_context)
+    ks_test = factory.make(get_data("tests/wf/on-error_kill.cwl"))
+
+    sleep_time = 33  # arbitrary, but a "sufficiently large" timeout
+    n_sleepers = 5  # arbitrary; passed explicitly so it cannot drift from the CWL default
+
+    start_time = time.monotonic()  # monotonic: unaffected by wall-clock adjustments
+    try:
+        ks_test(sleep_time=sleep_time, n_sleepers=n_sleepers)
+    except WorkflowStatus as e:
+        # Outputs of the step that completed (make_array) must still be collected.
+        assert e.out == {"instructed_sleep_times": [sleep_time] * n_sleepers}
+        assert time.monotonic() - start_time < sleep_time  # sleepers were cut short
+    else:
+        raise AssertionError("expected WorkflowStatus: the kill step should fail the workflow")
diff --git a/tests/wf/on-error_kill.cwl b/tests/wf/on-error_kill.cwl
@@ -0,0 +1,93 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.2
+class: Workflow
+requirements:
+  ScatterFeatureRequirement: {}  # scatter_step scatters over its "time" input
+  InlineJavascriptRequirement: {}  # JavaScript is used in make_array and in kill's search_str
+  StepInputExpressionRequirement: {}  # kill/search_str is computed with valueFrom
+
+
+doc: |
+  This workflow tests the optional argument --on-error kill.
+  MultithreadedJobExecutor() or --parallel should be used.
+  A successful run should:
+    1) Finish in (much) less than sleep_time seconds.
+    2) Return outputs produced by successful steps.
+
+
+inputs:
+  sleep_time: { type: int, default: 33 }  # seconds each sleep job is asked to sleep
+  n_sleepers: { type: int?, default: 5 }  # length of the array that scatter_step scatters over
+
+
+steps:
+  make_array:
+    doc: |
+      This step produces an array of sleep_time values to be used
+      as inputs for the scatter_step. The array also serves as the
+      workflow output which should be collected despite the 
+      kill switch triggered in the kill step below.
+    in: { sleep_time: sleep_time, n_sleepers: n_sleepers }
+    out: [ times ]
+    run:
+      class: ExpressionTool
+      inputs:
+          sleep_time: { type: int }
+          n_sleepers: { type: int }
+      outputs: { times: { type: "int[]" } }
+      expression: |
+        ${ return {"times": Array(inputs.n_sleepers).fill(inputs.sleep_time)} }
+
+  scatter_step:
+    doc: |
+      This step starts several parallel jobs that each sleep for
+      sleep_time seconds.
+    in:
+      time: make_array/times
+    scatter: time  # one "sleep <time>" job per array element
+    out: [ ]  # nothing to collect; the jobs only sleep
+    run:
+      class: CommandLineTool
+      baseCommand: sleep
+      inputs:
+        time: { type: int, inputBinding: { position: 1 } }
+      outputs: { }
+
+  kill:
+    doc: |
+      This step waits a few seconds and selects a random scatter_step job to kill. 
+      When `--on-error kill` is used, the runner should respond by terminating all 
+      remaining jobs and exiting. This means the workflow's overall runtime should be 
+      much less than max(sleep_time). The input force_upstream_order ensures that 
+      this step runs after make_array, and therefore roughly parallel to scatter_step.
+    in:
+      force_upstream_order: make_array/times  # ordering only; not an input of process_roulette.cwl
+      sleep_time: sleep_time  # only read by the search_str expression below
+      search_str:
+        valueFrom: $("sleep " + inputs.sleep_time)  # matches the command line of the scatter_step jobs
+    out: [ pid ]
+    run: ../process_roulette.cwl
+
+  dangling_step:
+    doc: |
+      This step should never run. It confirms that additional jobs aren't
+      submitted and allowed to run to completion after the kill switch has
+      been set. The input force_downstream_order ensures that this step runs
+      after the kill step.
+    in:
+      force_downstream_order: kill/pid  # ordering only; the tool below declares no such input
+      time: sleep_time
+    out: [ ]
+    run:
+      class: CommandLineTool
+      baseCommand: sleep
+      inputs:
+        time: { type: int, inputBinding: { position: 1 } }
+      outputs: { }
+
+
+outputs:
+  instructed_sleep_times:
+    type: int[]
+    outputSource: make_array/times  # comes from the step that finishes before the kill