2 changes: 1 addition & 1 deletion README.rst
@@ -38,7 +38,7 @@ The project's written in Python and uses `Poetry`_ for dependency and package
management. We also use `pre-commit`_ to manage our pre-commit hooks, which
rely on `black`_, `mypy`_, `pylint`_, amongst others.

From within a VS Code `devcontainer`_] environment (recommended)::
From within a VS Code `devcontainer`_ environment (recommended)::

    poetry install --with dev --sync
    pre-commit install -t commit-msg -t pre-commit
543 changes: 306 additions & 237 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -14,7 +14,7 @@ packages = [
[tool.poetry.dependencies]
python = "^3.12"
im-protobuf = "^8.2.0"
im-data-manager-job-decoder = "^2.1.0"
im-data-manager-job-decoder = "^2.5.0"
jsonschema = "^4.21.1"
pyyaml = ">= 5.3.1, < 7.0"

27 changes: 21 additions & 6 deletions tests/instance_launcher.py
@@ -68,18 +68,32 @@ def __init__(
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)

    def launch(self, launch_parameters: LaunchParameters) -> LaunchResult:
    def launch(self, *, launch_parameters: LaunchParameters) -> LaunchResult:
        assert launch_parameters
        assert launch_parameters.project_id == TEST_PROJECT_ID
        assert launch_parameters.specification
        assert isinstance(launch_parameters.specification, dict)

        os.makedirs(EXECUTION_DIRECTORY, exist_ok=True)

        # Create an Instance record (and dummy Task ID)
        response = self._api_adapter.create_instance(
            running_workflow_step_id=launch_parameters.running_workflow_step_id
        # Create a running workflow step
        assert launch_parameters.running_workflow_id
        assert launch_parameters.step_name
        response, _ = self._api_adapter.create_running_workflow_step(
            running_workflow_id=launch_parameters.running_workflow_id,
            step=launch_parameters.step_name,
            replica=launch_parameters.step_replication_number,
        )
        assert "id" in response
        rwfs_id: str = response["id"]
        # And add the variables we've been provided with
        if launch_parameters.variables:
            _ = self._api_adapter.set_running_workflow_step_variables(
                running_workflow_step_id=rwfs_id, variables=launch_parameters.variables
            )

        # Create an Instance record (and dummy Task ID)
        response = self._api_adapter.create_instance(running_workflow_step_id=rwfs_id)
        instance_id = response["id"]
        task_id = "task-00000000-0000-0000-0000-000000000001"

@@ -96,8 +110,8 @@ def launch(self, launch_parameters: LaunchParameters) -> LaunchResult:
        # The command may not need any, but we do the decoding anyway.
        decoded_command, status = job_decoder.decode(
            job["command"],
            launch_parameters.specification_variables,
            launch_parameters.running_workflow_step_id,
            launch_parameters.variables,
            rwfs_id,
            TextEncoding.JINJA2_3_0,
        )
        print(f"Decoded command: {decoded_command}")
@@ -129,6 +143,7 @@ def launch(self, launch_parameters: LaunchParameters) -> LaunchResult:
        self._msg_dispatcher.send(pod_message)

        return LaunchResult(
            running_workflow_step_id=rwfs_id,
            instance_id=instance_id,
            task_id=task_id,
            command=" ".join(subprocess_cmd),
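For orientation, a minimal sketch of how a caller might drive the test launcher's new keyword-only launch(). The field names are the ones referenced in the diff above; the keyword-style LaunchParameters constructor, the example values and the launcher variable are assumptions for illustration, not code from this PR.

# Sketch only: field names come from tests/instance_launcher.py; the
# LaunchParameters constructor style and all values are assumptions.
launch_parameters = LaunchParameters(
    project_id=TEST_PROJECT_ID,
    running_workflow_id="r-workflow-00000000-0000-0000-0000-000000000001",  # hypothetical
    step_name="step-1",
    step_replication_number=0,
    variables={"inputFile": "input.smi"},
    specification={"job": "splitsmiles"},  # hypothetical specification content
)
result = launcher.launch(launch_parameters=launch_parameters)  # keyword-only after this change
assert result.running_workflow_step_id and result.instance_id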
11 changes: 11 additions & 0 deletions tests/job-definitions/job-definitions.yaml
@@ -132,3 +132,14 @@ jobs:
  concatenate:
    command: >-
      concatenate.py {% for ifile in inputFile %}{{ ifile }} {% endfor %} --outputFile {{ outputFile }}

  splitsmiles:
    command: >-
      copyf.py {{ inputFile }}
    # Simulate multiple output files...
    variables:
      outputs:
        properties:
          outputBase:
            creates: '{{ outputBase }}_*.smi'
            type: files
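The splitsmiles entry declares its outputs with a glob-style creates pattern rather than a fixed filename. Below is a small, self-contained Python sketch of how such a rendered pattern could be expanded after the job has run; the directory, outputBase value and file names are hypothetical.

from pathlib import Path

# Hypothetical post-run check: expand the rendered 'creates' pattern
# ('{{ outputBase }}_*.smi') and list whatever chunk files were produced.
output_base = "chunk"                       # assumed value of the outputBase variable
execution_directory = Path("project-root")  # placeholder for the job's working directory
produced = sorted(execution_directory.glob(f"{output_base}_*.smi"))
print([p.name for p in produced])           # e.g. ['chunk_1.smi', 'chunk_2.smi']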
30 changes: 30 additions & 0 deletions tests/jobs/copyf.py
@@ -0,0 +1,30 @@
import shutil
import sys
from pathlib import Path


def main():
    print("copyf job running")
    if len(sys.argv) != 2:
        print("Usage: python copyf.py <filename>")
        sys.exit(1)

    original_path = Path(sys.argv[1])

    if not original_path.exists() or not original_path.is_file():
        print(f"Error: '{original_path}' does not exist or is not a file.")
        sys.exit(1)

    # Copy the input twice, to 'chunk_1.smi' and 'chunk_2.smi' alongside the original
    new_name = original_path.absolute().parent.joinpath("chunk_1.smi")
    new_path = original_path.with_name(new_name.name)
    shutil.copyfile(original_path, new_path)

    new_name = original_path.absolute().parent.joinpath("chunk_2.smi")
    new_path = original_path.with_name(new_name.name)
    shutil.copyfile(original_path, new_path)


if __name__ == "__main__":
    main()
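A quick, self-contained way to exercise the script; it assumes it is run from the repository root, and the scratch directory and SMILES line are made up for illustration.

import subprocess
import sys
from pathlib import Path

# Run the copy job against a throwaway input file and check that the two
# simulated chunk outputs appear next to it (paths assume the repo root as CWD).
work = Path("scratch")
work.mkdir(exist_ok=True)
(work / "input.smi").write_text("CCO ethanol\n")
subprocess.run([sys.executable, "tests/jobs/copyf.py", str(work / "input.smi")], check=True)
assert (work / "chunk_1.smi").exists() and (work / "chunk_2.smi").exists()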
4 changes: 4 additions & 0 deletions tests/jobs/copyf.sh
@@ -0,0 +1,4 @@
#! /bin/bash

cp "$1" chunk_1.smi
cp "$1" chunk_2.smi
72 changes: 72 additions & 0 deletions tests/jobs/split-smi.sh
@@ -0,0 +1,72 @@
#!/bin/bash
set -euo pipefail

if [[ $# -lt 3 || $# -gt 4 ]]; then
    echo "Usage: $0 <input_file(.smi or .smi.gz)> <lines_per_file> <output_basename> [has_header: yes]"
    exit 1
fi

input_file="$1"
lines_per_file="$2"
base_name="$3"
has_header="${4:-no}"

# Determine how to read the file (plain text or gzipped)
if [[ "$input_file" == *.gz ]]; then
    reader="zcat"
else
    reader="cat"
fi

if ! [[ -f "$input_file" ]]; then
    echo "Error: File '$input_file' not found"
    exit 1
fi

# Extract header if present
if [[ "$has_header" == "yes" ]]; then
    header="$($reader "$input_file" | head -n1)"
    data_start=2
else
    header=""
    data_start=1
fi

# Count number of data lines (excluding header if present)
data_lines="$($reader "$input_file" | tail -n +"$data_start" | wc -l)"
if [[ "$data_lines" -eq 0 ]]; then
    echo "No data lines to process."
    exit 0
fi

# Calculate number of output files and required zero padding
num_files=$(( (data_lines + lines_per_file - 1) / lines_per_file ))
pad_width=0
if [[ "$num_files" -gt 1 ]]; then
    pad_width=${#num_files}
fi

# Split logic
$reader "$input_file" | tail -n +"$data_start" | awk -v header="$header" -v lines="$lines_per_file" -v base="$base_name" -v pad="$pad_width" '
function new_file() {
    suffix = (pad > 0) ? sprintf("%0*d", pad, file_index) : file_index
    file = base "_" suffix ".smi"
    if (header != "") {
        print header > file
    }
    file_index++
    line_count = 0
}
{
    if (line_count == 0) {
        new_file()
    }
    print >> file
    line_count++
    if (line_count == lines) {
        close(file)
        print file " created"
        line_count = 0
    }
}
' file_index=1
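The chunk-count and zero-padding arithmetic is the least obvious part of the script, so here is a small Python mirror of it with made-up numbers; the file names are simply what the script would produce for an output basename of 'chunks'.

import math

# Mirror of the script's arithmetic (illustrative only): 2500 data lines split
# into files of 1000 lines gives 3 chunks, padded to the width of '3' (1 digit).
data_lines = 2500
lines_per_file = 1000
num_files = math.ceil(data_lines / lines_per_file)         # 3
pad_width = len(str(num_files)) if num_files > 1 else 0    # 1
names = [f"chunks_{i:0{pad_width}d}.smi" for i in range(1, num_files + 1)]
print(names)  # ['chunks_1.smi', 'chunks_2.smi', 'chunks_3.smi']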