Merged (105 commits)
a74ff91
feat: kind-version now 2025.2
Jun 23, 2025
d955a19
feat: Initial schema for replicate step declaration
Jun 23, 2025
ab94bd7
feat: Add get_generated_outputs_for_step_output() to API adapter
Jun 23, 2025
b4be311
feat: Rename new API method
Jun 23, 2025
27c5a83
feat: Add replica to step creation (and step-by-name query)
Jun 23, 2025
1d22716
feat: Removed 'as' from workflow mapping output declaration
Jun 23, 2025
0c60799
feat: Add get_running_workflow_step_output_values_for_output() to API
Jun 23, 2025
12b3602
fix: Removed rogue 'print' statement
Jun 23, 2025
da12869
test: Add mock of step outputs
Jun 26, 2025
cc5d8cc
fix: it's a mess
kaliif Jul 18, 2025
ad00205
docs: Add instance-directory comment
Aug 8, 2025
95eafcb
feat: Creating instances now adds instance-directory property to step…
Aug 8, 2025
f37ea74
fix: get_instance() response type
Aug 8, 2025
13e4470
test: Add tests for instance_directory
Aug 8, 2025
37d5ad2
Merge pull request #33 from InformaticsMatters/instance-directory
alanbchristie Aug 8, 2025
294c765
Merge branch '2.0' into fanout-jobs
kaliif Aug 8, 2025
84866d0
build: Add experimental json copy of the schema
Aug 11, 2025
20a0e3e
style: Removed unnecessary json schema file
Aug 11, 2025
82f14bf
fix: Better rfc1035-label-name pattern
Aug 11, 2025
de34339
fix: Better rfc1035-label-name regex
Aug 11, 2025
e7adc1b
test: Fix decoder tests
Aug 11, 2025
d946546
Merge branch '2.0' into fanout-jobs
kaliif Aug 15, 2025
00e49bb
test: Fix test module name (for consistency)
Aug 15, 2025
39bb840
Remove unnecessary logic
alanbchristie Aug 15, 2025
30ba59b
Merge pull request #34 from InformaticsMatters/alanbchristie-patch-1
alanbchristie Aug 15, 2025
2a6b708
fix: stashing
kaliif Aug 18, 2025
340670e
fix: stashing
kaliif Aug 19, 2025
e3ece79
refactor: Major refactor (new variable-mapping schema)
Aug 19, 2025
f9d4aca
refactor: from-workflow-variable becomes from-workflow
Aug 19, 2025
2c4c867
refactor: from-workflow-variable becomes from-workflow
Aug 19, 2025
c2eb21c
refactor: Replicate now uses variable not input
Aug 19, 2025
8d05f15
fix: No longer need realise-outputs
Aug 19, 2025
0da1a59
build: Add devcontainer
Aug 21, 2025
6ddca27
build: No need for DinD
Aug 21, 2025
bca4f9c
docs: Docs on devcontainer
Aug 21, 2025
47a5ffc
Merge pull request #35 from InformaticsMatters/2.0
alanbchristie Aug 22, 2025
b9e3f00
docs: Doc tweak
Aug 22, 2025
5a9903b
feat: Some work on the refactored engine
Aug 22, 2025
f3bbc6d
fix: More fixes for engine
Aug 27, 2025
8809440
fix: More work on the decoder
Aug 27, 2025
77e3cff
fix: Variable mapping now exposed as a Translation dataclass
Aug 27, 2025
8f1c098
fix: Major refactoring of logic (for new launch/workflow API)
Aug 28, 2025
7770d7f
feat: First successful replicating workflow test
Aug 28, 2025
c53b245
feat: Use of decoder 2.4.0 (traits)
Aug 29, 2025
3883412
refactor: Switch away from workflow replicate property
Sep 1, 2025
6397955
refactor: Refactored using decoder 2.5.0
Sep 2, 2025
7f4b0c6
docs: Doc tweak
Sep 2, 2025
4b1e868
Merge pull request #36 from InformaticsMatters/fanout-jobs
alanbchristie Sep 2, 2025
5e2c8bc
refactor: variable-map is now 'plumbing' and Translation is a 'Conne…
Sep 2, 2025
c39ddb7
refactor: Better function and variable naming (plumbing)
Sep 2, 2025
cdd936e
feat: new _prepare_step_variables function
Sep 2, 2025
c08ed5c
feat: Refactoring
Sep 2, 2025
19e58ae
refactor: More combiner logic
Sep 2, 2025
f6707c8
feat: refactor definition of an output
Sep 2, 2025
5834c8c
fix: Add get_status_of_all_step_instances_by_name implementation (and…
Sep 3, 2025
cfaeaec
docs: Doc tweak
Sep 3, 2025
7d0363e
fix: Typo in YAML
Sep 3, 2025
f336119
feat: Minor work on combiner logic
Sep 3, 2025
bea2cdb
fix: First very basic combiner run
Sep 3, 2025
94fd202
docs: Doc tweak
Sep 3, 2025
8dd9308
fix: replica always starts at 0
Sep 3, 2025
179135a
docs: Doc tweak
Sep 3, 2025
cdddc35
feat: Add from-link-prefix variables
Sep 4, 2025
17da698
Merge pull request #38 from InformaticsMatters/concat
alanbchristie Sep 4, 2025
c6a34e8
feat: Switch to pre-defined variables (rather then link)
Sep 4, 2025
04f57d2
fix: Fix input output hanes in decoder
Sep 4, 2025
8891ee5
fix: Add from-project support
Sep 4, 2025
3a8e569
refactor: lin-prefix becomes link-glob
Sep 16, 2025
dec6390
fix: Return kwargs to launch()
Sep 16, 2025
02ad4ea
feat: Add dependent_instances to LaunchParameters
Sep 18, 2025
646128f
feat: Populate dependent_instances
Sep 18, 2025
4fd2619
refactor: link-glob is now instance-link-glob
Sep 19, 2025
a1d1fea
refactor: Remove to/from project schema
Sep 19, 2025
10c3259
feat: Add outputs to step prepration response
Sep 23, 2025
8ffb3af
test: Attempt to fix tests
Sep 23, 2025
a38c626
test: Fix instance creation
Sep 23, 2025
53b858b
fix: Fix test execution
Sep 23, 2025
2d2d30e
docs: Significant level of module documentation
Sep 23, 2025
776020e
docs: More docs
Sep 23, 2025
af44290
docs: Doc tweak
Sep 23, 2025
a3f76ed
docs: More doc and typo corrections
Sep 24, 2025
18d9a82
docs: Code typos corrected
Sep 24, 2025
d37caf0
dev: Logs end of step and end of running workflow
Sep 24, 2025
594c0e8
dev: Engine now collects inputs and sets LP inputs & outputs
Sep 24, 2025
17ddd62
style: Log inputs
Sep 24, 2025
1d41048
dev: Fix handling of launch() result
Sep 24, 2025
3fe5969
fix: Better handling of get_running_workflow_step_by_name() response
Sep 25, 2025
ac23732
feat: Engine now attempts to prefix inputs (from prior steps)
Sep 25, 2025
b788cd0
fix: Fix prior-step prefix
Sep 25, 2025
cde958a
fix: Launch parameter sets are now lists
Sep 25, 2025
a5774b0
dev: fix build
Sep 25, 2025
43c516f
docs: Doc tweak
Sep 25, 2025
079736c
dev: Fix instance prefix
Sep 26, 2025
cf6cfbe
fix: Engine now handles lack of job better
Sep 30, 2025
b61baa7
fix: Validator now ensures jobs exist (run-level test)
Sep 30, 2025
e90a313
fix: Better testing of validation (job must be present)
Sep 30, 2025
967b3b3
feat: Better validation error message
Oct 2, 2025
aca2891
test: Fix tests
Oct 2, 2025
edc70ee
fix: Better handling of lack of outputs
Oct 14, 2025
d67ce50
docs: Doc tweak
Oct 14, 2025
a5113f3
fix: Fix iteration input path
Oct 14, 2025
515b7f5
test: Disable unit test
Oct 14, 2025
564e6a5
fix: Support for dirsGlob
Oct 15, 2025
350f303
fix: Better step preparation initial log (combiners)
Oct 15, 2025
5743f97
fix: Combiners no longer get built-in variables automatically (#39)
alanbchristie Dec 1, 2025
4 changes: 2 additions & 2 deletions .cz.yaml
@@ -9,8 +9,8 @@
commitizen:
name: cz_customize
customize:
schema_pattern: "^(?P<change_type>feat|fix|perf|refactor|remove|style|test|build|docs|chore|ci|BREAKING CHANGE)(?:\\((?P<scope>[^()\\r\\n]*)\\)|\\()?(?P<breaking>!)?:\\s(?P<message>.*)?"
commit_parser: "^(?P<change_type>feat|fix|perf|refactor|remove|style|test|build|docs|chore|ci|BREAKING CHANGE)(?:\\((?P<scope>[^()\\r\\n]*)\\)|\\()?(?P<breaking>!)?:\\s(?P<message>.*)?"
schema_pattern: "^(?P<change_type>feat|fix|perf|refactor|remove|style|test|build|docs|chore|ci|dev|BREAKING CHANGE)(?:\\((?P<scope>[^()\\r\\n]*)\\)|\\()?(?P<breaking>!)?:\\s(?P<message>.*)?"
commit_parser: "^(?P<change_type>feat|fix|perf|refactor|remove|style|test|build|docs|chore|ci|dev|BREAKING CHANGE)(?:\\((?P<scope>[^()\\r\\n]*)\\)|\\()?(?P<breaking>!)?:\\s(?P<message>.*)?"
# The changelog_pattern identifies the commit types
# that will be included.
# Build the changelog with 'cz ch' on the staging or production branches.
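The `.cz.yaml` change above simply adds `dev` to the set of accepted conventional-commit types (several commits in this PR use it, e.g. `dev: fix build`). A quick way to confirm the pattern behaves as intended is to exercise it with Python's `re`; the `PATTERN` below is the YAML `schema_pattern` with its doubled backslashes un-escaped, and `parse_commit` is just an illustrative helper, not part of the PR:

```python
import re

# The commitizen schema pattern from .cz.yaml, including the newly
# added "dev" change type (YAML's "\\(" becomes the regex "\(").
PATTERN = (
    r"^(?P<change_type>feat|fix|perf|refactor|remove|style|test|build|docs"
    r"|chore|ci|dev|BREAKING CHANGE)"
    r"(?:\((?P<scope>[^()\r\n]*)\)|\()?(?P<breaking>!)?"
    r":\s(?P<message>.*)?"
)

def parse_commit(subject: str):
    """Return (change_type, scope, message), or None if non-conforming."""
    m = re.match(PATTERN, subject)
    if not m:
        return None
    return m.group("change_type"), m.group("scope"), m.group("message")

print(parse_commit("dev: fix build"))             # accepted with the new type
print(parse_commit("feat(engine): add replicas")) # scoped commits still parse
print(parse_commit("wip: something"))             # unknown types still rejected
```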
41 changes: 41 additions & 0 deletions .devcontainer/devcontainer.json
@@ -0,0 +1,41 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/python
{
"name": "WorkflowEngine Python 3.13",
"image": "mcr.microsoft.com/devcontainers/python:1-3.13-bullseye",
"features": {
"ghcr.io/devcontainers/features/git:1": {
"ppa": true,
"version": "os-provided"
}
},
// We mount bash history in an attempt to preserve history
// between container restarts
// (see https://code.visualstudio.com/remote/advancedcontainers/persist-bash-history)
"mounts": [
"source=projectname-bashhistory,target=/commandhistory,type=volume"
],
"customizations": {
"vscode": {
"extensions": [
"codezombiech.gitignore",
"donjayamanne.githistory",
"donjayamanne.git-extension-pack",
"eamodio.gitlens",
"github.vscode-github-actions",
"ms-kubernetes-tools.vscode-kubernetes-tools",
"ms-python.vscode-pylance",
"sourcery.sourcery",
"streetsidesoftware.code-spell-checker",
"trond-snekvik.simple-rst",
"vivaxy.vscode-conventional-commits",
"yzhang.markdown-all-in-one"
]
}
},
"postCreateCommand": {
"Install Python requirements": "pip3 install --user -r requirements.txt",
"Fix Volume Permissions": "sudo chown -R $(whoami): /commandhistory"
},
"forwardPorts": []
}
1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@ dist/
**/__pycache__/
**/*.pickle
tests/project-root/project-*/
**/.DS_Store

# temp files
*~
10 changes: 5 additions & 5 deletions README.rst
@@ -38,10 +38,9 @@ The project's written in Python and uses `Poetry`_ for dependency and package
management. We also use `pre-commit`_ to manage our pre-commit hooks, which
rely on `black`_, `mypy`_, `pylint`_, amongst others.

Create your environment::
From within a VS Code `devcontainer`_ environment (recommended)::

poetry shell
poetry install --with dev
poetry install --with dev --sync
pre-commit install -t commit-msg -t pre-commit

And then start by running the pre-commit hooks to ensure you're starting with a
@@ -51,9 +50,10 @@ _clean_ project::

And then run the tests::

coverage run -m pytest
coverage report
poetry run coverage run -m pytest
poetry run coverage report

.. _devcontainer: https://code.visualstudio.com/docs/devcontainers/containers
.. _Poetry: https://python-poetry.org
.. _pre-commit: https://pre-commit.com
.. _black: https://github.com/psf/black
543 changes: 306 additions & 237 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -14,7 +14,7 @@ packages = [
[tool.poetry.dependencies]
python = "^3.12"
im-protobuf = "^8.2.0"
im-data-manager-job-decoder = "^2.1.0"
im-data-manager-job-decoder = "^2.5.0"
jsonschema = "^4.21.1"
pyyaml = ">= 5.3.1, < 7.0"

2 changes: 2 additions & 0 deletions requirements.txt
@@ -0,0 +1,2 @@
poetry == 1.8.5
pre-commit == 4.2.0
54 changes: 40 additions & 14 deletions tests/instance_launcher.py
@@ -68,39 +68,64 @@ def __init__(
elif os.path.isdir(file_path):
shutil.rmtree(file_path)

def launch(self, launch_parameters: LaunchParameters) -> LaunchResult:
def launch(self, *, launch_parameters: LaunchParameters) -> LaunchResult:
assert launch_parameters
assert launch_parameters.project_id == TEST_PROJECT_ID
assert launch_parameters.specification
assert isinstance(launch_parameters.specification, dict)

os.makedirs(EXECUTION_DIRECTORY, exist_ok=True)

# We're passed a RunningWorkflowStep ID, but a record is expected to have been
# created by the caller; we simply create instance records.
response, _ = self._api_adapter.get_running_workflow_step(
running_workflow_step_id=launch_parameters.running_workflow_step_id
)
# Now simulate the creation of a Task and Instance record
response = self._api_adapter.create_instance(
running_workflow_step_id=launch_parameters.running_workflow_step_id
)
if launch_parameters.step_replication_number:
assert (
launch_parameters.step_replication_number
<= launch_parameters.total_number_of_replicas
)

# Create an Instance record (and dummy Task ID)
response = self._api_adapter.create_instance()
instance_id = response["id"]
task_id = "task-00000000-0000-0000-0000-000000000001"

# Apply variables to the step's Job command.
# Create a running workflow step
assert launch_parameters.running_workflow_id
assert launch_parameters.step_name
response, _ = self._api_adapter.create_running_workflow_step(
running_workflow_id=launch_parameters.running_workflow_id,
step=launch_parameters.step_name,
instance_id=instance_id,
replica=launch_parameters.step_replication_number,
replicas=launch_parameters.total_number_of_replicas,
)
assert "id" in response
rwfs_id: str = response["id"]
# And add the variables we've been provided with
if launch_parameters.variables:
_ = self._api_adapter.set_running_workflow_step_variables(
running_workflow_step_id=rwfs_id, variables=launch_parameters.variables
)

# Now add the running workflow step ID to the instance record.
self._api_adapter.set_instance_running_workflow_step_id(
instance_id=instance_id,
running_workflow_step_id=rwfs_id,
)

# Get the job definition.
# This is expected to exist in the tests/job-definitions directory.
job, _ = self._api_adapter.get_job(
collection=launch_parameters.specification["collection"],
job=launch_parameters.specification["job"],
version="do-not-care",
)
assert job

# Now apply the variables to the command
# Now apply the provided variables to the command.
# The command may not need any, but we do the decoding anyway.
decoded_command, status = job_decoder.decode(
job["command"],
launch_parameters.specification_variables,
launch_parameters.running_workflow_step_id,
launch_parameters.variables,
rwfs_id,
TextEncoding.JINJA2_3_0,
)
print(f"Decoded command: {decoded_command}")
@@ -132,6 +157,7 @@ def launch(self, launch_parameters: LaunchParameters) -> LaunchResult:
self._msg_dispatcher.send(pod_message)

return LaunchResult(
running_workflow_step_id=rwfs_id,
instance_id=instance_id,
task_id=task_id,
command=" ".join(subprocess_cmd),
34 changes: 33 additions & 1 deletion tests/job-definitions/job-definitions.yaml
@@ -131,4 +131,36 @@ jobs:

concatenate:
command: >-
concatenate.py {% for ifile in inputFile %}{{ ifile }} {% endfor %} --outputFile {{ outputFile }}
concatenate.py --inputFile {{ inputFile }} --outputFile {{ outputFile }}
# Simulate a Job with multiple input files (a combiner)...
variables:
inputs:
properties:
inputFile:
type: files
options:
type: object
properties:
inputDirPrefix:
title: Optional input directory prefix
type: string
outputs:
properties:
outputBase:
creates: '{{ outputFile }}'
type: file

splitsmiles:
command: >-
copyf.py {{ inputFile }}
# Simulate a Job with multiple output files (a splitter)...
variables:
inputs:
properties:
inputFile:
type: file
outputs:
properties:
outputBase:
creates: '{{ outputBase }}_*.smi'
type: files
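The job definitions above pair a Jinja command template with declared input/output variables; the test launcher renders the command via `job_decoder.decode(..., TextEncoding.JINJA2_3_0)`. As a minimal stand-in (not the decoder's real behaviour, which supports loops, filters and more), plain `{{ name }}` substitution is enough to see how a definition and its variables combine:

```python
import re

def render_command(template: str, variables: dict) -> str:
    """A tiny stand-in for the job decoder's Jinja rendering:
    replaces each '{{ name }}' with the matching variable value.
    (The real engine uses im-data-manager-job-decoder with
    TextEncoding.JINJA2_3_0, which does far more than this.)"""
    def substitute(match: re.Match) -> str:
        return str(variables[match.group(1)])
    return re.sub(r"\{\{\s*(\w+)\s*\}\}", substitute, template).strip()

# The 'concatenate' command template from the definitions above.
command = render_command(
    "concatenate.py --inputFile {{ inputFile }} --outputFile {{ outputFile }}",
    {"inputFile": "chunk_1.smi", "outputFile": "all.smi"},
)
print(command)
# concatenate.py --inputFile chunk_1.smi --outputFile all.smi
```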
11 changes: 7 additions & 4 deletions tests/jobs/concatenate.py
@@ -2,13 +2,16 @@

parser = argparse.ArgumentParser(
prog="addcol",
description="Takes a list of files and writes them into single outputfile",
description="Takes an optional directory prefix and a file,"
" and combines all the input files that are found"
" into single outputfile",
)
parser.add_argument("inputFile", nargs="+", type=argparse.FileType("r"))
parser.add_argument("--inputDirPrefix")
parser.add_argument("--inputFile", required=True)
parser.add_argument("-o", "--outputFile", required=True)
args = parser.parse_args()


with open(args.outputFile, "wt", encoding="utf8") as ofile:
for f in args.inputFile:
ofile.write(f.read())
with open(args.inputFile, "rt", encoding="utf8") as ifile:
ofile.write(ifile.read())
30 changes: 30 additions & 0 deletions tests/jobs/copyf.py
@@ -0,0 +1,30 @@
import shutil
import sys
from pathlib import Path


def main():
print("copyf job runnint")
if len(sys.argv) != 2:
print("Usage: python copy_file.py <filename>")
sys.exit(1)

original_path = Path(sys.argv[1])

if not original_path.exists() or not original_path.is_file():
print(f"Error: '{original_path}' does not exist or is not a file.")
sys.exit(1)

# Create a sibling copy named 'chunk_1.smi'
new_name = original_path.absolute().parent.joinpath("chunk_1.smi")
new_path = original_path.with_name(new_name.name)
shutil.copyfile(original_path, new_path)

new_name = original_path.absolute().parent.joinpath("chunk_2.smi")
new_path = original_path.with_name(new_name.name)

shutil.copyfile(original_path, new_path)


if __name__ == "__main__":
main()
4 changes: 4 additions & 0 deletions tests/jobs/copyf.sh
@@ -0,0 +1,4 @@
#! /bin/bash

cp "$1" chunk_1.smi
cp "$1" chunk_2.smi
72 changes: 72 additions & 0 deletions tests/jobs/split-smi.sh
@@ -0,0 +1,72 @@
#!/bin/bash
set -euo pipefail

if [[ $# -lt 3 || $# -gt 4 ]]; then
echo "Usage: $0 <input_file(.smi or .smi.gz)> <lines_per_file> <output_basename> [has_header: yes]"
exit 1
fi

input_file="$1"
lines_per_file="$2"
base_name="$3"
has_header="${4:-no}"

# Determine how to read the file (plain text or gzipped)
if [[ "$input_file" == *.gz ]]; then
reader="zcat"
else
reader="cat"
fi

if ! [[ -f "$input_file" ]]; then
echo "Error: File '$input_file' not found"
exit 1
fi

# Extract header if present
if [[ "$has_header" == "yes" ]]; then
header="$($reader "$input_file" | head -n1)"
data_start=2
else
header=""
data_start=1
fi

# Count number of data lines (excluding header if present)
data_lines="$($reader "$input_file" | tail -n +"$data_start" | wc -l)"
if [[ "$data_lines" -eq 0 ]]; then
echo "No data lines to process."
exit 0
fi

# Calculate number of output files and required zero padding
num_files=$(( (data_lines + lines_per_file - 1) / lines_per_file ))
pad_width=0
if [[ "$num_files" -gt 1 ]]; then
pad_width=${#num_files}
fi

# Split logic
$reader "$input_file" | tail -n +"$data_start" | awk -v header="$header" -v lines="$lines_per_file" -v base="$base_name" -v pad="$pad_width" '
function new_file() {
suffix = (pad > 0) ? sprintf("%0*d", pad, file_index) : file_index
file = base "_" suffix ".smi"
if (header != "") {
print header > file
}
file_index++
line_count = 0
}
{
if (line_count == 0) {
new_file()
}
print >> file
line_count++
if (line_count == lines) {
close(file)
print file " created"
line_count = 0
}
}
' file_index=1
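The shell/awk splitter above chunks data lines into `<base>_<n>.smi` files, zero-padding the numeric suffix to the width of the final file number when more than one file is produced. A compact Python re-implementation (an illustrative sketch, not part of the PR, and omitting the gzip and header-detection handling) captures the same naming logic:

```python
from pathlib import Path

def split_smi(lines, lines_per_file, base_name, header=None, out_dir="."):
    """Split data lines into '<base>_<n>.smi' chunks, zero-padding the
    numeric suffix when more than one file is produced (mirroring the
    awk logic above). Returns the names of the files written."""
    lines = list(lines)
    num_files = max(1, -(-len(lines) // lines_per_file))  # ceiling division
    pad = len(str(num_files)) if num_files > 1 else 0
    written = []
    for i in range(num_files):
        chunk = lines[i * lines_per_file:(i + 1) * lines_per_file]
        suffix = str(i + 1).zfill(pad) if pad else str(i + 1)
        path = Path(out_dir) / f"{base_name}_{suffix}.smi"
        # Repeat the header (if any) at the top of every chunk,
        # as the awk script does.
        content = ([header] if header else []) + chunk
        path.write_text("\n".join(content) + "\n", encoding="utf8")
        written.append(path.name)
    return written
```

With 5 data lines and 2 lines per file this yields `chunk_1.smi`, `chunk_2.smi` and `chunk_3.smi`, the last holding the single remaining line.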
6 changes: 3 additions & 3 deletions tests/message_dispatcher.py
@@ -1,17 +1,17 @@
"""The UnitTest Message Dispatcher.

A very simple object that relies on an underlying message queue.
A very simple object that relies on an underlying message queue and is designed
to emulate the behaviour of the message queue used in the Data Manager.
Here we offer a minimal implementation that simply sends a (protocol buffer) message
to the queue.
"""

from google.protobuf.message import Message

from tests.message_queue import UnitTestMessageQueue
from workflow.workflow_abc import MessageDispatcher


class UnitTestMessageDispatcher(MessageDispatcher):
class UnitTestMessageDispatcher:
"""A minimal Message dispatcher to support testing."""

def __init__(self, msg_queue: UnitTestMessageQueue):