Skip to content

Commit cc5d8cc

Browse files
committed
fix: it's a mess
1 parent da12869 commit cc5d8cc

File tree

10 files changed

+344
-52
lines changed

10 files changed

+344
-52
lines changed

tests/job-definitions/job-definitions.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,3 +132,7 @@ jobs:
132132
concatenate:
133133
command: >-
134134
concatenate.py {% for ifile in inputFile %}{{ ifile }} {% endfor %} --outputFile {{ outputFile }}
135+
136+
splitsmiles:
137+
command: >-
138+
copyf.py {{ inputFile }}

tests/jobs/copyf.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import shutil
import sys
from pathlib import Path


def main(argv=None):
    """Duplicate a file as 'chunk_1.smi' and 'chunk_2.smi' beside it.

    Test helper for the 'splitsmiles' job: it simulates a fan-out step by
    copying the single input file into two chunk files in the input file's
    own directory.

    :param argv: Argument vector including the program name (defaults to
        ``sys.argv`` so existing ``main()`` callers are unaffected).
        Exactly one filename argument is expected.
    :raises SystemExit: with status 1 on bad usage or a missing input file.
    """
    print("copyf job running")
    args = sys.argv if argv is None else argv
    if len(args) != 2:
        print("Usage: python copyf.py <filename>")
        sys.exit(1)

    original_path = Path(args[1])

    # is_file() is False for non-existent paths too, covering both checks.
    if not original_path.is_file():
        print(f"Error: '{original_path}' does not exist or is not a file.")
        sys.exit(1)

    # Write both chunks next to the input file (not the current directory).
    for index in (1, 2):
        chunk_path = original_path.with_name(f"chunk_{index}.smi")
        shutil.copyfile(original_path, chunk_path)


if __name__ == "__main__":
    main()

tests/jobs/copyf.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#!/bin/bash
# Duplicate the input file as chunk_1.smi and chunk_2.smi in the current
# working directory. Test fan-out helper (shell twin of copyf.py — note
# that the Python variant writes next to the input file, this one writes
# to the CWD).
set -euo pipefail

if [[ $# -ne 1 ]]; then
    echo "Usage: $0 <input_file>" >&2
    exit 1
fi

cp "$1" chunk_1.smi
cp "$1" chunk_2.smi

tests/jobs/split-smi.sh

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#!/bin/bash
2+
set -euo pipefail
3+
4+
if [[ $# -lt 3 || $# -gt 4 ]]; then
5+
echo "Usage: $0 <input_file(.smi or .smi.gz)> <lines_per_file> <output_basename> [has_header: yes]"
6+
exit 1
7+
fi
8+
9+
input_file="$1"
10+
lines_per_file="$2"
11+
base_name="$3"
12+
has_header="${4:-no}"
13+
14+
# Determine how to read the file (plain text or gzipped)
15+
if [[ "$input_file" == *.gz ]]; then
16+
reader="zcat"
17+
else
18+
reader="cat"
19+
fi
20+
21+
if ! [[ -f "$input_file" ]]; then
22+
echo "Error: File '$input_file' not found"
23+
exit 1
24+
fi
25+
26+
# Extract header if present
27+
if [[ "$has_header" == "yes" ]]; then
28+
header="$($reader "$input_file" | head -n1)"
29+
data_start=2
30+
else
31+
header=""
32+
data_start=1
33+
fi
34+
35+
# Count number of data lines (excluding header if present)
36+
data_lines="$($reader "$input_file" | tail -n +"$data_start" | wc -l)"
37+
if [[ "$data_lines" -eq 0 ]]; then
38+
echo "No data lines to process."
39+
exit 0
40+
fi
41+
42+
# Calculate number of output files and required zero padding
43+
num_files=$(( (data_lines + lines_per_file - 1) / lines_per_file ))
44+
pad_width=0
45+
if [[ "$num_files" -gt 1 ]]; then
46+
pad_width=${#num_files}
47+
fi
48+
49+
# Split logic
50+
$reader "$input_file" | tail -n +"$data_start" | awk -v header="$header" -v lines="$lines_per_file" -v base="$base_name" -v pad="$pad_width" '
51+
function new_file() {
52+
suffix = (pad > 0) ? sprintf("%0*d", pad, file_index) : file_index
53+
file = base "_" suffix ".smi"
54+
if (header != "") {
55+
print header > file
56+
}
57+
file_index++
58+
line_count = 0
59+
}
60+
{
61+
if (line_count == 0) {
62+
new_file()
63+
}
64+
print >> file
65+
line_count++
66+
if (line_count == lines) {
67+
close(file)
68+
print file " created"
69+
line_count = 0
70+
}
71+
}
72+
' file_index=1

tests/test_workflow_engine_examples.py

Lines changed: 41 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ def start_workflow(
8383
variables=variables,
8484
level=ValidationLevel.RUN,
8585
)
86+
print("vr_result", vr_result)
8687
assert vr_result.error_num == 0
8788
# 3.
8889
response = da.create_running_workflow(
@@ -401,22 +402,44 @@ def test_workflow_engine_simple_python_molprops_with_options(basic_engine):
401402
assert project_file_exists(output_file_2)
402403

403404

404-
def test_workflow_engine_simple_python_parallel(basic_engine):
405+
def test_workflow_engine_simple_python_fanout(basic_engine):
405406
# Arrange
406407
md, da = basic_engine
408+
409+
da.mock_get_running_workflow_step_output_values_for_output(
410+
step_name="first-step",
411+
output_variable="outputFile",
412+
output=["chunk_1.smi", "chunk_2.smi"],
413+
)
414+
415+
# da.mock_get_running_workflow_step_output_values_for_output(
416+
# step_name="parallel-step",
417+
# output_variable="outputFile",
418+
# output=["chunk_1_proc.smi", "chunk_2_proc.smi"]
419+
# )
420+
421+
# da.mock_get_running_workflow_step_output_values_for_output(
422+
# step_name="final-step",
423+
# output_variable="outputFile",
424+
# output=["final-step.out.smi"],
425+
# )
426+
407427
# Make sure files that should be generated by the test
408428
# do not exist before we run the test.
409-
output_file_first = "first-step.out.smi"
429+
output_file_first = "chunk_1.smi"
430+
output_file_second = "chunk_2.smi"
410431
assert not project_file_exists(output_file_first)
411-
output_file_pa = "parallel-step-a.out.smi"
412-
assert not project_file_exists(output_file_pa)
413-
output_file_pb = "parallel-step-b.out.smi"
414-
assert not project_file_exists(output_file_pb)
415-
output_file_final = "final-step.out.smi"
416-
assert not project_file_exists(output_file_final)
432+
assert not project_file_exists(output_file_second)
433+
output_file_p_first = "chunk_1_proc.smi"
434+
output_file_p_second = "chunk_2_proc.smi"
435+
assert not project_file_exists(output_file_p_first)
436+
assert not project_file_exists(output_file_p_second)
437+
# output_file_final = "final-step.out.smi"
438+
# assert not project_file_exists(output_file_final)
417439
# And create the test's input file.
418440
input_file_1 = "input1.smi"
419-
input_file_1_content = "O=C(CSCc1ccc(Cl)s1)N1CCC(O)CC1"
441+
input_file_1_content = """O=C(CSCc1ccc(Cl)s1)N1CCC(O)CC1
442+
COCN1C(=O)NC(C)(C)C1=O"""
420443
with open(
421444
f"{EXECUTION_DIRECTORY}/{input_file_1}", mode="wt", encoding="utf8"
422445
) as input_file:
@@ -426,7 +449,7 @@ def test_workflow_engine_simple_python_parallel(basic_engine):
426449
r_wfid = start_workflow(
427450
md,
428451
da,
429-
"simple-python-parallel",
452+
"simple-python-fanout",
430453
{"candidateMolecules": input_file_1},
431454
)
432455

@@ -435,16 +458,17 @@ def test_workflow_engine_simple_python_parallel(basic_engine):
435458
# Additional, detailed checks...
436459
# Check we only have one RunningWorkflowStep, and it succeeded
437460
response = da.get_running_workflow_steps(running_workflow_id=r_wfid)
461+
print("response", response)
438462

439-
assert response["count"] == 4
463+
assert response["count"] == 2
440464
assert response["running_workflow_steps"][0]["done"]
441465
assert response["running_workflow_steps"][0]["success"]
442466
assert response["running_workflow_steps"][1]["done"]
443467
assert response["running_workflow_steps"][1]["success"]
444-
assert response["running_workflow_steps"][2]["done"]
445-
assert response["running_workflow_steps"][2]["success"]
446-
assert response["running_workflow_steps"][3]["done"]
447-
assert response["running_workflow_steps"][3]["success"]
468+
# assert response["running_workflow_steps"][2]["done"]
469+
# assert response["running_workflow_steps"][2]["success"]
470+
# assert response["running_workflow_steps"][3]["done"]
471+
# assert response["running_workflow_steps"][3]["success"]
448472
# This test should generate a file in the simulated project directory
449-
assert project_file_exists(output_file_first)
450-
assert project_file_exists(output_file_final)
473+
# assert project_file_exists(output_file_first)
474+
# assert project_file_exists(output_file_final)

tests/wapi_adapter.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ def get_running_workflow_step_by_name(
202202
for rwfs_id, record in running_workflow_step.items():
203203
if record["running_workflow"]["id"] != running_workflow_id:
204204
continue
205+
print("running wf step by name, record:", record)
205206
if record["name"] == name and record["replica"] == replica:
206207
response = record
207208
response["id"] = rwfs_id
@@ -413,6 +414,11 @@ def get_running_workflow_step_output_values_for_output(
413414
mock_output = Unpickler(pickle_file).load()
414415
UnitTestWorkflowAPIAdapter.lock.release()
415416

417+
print("mock output", mock_output)
418+
print("step", step)
419+
print("step_name", step_name)
420+
# mock output {'first-step': {'output_variable': 'results', 'output': ['chunk_1.smi', 'chunk_2.smi']}}
421+
416422
if step_name not in mock_output:
417423
return {"output": []}, 0
418424
# The record's output variable must match (there's only one record per step atm)
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
---
2+
kind: DataManagerWorkflow
3+
kind-version: "2025.2"
4+
name: python-workflow
5+
description: >-
6+
A simple parallel workflow. Input is split into N chunks and N processes of the same job is started
7+
variable-mapping:
8+
inputs:
9+
- name: candidateMolecules
10+
outputs:
11+
- name: clusteredMolecules
12+
from:
13+
step: final-step
14+
output: outputFile
15+
16+
17+
steps:
18+
19+
- name: first-step
20+
description: Create inputs
21+
specification:
22+
collection: workflow-engine-unit-test-jobs
23+
job: splitsmiles
24+
version: "1.0.0"
25+
variables:
26+
name: "count"
27+
value: "1"
28+
inputs:
29+
- input: inputFile
30+
from:
31+
workflow-input: candidateMolecules
32+
outputs:
33+
- output: outputFile
34+
# as: chunk_*.smi
35+
36+
- name: parallel-step
37+
description: Add some params
38+
specification:
39+
collection: workflow-engine-unit-test-jobs
40+
job: append-col
41+
version: "1.0.0"
42+
variables:
43+
name: "desc1"
44+
value: "777"
45+
replicate:
46+
using:
47+
input: inputFile
48+
inputs:
49+
- input: inputFile
50+
from:
51+
step: first-step
52+
output: outputFile
53+
outputs:
54+
- output: outputFile
55+
# as: parallel-step.out.smi
56+
57+
# - name: final-step
58+
# description: Collate results
59+
# specification:
60+
# collection: workflow-engine-unit-test-jobs
61+
# job: concatenate
62+
# version: "1.0.0"
63+
# inputs:
64+
# - input: inputFile
65+
# from:
66+
# step: parallel-step
67+
# output: outputFile
68+
# outputs:
69+
# - output: outputFile
70+
# # as: final-step.out.smi

workflow/decoder.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,7 @@ def set_step_variables(
213213
workflow: dict[str, Any],
214214
inputs: list[dict[str, Any]],
215215
outputs: list[dict[str, Any]],
216+
step_outputs: dict[str, Any],
216217
previous_step_outputs: list[dict[str, Any]],
217218
workflow_variables: dict[str, Any],
218219
step_name: str,
@@ -224,6 +225,13 @@ def set_step_variables(
224225
"""
225226
result = {}
226227

228+
print("ssv: wf vars", workflow_variables)
229+
print("ssv: inputs", inputs)
230+
print("ssv: outputs", outputs)
231+
print("ssv: step_outputs", step_outputs)
232+
print("ssv: prev step outputs", previous_step_outputs)
233+
print("ssv: step_name", step_name)
234+
227235
for item in inputs:
228236
p_key = item["input"]
229237
p_val = ""
@@ -234,7 +242,16 @@ def set_step_variables(
234242
elif "step" in val.keys():
235243
for out in previous_step_outputs:
236244
if out["output"] == val["output"]:
237-
p_val = out["as"]
245+
# p_val = out["as"]
246+
if step_outputs["output"]:
247+
p_val = step_outputs["output"]
248+
print("\n!!!!!!!!!!!!!if clause!!!!!!!!!!!!!!!!!!!!!\n")
249+
print(p_val)
250+
else:
251+
# what do I need to do here??
252+
print("\n!!!!!!!!!!!!!else clause!!!!!!!!!!!!!!!!!!!!!\n")
253+
print(out)
254+
print(val)
238255

239256
# this bit handles multiple inputs: if a step
240257
# requires input from multiple steps, add them to
@@ -250,7 +267,9 @@ def set_step_variables(
250267

251268
for item in outputs:
252269
p_key = item["output"]
253-
p_val = item["as"]
270+
# p_val = item["as"]
271+
# p_val = step_outputs["output"]
272+
p_val = "somefile.smi"
254273
result[p_key] = p_val
255274

256275
options = set_variables_from_options_for_step(

workflow/workflow-schema.yaml

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -206,17 +206,17 @@ definitions:
206206
- from
207207

208208
# A Step output (with an 'as' - a declared value)
209-
step-output-as:
210-
type: object
211-
additionalProperties: false
212-
properties:
213-
output:
214-
$ref: '#/definitions/template-variable-name'
215-
as:
216-
$ref: '#/definitions/file-name'
217-
required:
218-
- output
219-
- as
209+
# step-output-as:
210+
# type: object
211+
# additionalProperties: false
212+
# properties:
213+
# output:
214+
# $ref: '#/definitions/template-variable-name'
215+
# as:
216+
# $ref: '#/definitions/file-name'
217+
# required:
218+
# - output
219+
# - as
220220

221221

222222
# A step specification variable
@@ -282,9 +282,9 @@ definitions:
282282
- $ref: "#/definitions/step-input-from-workflow"
283283
outputs:
284284
type: array
285-
items:
286-
anyOf:
287-
- $ref: "#/definitions/step-output-as"
285+
# items:
286+
# anyOf:
287+
# - $ref: "#/definitions/step-output-as"
288288
required:
289289
- name
290290
- specification

0 commit comments

Comments
 (0)