2 changes: 1 addition & 1 deletion README.rst
@@ -38,7 +38,7 @@ The project's written in Python and uses `Poetry`_ for dependency and package
management. We also use `pre-commit`_ to manage our pre-commit hooks, which
rely on `black`_, `mypy`_, `pylint`_, amongst others.

From within a VS Code `devcontainer`_] environment (recommended)::
From within a VS Code `devcontainer`_ environment (recommended)::

    poetry install --with dev --sync
    pre-commit install -t commit-msg -t pre-commit
543 changes: 306 additions & 237 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -14,7 +14,7 @@ packages = [
[tool.poetry.dependencies]
python = "^3.12"
im-protobuf = "^8.2.0"
im-data-manager-job-decoder = "^2.1.0"
im-data-manager-job-decoder = "^2.5.0"
jsonschema = "^4.21.1"
pyyaml = ">= 5.3.1, < 7.0"

27 changes: 21 additions & 6 deletions tests/instance_launcher.py
@@ -68,18 +68,32 @@ def __init__(
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)

    def launch(self, launch_parameters: LaunchParameters) -> LaunchResult:
    def launch(self, *, launch_parameters: LaunchParameters) -> LaunchResult:
        assert launch_parameters
        assert launch_parameters.project_id == TEST_PROJECT_ID
        assert launch_parameters.specification
        assert isinstance(launch_parameters.specification, dict)

        os.makedirs(EXECUTION_DIRECTORY, exist_ok=True)

        # Create an Instance record (and dummy Task ID)
        response = self._api_adapter.create_instance(
            running_workflow_step_id=launch_parameters.running_workflow_step_id
        # Create a running workflow step
        assert launch_parameters.running_workflow_id
        assert launch_parameters.step_name
        response, _ = self._api_adapter.create_running_workflow_step(
            running_workflow_id=launch_parameters.running_workflow_id,
            step=launch_parameters.step_name,
            replica=launch_parameters.step_replication_number,
        )
        assert "id" in response
        rwfs_id: str = response["id"]
        # And add the variables we've been provided with
        if launch_parameters.variables:
            _ = self._api_adapter.set_running_workflow_step_variables(
                running_workflow_step_id=rwfs_id, variables=launch_parameters.variables
            )

        # Create an Instance record (and dummy Task ID)
        response = self._api_adapter.create_instance(running_workflow_step_id=rwfs_id)
        instance_id = response["id"]
        task_id = "task-00000000-0000-0000-0000-000000000001"

@@ -96,8 +110,8 @@ def launch(self, launch_parameters: LaunchParameters) -> LaunchResult:
        # The command may not need any, but we do the decoding anyway.
        decoded_command, status = job_decoder.decode(
            job["command"],
            launch_parameters.specification_variables,
            launch_parameters.running_workflow_step_id,
            launch_parameters.variables,
            rwfs_id,
            TextEncoding.JINJA2_3_0,
        )
        print(f"Decoded command: {decoded_command}")
@@ -129,6 +143,7 @@ def launch(self, launch_parameters: LaunchParameters) -> LaunchResult:
        self._msg_dispatcher.send(pod_message)

        return LaunchResult(
            running_workflow_step_id=rwfs_id,
            instance_id=instance_id,
            task_id=task_id,
            command=" ".join(subprocess_cmd),
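For orientation, a minimal sketch of how a caller might drive the test launcher's new keyword-only launch(). The field names are the ones referenced in the diff above; the keyword-style LaunchParameters constructor, the example values and the launcher variable are assumptions for illustration, not code from this PR.

# Sketch only: field names come from tests/instance_launcher.py; the
# LaunchParameters constructor style and all values are assumptions.
launch_parameters = LaunchParameters(
    project_id=TEST_PROJECT_ID,
    running_workflow_id="r-workflow-00000000-0000-0000-0000-000000000001",  # hypothetical
    step_name="step-1",
    step_replication_number=0,
    variables={"inputFile": "input.smi"},
    specification={"job": "splitsmiles"},  # hypothetical specification content
)
result = launcher.launch(launch_parameters=launch_parameters)  # keyword-only after this change
assert result.running_workflow_step_id and result.instance_id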
11 changes: 11 additions & 0 deletions tests/job-definitions/job-definitions.yaml
@@ -132,3 +132,14 @@ jobs:
  concatenate:
    command: >-
      concatenate.py {% for ifile in inputFile %}{{ ifile }} {% endfor %} --outputFile {{ outputFile }}

  splitsmiles:
    command: >-
      copyf.py {{ inputFile }}
    # Simulate multiple output files...
    variables:
      outputs:
        properties:
          outputBase:
            creates: '{{ outputBase }}_*.smi'
            type: files
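The splitsmiles entry declares its outputs with a glob-style creates pattern rather than a fixed filename. Below is a small, self-contained Python sketch of how such a rendered pattern could be expanded after the job has run; the directory, outputBase value and file names are hypothetical.

from pathlib import Path

# Hypothetical post-run check: expand the rendered 'creates' pattern
# ('{{ outputBase }}_*.smi') and list whatever chunk files were produced.
output_base = "chunk"                       # assumed value of the outputBase variable
execution_directory = Path("project-root")  # placeholder for the job's working directory
produced = sorted(execution_directory.glob(f"{output_base}_*.smi"))
print([p.name for p in produced])           # e.g. ['chunk_1.smi', 'chunk_2.smi']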
30 changes: 30 additions & 0 deletions tests/jobs/copyf.py
@@ -0,0 +1,30 @@
import shutil
import sys
from pathlib import Path


def main():
    print("copyf job running")
    if len(sys.argv) != 2:
        print("Usage: python copyf.py <filename>")
        sys.exit(1)

    original_path = Path(sys.argv[1])

    if not original_path.exists() or not original_path.is_file():
        print(f"Error: '{original_path}' does not exist or is not a file.")
        sys.exit(1)

    # Copy the input twice, to 'chunk_1.smi' and 'chunk_2.smi' alongside the original
    new_name = original_path.absolute().parent.joinpath("chunk_1.smi")
    new_path = original_path.with_name(new_name.name)
    shutil.copyfile(original_path, new_path)

    new_name = original_path.absolute().parent.joinpath("chunk_2.smi")
    new_path = original_path.with_name(new_name.name)
    shutil.copyfile(original_path, new_path)


if __name__ == "__main__":
    main()
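A quick, self-contained way to exercise the script; it assumes it is run from the repository root, and the scratch directory and SMILES line are made up for illustration.

import subprocess
import sys
from pathlib import Path

# Run the copy job against a throwaway input file and check that the two
# simulated chunk outputs appear next to it (paths assume the repo root as CWD).
work = Path("scratch")
work.mkdir(exist_ok=True)
(work / "input.smi").write_text("CCO ethanol\n")
subprocess.run([sys.executable, "tests/jobs/copyf.py", str(work / "input.smi")], check=True)
assert (work / "chunk_1.smi").exists() and (work / "chunk_2.smi").exists()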
4 changes: 4 additions & 0 deletions tests/jobs/copyf.sh
@@ -0,0 +1,4 @@
#! /bin/bash

cp "$1" chunk_1.smi
cp "$1" chunk_2.smi
72 changes: 72 additions & 0 deletions tests/jobs/split-smi.sh
@@ -0,0 +1,72 @@
#!/bin/bash
set -euo pipefail

if [[ $# -lt 3 || $# -gt 4 ]]; then
    echo "Usage: $0 <input_file(.smi or .smi.gz)> <lines_per_file> <output_basename> [has_header: yes]"
    exit 1
fi

input_file="$1"
lines_per_file="$2"
base_name="$3"
has_header="${4:-no}"

# Determine how to read the file (plain text or gzipped)
if [[ "$input_file" == *.gz ]]; then
    reader="zcat"
else
    reader="cat"
fi

if ! [[ -f "$input_file" ]]; then
    echo "Error: File '$input_file' not found"
    exit 1
fi

# Extract header if present
if [[ "$has_header" == "yes" ]]; then
    header="$($reader "$input_file" | head -n1)"
    data_start=2
else
    header=""
    data_start=1
fi

# Count number of data lines (excluding header if present)
data_lines="$($reader "$input_file" | tail -n +"$data_start" | wc -l)"
if [[ "$data_lines" -eq 0 ]]; then
    echo "No data lines to process."
    exit 0
fi

# Calculate number of output files and required zero padding
num_files=$(( (data_lines + lines_per_file - 1) / lines_per_file ))
pad_width=0
if [[ "$num_files" -gt 1 ]]; then
    pad_width=${#num_files}
fi

# Split logic
$reader "$input_file" | tail -n +"$data_start" | awk -v header="$header" -v lines="$lines_per_file" -v base="$base_name" -v pad="$pad_width" '
function new_file() {
    suffix = (pad > 0) ? sprintf("%0*d", pad, file_index) : file_index
    file = base "_" suffix ".smi"
    if (header != "") {
        print header > file
    }
    file_index++
    line_count = 0
}
{
    if (line_count == 0) {
        new_file()
    }
    print >> file
    line_count++
    if (line_count == lines) {
        close(file)
        print file " created"
        line_count = 0
    }
}
' file_index=1
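The chunk-count and zero-padding arithmetic is the least obvious part of the script, so here is a small Python mirror of it with made-up numbers; the file names are simply what the script would produce for an output basename of 'chunks'.

import math

# Mirror of the script's arithmetic (illustrative only): 2500 data lines split
# into files of 1000 lines gives 3 chunks, padded to the width of '3' (1 digit).
data_lines = 2500
lines_per_file = 1000
num_files = math.ceil(data_lines / lines_per_file)         # 3
pad_width = len(str(num_files)) if num_files > 1 else 0    # 1
names = [f"chunks_{i:0{pad_width}d}.smi" for i in range(1, num_files + 1)]
print(names)  # ['chunks_1.smi', 'chunks_2.smi', 'chunks_3.smi']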