debugged advanced execution tutorial

tclose · tclose · commit 6f60ae154ff1 · 2025-02-14T09:03:08.000+11:00
diff --git a/new-docs/source/tutorial/2-advanced-execution.ipynb b/new-docs/source/tutorial/2-advanced-execution.ipynb
@@ -11,7 +11,10 @@
     "executed (e.g. on the cloud, on a HPC cluster, ...). This tutorial steps you through\n",
     "some of the available options for executing a task.\n",
     "\n",
-    "[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/nipype/pydra-tutorial/develop/notebooks/tutorial/advanced_execution.ipynb)"
+    "[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/nipype/pydra-tutorial/develop/notebooks/tutorial/advanced_execution.ipynb)\n",
+    "\n",
+    "Remember that before attempting to run multi-process code in Jupyter notebooks, the\n",
+    "following snippet must be called"
    ]
   },
   {
@@ -30,20 +33,8 @@
    "source": [
     "## Submitter\n",
     "\n",
-    "If you want to access a richer `Result` object you can use a Submitter object to execute the following task"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from pydra.design import python\n",
-    "\n",
-    "@python.define\n",
-    "def TenToThePower(p: int) -> int:\n",
-    "    return 10 ** p"
+    "If you want to access a richer `Result` object you can use a Submitter object to initiate\n",
+    "the task execution. For example, using the `TenToThePower` task from the testing package"
    ]
   },
   {
@@ -53,6 +44,8 @@
    "outputs": [],
    "source": [
     "from pydra.engine.submitter import Submitter\n",
+    "from pydra.tasks.testing import TenToThePower\n",
+    "\n",
     "\n",
     "ten_to_the_power = TenToThePower(p=3)\n",
     "\n",
@@ -110,7 +103,11 @@
     "class itself. Additional parameters can be passed to the worker initialisation as keyword\n",
     "arguments to the execution call. For example, if we wanted to run five tasks using the\n",
     "ConcurentFutures worker but only use three CPUs, we can pass `n_procs=3` to the execution\n",
-    "call."
+    "call.\n",
+    "\n",
+    "Remember that when calling multi-process code in a top level script the call must be\n",
+    "enclosed within an `if __name__ == \"__main__\"` block to allow the worker processes to\n",
+    "import the module without re-executing it."
    ]
   },
   {
@@ -119,14 +116,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from pydra.design import python\n",
+    "import tempfile\n",
+    "\n",
+    "cache_root = tempfile.mkdtemp()\n",
     "\n",
     "if __name__ == \"__main__\":\n",
     "\n",
     "    ten_to_the_power = TenToThePower().split(p=[1, 2, 3, 4, 5])\n",
     "\n",
     "    # Run the 5 tasks in parallel split across 3 processes\n",
-    "    outputs = ten_to_the_power(worker=\"cf\", n_procs=3)\n",
+    "    outputs = ten_to_the_power(worker=\"cf\", n_procs=3, cache_dir=cache_root)\n",
     "\n",
     "    p1, p2, p3, p4, p5 = outputs.out\n",
     "\n",
@@ -168,7 +167,9 @@
     "as long as exactly the hashes of the inputs provided to the task are the same. Here we\n",
     "go through some of the practicalities of this caching and hashing (see\n",
     "[Caches and hashes](../explanation/hashing-caching.html) for more details and issues\n",
-    "to consider)."
+    "to consider).\n",
+    "\n",
+    "First we import the functions and classes we need and create some sample NIfTI files to work with"
    ]
   },
   {
@@ -179,37 +180,18 @@
    "source": [
     "from pathlib import Path\n",
     "import tempfile\n",
+    "from pprint import pprint\n",
     "from fileformats.medimage import Nifti1\n",
     "from pydra.engine.submitter import Submitter\n",
     "from pydra.tasks.mrtrix3.v3_0 import MrGrid\n",
     "\n",
-    "# Make directory filled with nifti files\n",
+    "# Make a temporary directory\n",
     "test_dir = Path(tempfile.mkdtemp())\n",
     "nifti_dir = test_dir / \"nifti\"\n",
     "nifti_dir.mkdir()\n",
-    "for i in range(10):\n",
-    "    Nifti1.sample(nifti_dir, seed=i)\n",
-    "\n",
-    "# Instantiate the task definition, \"splitting\" over all NIfTI files in the test directory\n",
-    "# by splitting the \"input\" input field over all files in the directory\n",
-    "mrgrid = MrGrid(operation=\"regrid\", voxel=(0.5, 0.5, 0.5)).split(\n",
-    "    in_file=nifti_dir.iterdir()\n",
-    ")\n",
-    "\n",
-    "# Run the task to resample all NIfTI files\n",
-    "outputs = mrgrid()\n",
-    "\n",
-    "# Create a new custom directory\n",
-    "cache_dir = test_dir / \"cache\"\n",
-    "cache_dir.mkdir()\n",
-    "\n",
-    "submitter = Submitter(cache_dir=cache_dir)\n",
-    "\n",
-    "# Run the task to resample all NIfTI files with different voxel sizes\n",
-    "with submitter:\n",
-    "    result1 = submitter(mrgrid)\n",
     "\n",
-    "print(result1)"
+    "# Generate some random NIfTI files to work with\n",
+    "nifti_files = [Nifti1.sample(nifti_dir, seed=i) for i in range(10)]"
    ]
   },
   {
@@ -243,21 +225,20 @@
     "\n",
     "mrgrid_varying_vox = MrGrid(operation=\"regrid\").split(\n",
     "    (\"in_file\", \"voxel\"),\n",
-    "    in_file=nifti_dir.iterdir(),\n",
+    "    in_file=nifti_files,\n",
     "    voxel=VOX_SIZES,\n",
     ")\n",
     "\n",
     "submitter = Submitter(cache_dir=test_dir / \"cache\")\n",
     "\n",
     "\n",
-    "# Result from previous run is reused as the task and inputs are identical\n",
     "with submitter:\n",
     "    result1 = submitter(mrgrid_varying_vox)\n",
     "\n",
     "\n",
     "mrgrid_varying_vox2 = MrGrid(operation=\"regrid\").split(\n",
     "    (\"in_file\", \"voxel\"),\n",
-    "    in_file=nifti_dir.iterdir(),\n",
+    "    in_file=nifti_files,\n",
     "    voxel=copy(VOX_SIZES),\n",
     ")\n",
     "\n",
@@ -298,30 +279,30 @@
    "outputs": [],
    "source": [
     "# Rename a NIfTI file within the test directory\n",
-    "first_file = next(nifti_dir.iterdir())\n",
-    "new_name = first_file.with_name(\"first.nii\")\n",
-    "first_file.rename(new_name)\n",
+    "nifti_files[0] = Nifti1(\n",
+    "    nifti_files[0].fspath.rename(nifti_files[0].fspath.with_name(\"first.nii\"))\n",
+    ")\n",
     "\n",
     "mrgrid_varying_vox3 = MrGrid(operation=\"regrid\").split(\n",
     "    (\"in_file\", \"voxel\"),\n",
-    "    in_file=nifti_dir.iterdir(),\n",
+    "    in_file=nifti_files,\n",
     "    voxel=VOX_SIZES,\n",
     ")\n",
     "\n",
-    "# Result from previous run is reused as the task and inputs are identical\n",
+    "# Result from previous run is reused as contents of the files have not changed, despite\n",
+    "# the file names changing\n",
     "with submitter:\n",
-    "    result3 = submitter(mrgrid_varying_vox3)\n",
+    "    result4 = submitter(mrgrid_varying_vox3)\n",
     "\n",
-    "assert result3.output_dir == result1.output_dir\n",
+    "assert result4.output_dir == result1.output_dir\n",
     "\n",
     "# Replace the first NIfTI file with a new file\n",
-    "new_name.unlink()\n",
-    "Nifti1.sample(nifti_dir, seed=100)\n",
+    "nifti_files[0] = Nifti1.sample(nifti_dir, seed=100)\n",
     "\n",
     "# Update the in_file input field to include the new file\n",
     "mrgrid_varying_vox4 = MrGrid(operation=\"regrid\").split(\n",
     "    (\"in_file\", \"voxel\"),\n",
-    "    in_file=nifti_dir.iterdir(),\n",
+    "    in_file=nifti_files,\n",
     "    voxel=VOX_SIZES,\n",
     ")\n",
     "\n",
@@ -333,19 +314,14 @@
     "assert result4.output_dir != result1.output_dir"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": []
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Environments\n",
+    "## Environments and hooks\n",
     "\n",
     "For shell tasks, it is possible to specify that the command runs within a specific\n",
-    "software environment, such as those provided by software containers (e.g. Docker or Apptainer).\n",
+    "software environment, such as those provided by software containers (e.g. Docker or Singularity/Apptainer).\n",
     "This is down by providing the environment to the submitter/execution call,"
    ]
   },
@@ -371,7 +347,7 @@
     "outputs = mrgrid(environment=Docker(image=\"mrtrix3/mrtrix3\", tag=\"latest\"))\n",
     "\n",
     "# Print the locations of the output files\n",
-    "print(\"\\n\".join(str(p) for p in outputs.out_file))"
+    "pprint(outputs.out_file)"
    ]
   },
   {
@@ -381,31 +357,61 @@
     "Of course for this to work Docker needs to work and be configured for\n",
     "[sudo-less execution](https://docs.docker.com/engine/install/linux-postinstall/).\n",
     "See [Containers and Environments](../explanation/environments.rst) for more details on\n",
-    "how to utilise containers and add support for other software environments."
+    "how to utilise containers and add support for other software environments.\n",
+    "\n",
+    "It is also possible to specify functions to run at hooks that are immediately before and after\n",
+    "the task is executed by passing a `pydra.engine.specs.TaskHooks` object to the `hooks`\n",
+    "keyword arg. The callable should take the `pydra.engine.core.Task` object as its first\n",
+    "argument (`post_run` hooks also receive the `Result`) and return None. The available hooks to attach functions are:\n",
+    "\n",
+    "* pre_run: before the task cache directory is created\n",
+    "* pre_run_task: after the cache directory has been created and the inputs resolved but before the task is executed\n",
+    "* post_run_task: after the task has been run and the outputs collected\n",
+    "* post_run: after the cache directory has been finalised\n",
+    "\n",
+    "\n",
+    "QUESTION: What are these hooks intended for? Should the post_run_task hook be run before the outputs have been\n",
+    "collected?"
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
-    "## Provenance and auditing\n",
+    "from pydra.engine.core import Task\n",
+    "from pydra.engine.specs import TaskHooks, Result\n",
+    "import os\n",
+    "import platform\n",
+    "\n",
+    "def notify_task_completion(task: Task, result: Result):\n",
+    "    # Print a message to the terminal\n",
+    "    print(f\"Task completed! Results are stored in {str(task.output_dir)!r}\")\n",
+    "\n",
+    "    # Platform-specific notifications\n",
+    "    if platform.system() == \"Darwin\":  # macOS\n",
+    "        os.system('osascript -e \\'display notification \"Task has completed successfully!\" with title \"Task Notification\"\\'')\n",
+    "    elif platform.system() == \"Linux\":  # Linux\n",
+    "        os.system('notify-send \"Task Notification\" \"Task has completed successfully!\"')\n",
+    "    elif platform.system() == \"Windows\":  # Windows\n",
+    "        os.system('msg * \"Task has completed successfully!\"')\n",
     "\n",
-    "Work in progress..."
+    "# Run the task to resample all NIfTI files\n",
+    "outputs = mrgrid(hooks=TaskHooks(post_run=notify_task_completion), cache_dir=tempfile.mkdtemp())\n",
+    "\n",
+    "# Print the locations of the output files\n",
+    "pprint(outputs.out_file)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Hooks\n",
+    "## Provenance and auditing\n",
     "\n",
     "Work in progress..."
    ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": []
   }
  ],
  "metadata": {
diff --git a/new-docs/source/tutorial/tst.py b/new-docs/source/tutorial/tst.py
@@ -1,47 +1,21 @@
-from pathlib import Path
-from tempfile import mkdtemp
-from pprint import pprint
-import json
-from pydra.utils.hash import hash_function
-from pydra.tasks.mrtrix3.v3_0 import MrGrid
-from fileformats.medimage import Nifti1
-
-JSON_CONTENTS = {"a": True, "b": "two", "c": 3, "d": [7, 0.55, 6]}
-
-test_dir = Path(mkdtemp())
-cache_root = Path(mkdtemp())
-json_file = test_dir / "test.json"
-with open(json_file, "w") as f:
-    json.dump(JSON_CONTENTS, f)
-
-nifti_dir = test_dir / "nifti"
-nifti_dir.mkdir()
-
-for i in range(10):
-    Nifti1.sample(nifti_dir, seed=i)  # Create a dummy NIfTI file in the dest. directory
-
-niftis = list(nifti_dir.iterdir())
-pprint([hash_function(nifti) for nifti in niftis])
-
-mrgrid_varying_vox_sizes = MrGrid(operation="regrid").split(
-    ("in_file", "voxel"),
-    in_file=niftis,
-    # Define a list of voxel sizes to resample the NIfTI files to,
-    # the list must be the same length as the list of NIfTI files
-    voxel=[
-        (1.0, 1.0, 1.0),
-        (1.0, 1.0, 1.0),
-        (1.0, 1.0, 1.0),
-        (0.5, 0.5, 0.5),
-        (0.75, 0.75, 0.75),
-        (0.5, 0.5, 0.5),
-        (0.5, 0.5, 0.5),
-        (1.0, 1.0, 1.0),
-        (1.25, 1.25, 1.25),
-        (1.25, 1.25, 1.25),
-    ],
-)
-
-outputs = mrgrid_varying_vox_sizes(cache_dir=cache_root)
-
-pprint(outputs.out_file)
+from pydra.design import python
+import shutil
+
+
+@python.define
+def TenToThePower(p: int) -> int:  # returns 10 raised to the power ``p``
+    return 10**p
+
+
+if __name__ == "__main__":
+
+    shutil.rmtree("/Users/tclose/Library/Caches/pydra/0.25.dev190+g6a726571/run-cache")
+
+    ten_to_the_power = TenToThePower().split(p=[1, 2, 3, 4, 5])
+
+    # Run the 5 tasks in parallel split across 3 processes
+    outputs = ten_to_the_power(worker="cf", n_procs=3)
+
+    p1, p2, p3, p4, p5 = outputs.out
+
+    print(f"10^5 = {p5}")
diff --git a/pydra/engine/specs.py b/pydra/engine/specs.py
@@ -261,7 +261,7 @@ def __call__(
             raise
         if result.errored:
             if isinstance(self, WorkflowDef) or self._splitter:
-                raise RuntimeError(f"Workflow {self} failed with errors:")
+                raise RuntimeError(f"Workflow {self} failed with errors")
             else:
                 errors = result.errors
                 raise RuntimeError(
diff --git a/pydra/engine/submitter.py b/pydra/engine/submitter.py
@@ -114,9 +114,9 @@ def __init__(
         )
         if cache_dir is None:
             cache_dir = default_run_cache_dir
-            cache_dir.mkdir(parents=True, exist_ok=True)
-        elif not cache_dir.exists():
-            raise ValueError(f"Cache directory {str(cache_dir)!r} does not exist")
+        cache_dir = Path(cache_dir).resolve()
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
         self.cache_dir = cache_dir
         self.cache_locations = cache_locations
         self.environment = environment
diff --git a/pydra/tasks/testing/__init__.py b/pydra/tasks/testing/__init__.py
@@ -75,3 +75,8 @@ def SafeDivisionWorkflow(a: float, b: float, denominator: float) -> float:
     divide = workflow.add(SafeDivide(x=add.out, y=denominator))
     subtract = workflow.add(Subtract(x=divide.out, y=b))
     return subtract.out
+
+
+@python.define
+def TenToThePower(p: int) -> int:  # returns 10 raised to the power ``p``
+    return 10**p