added avoid_clashes option to copy_nested_files to ensure names don't clash when copying files with the same name from multiple node outputs

tclose · tclose · commit 128a3e9bb81a · 2025-02-13T21:40:45.000+11:00
diff --git a/new-docs/source/tutorial/1-getting-started.ipynb b/new-docs/source/tutorial/1-getting-started.ipynb
@@ -30,6 +30,7 @@
    "source": [
     "from pathlib import Path\n",
     "from tempfile import mkdtemp\n",
+    "from pprint import pprint\n",
     "import json\n",
     "\n",
     "JSON_CONTENTS = {'a': True, 'b': 'two', 'c': 3, 'd': [7, 0.55, 6]}\n",
@@ -160,7 +161,7 @@
     "outputs = mrgrid()\n",
     "\n",
     "# Print the locations of the output files\n",
-    "print(\"\\n\".join(str(p) for p in outputs.out_file))"
+    "pprint(outputs.out_file)"
    ]
   },
   {
@@ -184,8 +185,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "\n",
-    "\n",
     "mrgrid_varying_vox_sizes = MrGrid(operation=\"regrid\").split(\n",
     "    (\"in_file\", \"voxel\"),\n",
     "    in_file=nifti_dir.iterdir(),\n",
@@ -205,7 +204,9 @@
     "    ],\n",
     ")\n",
     "\n",
-    "print(\"\\n\".join(str(p) for p in outputs.out_file))"
+    "outputs = mrgrid_varying_vox_sizes()\n",
+    "\n",
+    "pprint(outputs.out_file)"
    ]
   },
   {
@@ -277,7 +278,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "outputs = mrgrid(cache_root=Path(\"~/pydra-cache\").expanduser())"
+    "outputs = mrgrid(cache_dir=Path(\"~/pydra-cache\").expanduser())\n",
+    "\n",
+    "pprint(outputs)"
    ]
   },
   {
@@ -296,7 +299,7 @@
    "source": [
     "from pydra.utils import default_run_cache_dir\n",
     "\n",
-    "my_cache_dir = Path(\"~/pydra-cache\").expanduser()\n",
+    "my_cache_dir = Path(\"~/new-pydra-cache\").expanduser()\n",
     "my_cache_dir.mkdir(exist_ok=True)\n",
     "\n",
     "outputs = mrgrid(\n",
diff --git a/new-docs/source/tutorial/tst.py b/new-docs/source/tutorial/tst.py
@@ -1,25 +1,47 @@
-from pydra.tasks.testing import UnsafeDivisionWorkflow
-from pydra.engine.submitter import Submitter
+from pathlib import Path
+from tempfile import mkdtemp
+from pprint import pprint
+import json
+from pydra.utils.hash import hash_function
+from pydra.tasks.mrtrix3.v3_0 import MrGrid
+from fileformats.medimage import Nifti1
 
-# This workflow will fail because we are trying to divide by 0
-wf = UnsafeDivisionWorkflow(a=10, b=5, denominator=2)
+JSON_CONTENTS = {"a": True, "b": "two", "c": 3, "d": [7, 0.55, 6]}
 
-if __name__ == "__main__":
-    with Submitter(worker="cf", rerun=True) as sub:
-        result = sub(wf)
+test_dir = Path(mkdtemp())
+cache_root = Path(mkdtemp())
+json_file = test_dir / "test.json"
+with open(json_file, "w") as f:
+    json.dump(JSON_CONTENTS, f)
 
+nifti_dir = test_dir / "nifti"
+nifti_dir.mkdir()
 
-# from pydra.tasks.testing import UnsafeDivisionWorkflow
-# from pydra.engine.submitter import Submitter
+for i in range(10):
+    Nifti1.sample(nifti_dir, seed=i)  # Create a dummy NIfTI file in the dest. directory
 
-# # This workflow will fail because we are trying to divide by 0
-# failing_workflow = UnsafeDivisionWorkflow(a=10, b=5).split(denominator=[3, 2, 0])
+niftis = list(nifti_dir.iterdir())
+pprint([hash_function(nifti) for nifti in niftis])
 
-# if __name__ == "__main__":
-#     with Submitter(worker="cf") as sub:
-#         result = sub(failing_workflow)
+mrgrid_varying_vox_sizes = MrGrid(operation="regrid").split(
+    ("in_file", "voxel"),
+    in_file=niftis,
+    # Define a list of voxel sizes to resample the NIfTI files to,
+    # the list must be the same length as the list of NIfTI files
+    voxel=[
+        (1.0, 1.0, 1.0),
+        (1.0, 1.0, 1.0),
+        (1.0, 1.0, 1.0),
+        (0.5, 0.5, 0.5),
+        (0.75, 0.75, 0.75),
+        (0.5, 0.5, 0.5),
+        (0.5, 0.5, 0.5),
+        (1.0, 1.0, 1.0),
+        (1.25, 1.25, 1.25),
+        (1.25, 1.25, 1.25),
+    ],
+)
 
-#     if result.errored:
-#         print("Workflow failed with errors:\n" + str(result.errors))
-#     else:
-#         print("Workflow completed successfully :)")
+outputs = mrgrid_varying_vox_sizes(cache_dir=cache_root)
+
+pprint(outputs.out_file)
diff --git a/pydra/engine/helpers_file.py b/pydra/engine/helpers_file.py
@@ -9,7 +9,7 @@
 import subprocess as sp
 from contextlib import contextmanager
 import attr
-from fileformats.core import FileSet
+from fileformats.generic import FileSet
 from pydra.engine.helpers import is_lazy, attrs_values, list_fields
 
 
@@ -77,6 +77,10 @@ def copy_nested_files(
 
     cache: ty.Dict[FileSet, FileSet] = {}
 
+    # Set to keep track of file paths that have already been copied
+    # to allow FileSet.copy to avoid name clashes
+    clashes_to_avoid = set()
+
     def copy_fileset(fileset: FileSet):
         try:
             return cache[fileset]
@@ -89,7 +93,15 @@ def copy_fileset(fileset: FileSet):
             MountIndentifier.on_same_mount(p, dest_dir) for p in fileset.fspaths
         ):
             supported -= FileSet.CopyMode.hardlink
-        copied = fileset.copy(dest_dir=dest_dir, supported_modes=supported, **kwargs)
+        # Default to the shared clash-tracking set, which prevents fname clashes
+        # between filesets; any explicitly passed kwargs are merged on top
+        cp_kwargs = {"avoid_clashes": clashes_to_avoid}
+        cp_kwargs.update(kwargs)
+        copied = fileset.copy(
+            dest_dir=dest_dir,
+            supported_modes=supported,
+            **cp_kwargs,
+        )
         cache[fileset] = copied
         return copied