
Commit fce8f84

Integrate clustertools (#67)
* use cluster futures for all tools which used ParallelExecutor before
* fix redundant argument
* unify jobs and processes arg
* clean up code and fix test
* format
* add missing args
* remove explicit checking for --jobs parameter
* use v1.0 of cluster_tools
* update readme
* fix how args is None was handled
* reformat
* use namedtuple to mimic args
* reformat
* fix missing import
1 parent 9073703 commit fce8f84


9 files changed: +209 / -126 lines changed


README.md

Lines changed: 4 additions & 0 deletions
@@ -75,6 +75,10 @@ python -m wkcuber.compress --layer_name segmentation data/target data/target_com
 python -m wkcuber.metadata --name great_dataset --scale 11.24,11.24,25 data/target
 ```

+### Parallelization
+
+Most tasks can be configured to be executed in a parallelized manner. Via `--distribution_strategy` you can pass `multiprocessing` or `slurm`. The first can be further configured with `--jobs` and the latter via `--job_resources='{"mem": "10M"}'`. Use `--help` to get more information.
+
 ## Test data credits
 Excerpts for testing purposes have been sampled from:
 - Dow Jacobo Hossain Siletti Hudspeth (2018). **Connectomics of the zebrafish's lateral-line neuromast reveals wiring and miswiring in a simple microcircuit.** eLife. [DOI:10.7554/eLife.33988](https://elifesciences.org/articles/33988)
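
The new `--distribution_strategy`, `--jobs` and `--job_resources` flags are consumed by the `get_executor_for_args` helper in `wkcuber/utils.py`, whose diff is not shown on this page. A minimal sketch of how such a helper could map the flags onto cluster_tools executors (the body and keyword arguments below are assumptions, not the committed implementation):

```python
# Illustrative sketch only: the committed helper lives in wkcuber/utils.py,
# which is not part of this page. Names and kwargs below are assumptions.
import json

import cluster_tools


def get_executor_for_args(args):
    if args is None or getattr(args, "distribution_strategy", None) is None:
        # No CLI namespace available: fall back to a small local pool.
        return cluster_tools.get_executor("multiprocessing", max_workers=1)

    if args.distribution_strategy == "slurm":
        # --job_resources carries Slurm resources as a JSON string, e.g. '{"mem": "10M"}'.
        resources = json.loads(args.job_resources) if args.job_resources else {}
        return cluster_tools.get_executor("slurm", job_resources=resources)

    # Default: local multiprocessing, sized by --jobs.
    return cluster_tools.get_executor("multiprocessing", max_workers=int(args.jobs))
```

Returning a context manager keeps the call sites uniform: every tool can simply write `with get_executor_for_args(args) as executor:` regardless of the chosen strategy.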

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -6,3 +6,4 @@ pytest
 wkw>=0.0.6
 requests
 black
+git+git://github.com/scalableminds/[email protected]#egg=cluster_tools
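
The new dependency pins the v1.0 tag of scalableminds' cluster_tools, which provides `concurrent.futures`-style executors for multiprocessing and Slurm. A rough usage sketch, adapted from that library's README (treat the exact signatures as an assumption for v1.0):

```python
import cluster_tools


def square(n):
    return n * n


if __name__ == "__main__":
    # Valid strategies include "multiprocessing", "slurm" and "sequential".
    with cluster_tools.get_executor("multiprocessing") as executor:
        futures = [executor.submit(square, n) for n in (2, 3, 4)]
        # Futures follow the concurrent.futures API: .result() blocks until the
        # job has finished and re-raises any exception from the worker.
        print([f.result() for f in futures])
```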

wkcuber/__main__.py

Lines changed: 4 additions & 3 deletions
@@ -6,7 +6,7 @@
 from .downsampling import downsample_mags, DEFAULT_EDGE_LEN
 from .compress import compress_mag_inplace
 from .metadata import write_webknossos_metadata
-from .utils import add_verbose_flag, add_jobs_flag
+from .utils import add_verbose_flag, add_distribution_flags
 from .mag import Mag


@@ -66,7 +66,7 @@ def create_parser():
     )

     add_verbose_flag(parser)
-    add_jobs_flag(parser)
+    add_distribution_flags(parser)

     return parser

@@ -84,10 +84,11 @@ def create_parser():
         args.dtype,
         args.batch_size,
         args.jobs,
+        args,
     )

     if not args.no_compress:
-        compress_mag_inplace(args.target_path, args.layer_name, Mag(1), args.jobs)
+        compress_mag_inplace(args.target_path, args.layer_name, Mag(1), args.jobs, args)

     downsample_mags(
         args.target_path,
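
`add_jobs_flag` is replaced everywhere by `add_distribution_flags`, which is defined in `wkcuber/utils.py` and not part of this page. Based on the flags documented in the README change above, it plausibly registers something like the following; the exact names and defaults are assumptions:

```python
from argparse import ArgumentParser


# Hypothetical sketch of the helper from wkcuber/utils.py (not shown in this diff);
# flag names follow the README section added above, defaults are assumptions.
def add_distribution_flags(parser: ArgumentParser) -> None:
    parser.add_argument(
        "--jobs",
        "-j",
        help="Number of processes to use with the multiprocessing strategy",
        type=int,
        default=1,
    )
    parser.add_argument(
        "--distribution_strategy",
        help="Either 'multiprocessing' or 'slurm'",
        default="multiprocessing",
    )
    parser.add_argument(
        "--job_resources",
        help="JSON with resources for Slurm jobs, e.g. '{\"mem\": \"10M\"}'",
        default=None,
    )
```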

wkcuber/compress.py

Lines changed: 28 additions & 14 deletions
@@ -10,10 +10,11 @@

 from .utils import (
     add_verbose_flag,
-    add_jobs_flag,
     open_wkw,
     WkwDatasetInfo,
-    ParallelExecutor,
+    add_distribution_flags,
+    get_executor_for_args,
+    wait_and_ensure_success,
 )
 from .metadata import detect_resolutions
 from typing import List

@@ -44,8 +45,8 @@ def create_parser():
         "--mag", "-m", nargs="*", help="Magnification level", default=None
     )

-    add_jobs_flag(parser)
     add_verbose_flag(parser)
+    add_distribution_flags(parser)

     return parser

@@ -71,7 +72,7 @@ def compress_file_job(source_path, target_path):
         raise exc


-def compress_mag(source_path, layer_name, target_path, mag: Mag, jobs):
+def compress_mag(source_path, layer_name, target_path, mag: Mag, jobs, args=None):
     if path.exists(path.join(target_path, layer_name, str(mag))):
         logging.error("Target path '{}' already exists".format(target_path))
         exit(1)

@@ -80,18 +81,26 @@ def compress_mag(source_path, layer_name, target_path, mag: Mag, jobs):
     target_mag_path = path.join(target_path, layer_name, str(mag))
     logging.info("Compressing mag {0} in '{1}'".format(str(mag), target_mag_path))

-    with open_wkw(source_wkw_info) as source_wkw, ParallelExecutor(jobs) as pool:
+    with open_wkw(source_wkw_info) as source_wkw:
         source_wkw.compress(target_mag_path)
-        for file in source_wkw.list_files():
-            rel_file = path.relpath(file, source_wkw.root)
-            pool.submit(compress_file_job, file, path.join(target_mag_path, rel_file))
+        with get_executor_for_args(args) as executor:
+            futures = []
+            for file in source_wkw.list_files():
+                rel_file = path.relpath(file, source_wkw.root)
+                futures.append(
+                    executor.submit(
+                        compress_file_job, file, path.join(target_mag_path, rel_file)
+                    )
+                )

-    logging.info("Mag {0} succesfully compressed".format(str(mag)))
+            wait_and_ensure_success(futures)

+    logging.info("Mag {0} successfully compressed".format(str(mag)))

-def compress_mag_inplace(target_path, layer_name, mag: Mag, jobs):
+
+def compress_mag_inplace(target_path, layer_name, mag: Mag, jobs, args=None):
     compress_target_path = "{}.compress-{}".format(target_path, uuid4())
-    compress_mag(target_path, layer_name, compress_target_path, mag, jobs)
+    compress_mag(target_path, layer_name, compress_target_path, mag, jobs, args)

     shutil.rmtree(path.join(target_path, layer_name, str(mag)))
     shutil.move(

@@ -102,7 +111,7 @@ def compress_mag_inplace(target_path, layer_name, mag: Mag, jobs):


 def compress_mags(
-    source_path, layer_name, target_path=None, mags: List[Mag] = None, jobs=1
+    source_path, layer_name, target_path=None, mags: List[Mag] = None, jobs=1, args=None
 ):
     with_tmp_dir = target_path is None
     target_path = source_path + ".tmp" if with_tmp_dir else target_path

@@ -112,7 +121,7 @@ def compress_mags(
     mags.sort()

     for mag in mags:
-        compress_mag(source_path, layer_name, target_path, mag, jobs)
+        compress_mag(source_path, layer_name, target_path, mag, jobs, args)

     if with_tmp_dir:
         makedirs(path.join(source_path + ".bak", layer_name), exist_ok=True)

@@ -138,5 +147,10 @@ def compress_mags(
     if args.verbose:
         logging.basicConfig(level=logging.DEBUG)
     compress_mags(
-        args.source_path, args.layer_name, args.target_path, args.mag, int(args.jobs)
+        args.source_path,
+        args.layer_name,
+        args.target_path,
+        args.mag,
+        int(args.jobs),
+        args,
     )
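
Instead of relying on `ParallelExecutor` to block on completion, each tool now collects the futures returned by `executor.submit` and hands them to `wait_and_ensure_success` from `wkcuber/utils.py` (its diff is also not shown on this page). Conceptually the helper only needs to block on every future and surface worker failures; a minimal sketch under that assumption:

```python
# Assumed shape of the helper from wkcuber/utils.py (not shown in this diff).
def wait_and_ensure_success(futures):
    # Block until every submitted job is done; .result() re-raises any exception
    # raised in a worker, so a single failed job aborts the whole run.
    for future in futures:
        future.result()
```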

wkcuber/convert_knossos.py

Lines changed: 27 additions & 18 deletions
@@ -6,14 +6,15 @@
 from os import path

 from .utils import (
-    add_jobs_flag,
     add_verbose_flag,
     open_wkw,
     open_knossos,
     WkwDatasetInfo,
     KnossosDatasetInfo,
-    ParallelExecutor,
-    pool_get_lock,
+    ensure_wkw,
+    add_distribution_flags,
+    get_executor_for_args,
+    wait_and_ensure_success,
 )
 from .knossos import KnossosDataset, CUBE_EDGE_LEN

@@ -45,8 +46,8 @@ def create_parser():

     parser.add_argument("--mag", "-m", help="Magnification level", type=int, default=1)

-    add_jobs_flag(parser)
     add_verbose_flag(parser)
+    add_distribution_flags(parser)

     return parser

@@ -58,7 +59,7 @@ def convert_cube_job(cube_xyz, source_knossos_info, target_wkw_info):
     size = (CUBE_EDGE_LEN,) * 3

     with open_knossos(source_knossos_info) as source_knossos, open_wkw(
-        target_wkw_info, pool_get_lock()
+        target_wkw_info
     ) as target_wkw:
         cube_data = source_knossos.read(offset, size)
         target_wkw.write(offset, cube_data)

@@ -69,23 +70,30 @@ def convert_cube_job(cube_xyz, source_knossos_info, target_wkw_info):
     )


-def convert_knossos(source_path, target_path, layer_name, dtype, mag=1, jobs=1):
+def convert_knossos(
+    source_path, target_path, layer_name, dtype, mag=1, jobs=1, args=None
+):
     source_knossos_info = KnossosDatasetInfo(source_path, dtype)
     target_wkw_info = WkwDatasetInfo(target_path, layer_name, dtype, mag)

-    with open_knossos(source_knossos_info) as source_knossos, ParallelExecutor(
-        jobs
-    ) as pool:
-        knossos_cubes = list(source_knossos.list_cubes())
-        if len(knossos_cubes) == 0:
-            logging.error("No input KNOSSOS cubes found.")
-            exit(1)
+    ensure_wkw(target_wkw_info)

-        knossos_cubes.sort()
-        for cube_xyz in knossos_cubes:
-            pool.submit(
-                convert_cube_job, cube_xyz, source_knossos_info, target_wkw_info
-            )
+    with open_knossos(source_knossos_info) as source_knossos:
+        with get_executor_for_args(args) as executor:
+            knossos_cubes = list(source_knossos.list_cubes())
+            if len(knossos_cubes) == 0:
+                logging.error("No input KNOSSOS cubes found.")
+                exit(1)
+
+            knossos_cubes.sort()
+            futures = []
+            for cube_xyz in knossos_cubes:
+                futures.append(
+                    executor.submit(
+                        convert_cube_job, cube_xyz, source_knossos_info, target_wkw_info
+                    )
+                )
+            wait_and_ensure_success(futures)


 if __name__ == "__main__":

@@ -101,4 +109,5 @@ def convert_knossos(source_path, target_path, layer_name, dtype, mag=1, jobs=1):
         args.dtype,
         args.mag,
         args.jobs,
+        args,
     )
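
With `ParallelExecutor` and `pool_get_lock` gone, worker processes no longer share an in-process lock when opening the target dataset; instead `ensure_wkw` is called once up front so the wkw dataset already exists before any cube job runs, which also works when jobs run on other Slurm nodes. A hedged sketch of such a helper (the real definition sits in `wkcuber/utils.py` next to `open_wkw` and is not shown in this diff):

```python
# Assumed shape of the helper from wkcuber/utils.py (not shown in this diff);
# it reuses the open_wkw helper that this file already imports above.
def ensure_wkw(target_wkw_info, num_channels=1):
    # Opening the dataset once in the parent process creates it on disk, so the
    # worker jobs (possibly on other nodes) can open it without racing to
    # initialize it; this replaces the old pool_get_lock() coordination.
    with open_wkw(target_wkw_info, num_channels=num_channels):
        pass
```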

wkcuber/cubing.py

Lines changed: 28 additions & 17 deletions
@@ -3,16 +3,18 @@
 import numpy as np
 from argparse import ArgumentParser
 from os import path
+import cluster_tools

 from .utils import (
     get_chunks,
     find_files,
     add_verbose_flag,
-    add_jobs_flag,
     open_wkw,
+    ensure_wkw,
     WkwDatasetInfo,
-    ParallelExecutor,
-    pool_get_lock,
+    add_distribution_flags,
+    get_executor_for_args,
+    wait_and_ensure_success,
 )
 from .image_readers import image_reader

@@ -51,7 +53,7 @@ def create_parser():
     )

     add_verbose_flag(parser)
-    add_jobs_flag(parser)
+    add_distribution_flags(parser)

     return parser

@@ -91,9 +93,7 @@ def cubing_job(
     if len(z_batches) == 0:
         return

-    with open_wkw(
-        target_wkw_info, pool_get_lock(), num_channels=num_channels
-    ) as target_wkw:
+    with open_wkw(target_wkw_info, num_channels=num_channels) as target_wkw:
         # Iterate over batches of continuous z sections
         # The batches have a maximum size of `batch_size`
         # Batched iterations allows to utilize IO more efficiently

@@ -144,7 +144,9 @@ def cubing_job(
         raise exc


-def cubing(source_path, target_path, layer_name, dtype, batch_size, jobs) -> dict:
+def cubing(
+    source_path, target_path, layer_name, dtype, batch_size, jobs, args=None
+) -> dict:

     target_wkw_info = WkwDatasetInfo(target_path, layer_name, dtype, 1)
     source_files = find_source_filenames(source_path)

@@ -155,23 +157,31 @@ def cubing(source_path, target_path, layer_name, dtype, batch_size, jobs) -> dict:
     num_z = len(source_files)

     logging.info("Found source files: count={} size={}x{}".format(num_z, num_x, num_y))
-    with ParallelExecutor(jobs) as pool:
+
+    ensure_wkw(target_wkw_info, num_channels=num_channels)
+
+    with get_executor_for_args(args) as executor:
+        futures = []
         # We iterate over all z sections
         for z in range(0, num_z, BLOCK_LEN):
             # Prepare z batches
             max_z = min(num_z, z + BLOCK_LEN)
             z_batch = list(range(z, max_z))
             # Execute
-            pool.submit(
-                cubing_job,
-                target_wkw_info,
-                z_batch,
-                source_files[z:max_z],
-                batch_size,
-                (num_x, num_y),
-                num_channels,
+            futures.append(
+                executor.submit(
+                    cubing_job,
+                    target_wkw_info,
+                    z_batch,
+                    source_files[z:max_z],
+                    batch_size,
+                    (num_x, num_y),
+                    num_channels,
+                )
             )

+        wait_and_ensure_success(futures)
+
     # Return Bounding Box
     return {"topLeft": [0, 0, 0], "width": num_x, "height": num_y, "depth": num_z}

@@ -189,4 +199,5 @@ def cubing(source_path, target_path, layer_name, dtype, batch_size, jobs) -> dict:
         args.dtype,
         args.batch_size,
         args.jobs,
+        args=args,
     )
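
Every entry point now forwards the parsed `args` namespace into the worker-pool setup. When one of the tools is driven from Python rather than the CLI, the commit message's "use namedtuple to mimic args" suggests building a small stand-in namespace by hand; a hedged example (paths and field names are purely illustrative):

```python
from collections import namedtuple

from wkcuber.cubing import cubing

# Hypothetical stand-in for the argparse namespace; the field names mirror the
# CLI flags introduced in this commit and the paths are purely illustrative.
FakeArgs = namedtuple("FakeArgs", ["distribution_strategy", "jobs", "job_resources"])

if __name__ == "__main__":
    args = FakeArgs(distribution_strategy="multiprocessing", jobs=4, job_resources=None)
    bounding_box = cubing(
        "data/source_images",
        "data/target",
        "color",
        "uint8",
        batch_size=32,
        jobs=args.jobs,
        args=args,
    )
    print(bounding_box)
```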
