Skip to content

Commit 9a3f1b4

Browse files
authored
Merge pull request #152 from HiPCTProject/memory-downsample
Add function to get memory for downsampling
2 parents 29d869b + 909c5ef commit 9a3f1b4

File tree

5 files changed

+41
-7
lines changed

5 files changed

+41
-7
lines changed

docs/index.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,14 @@ This is easy to do with `virtual environments <https://docs.astral.sh/uv/pip/env
3535
Changelog
3636
---------
3737
See https://github.com/HiPCTProject/stack-to-chunk/releases for the list of tags and a changelog for each one.
38+
39+
Version 2
40+
~~~~~~~~~
41+
42+
Version 2 of ``stack-to-chunk`` is a major breaking release to add support for OME-Zarr version 0.5 (and therefore Zarr version 3).
43+
The following are major changes to the library:
44+
45+
- ``stack-to-chunk`` automatically adds sharding to the resulting data.
46+
See the guide page for more information on how this works.
47+
- ``memory_per_process`` has been renamed ``memory_per_slab_process``, as it calculates the memory required to copy a single slab of data.
48+
- A new function, ``memory_per_downsample_process``, has also been added to calculate the minimum memory required in a downsampling process.

docs/tutorial/tutorial.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@
9393
# without any downsampling. Before doing this let's do a quick check of how much memory
9494
# each process will take up when we run stack-to-chunk:
9595

96-
bytes_per_process = stack_to_chunk.memory_per_process(images, chunk_size=16)
96+
bytes_per_process = stack_to_chunk.memory_per_slab_process(images, chunk_size=16)
9797
print(f"Each process will use {bytes_per_process / 1e6:.1f} MB")
9898

9999

src/stack_to_chunk/__init__.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,20 @@
44
"SPATIAL_UNIT",
55
"MultiScaleGroup",
66
"__version__",
7-
"memory_per_process",
7+
"memory_per_downsample_process",
8+
"memory_per_slab_process",
89
"open_multiscale_group",
910
]
1011

1112
from loguru import logger
1213

1314
from ._version import __version__
14-
from .main import MultiScaleGroup, memory_per_process, open_multiscale_group
15+
from .main import (
16+
MultiScaleGroup,
17+
memory_per_downsample_process,
18+
memory_per_slab_process,
19+
open_multiscale_group,
20+
)
1521
from .ome_ngff import SPATIAL_UNIT
1622

1723
logger.disable("stack_to_chunk")

src/stack_to_chunk/main.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@
2323
DEFAULT_DIMENSION_NAMES = ("x", "y", "z")
2424

2525

26-
def memory_per_process(input_data: Array, *, chunk_size: int) -> int:
26+
def memory_per_slab_process(input_data: Array, *, chunk_size: int) -> int:
2727
"""
28-
The amount of memory each stack-to-chunk process will use (in bytes).
28+
The amount of memory each stack-to-chunk slab copying process will use (in bytes).
2929
3030
This is a lower bound on memory use, equal to the size of a slab of data with size
3131
(nx, ny, chunk_size), where (nx, ny) is the shape of a single input
@@ -35,6 +35,17 @@ def memory_per_process(input_data: Array, *, chunk_size: int) -> int:
3535
return int(input_data.shape[0] * input_data.shape[1] * itemsize * chunk_size)
3636

3737

38+
def memory_per_downsample_process(input_group: "MultiScaleGroup") -> int:
    """
    The amount of memory each stack-to-chunk downsampling process will use (in bytes).

    This is a lower bound on memory use.
    """
    # Full-resolution data lives at key "0" inside the multiscale group.
    full_res: zarr.Array = input_group._group["0"]  # noqa: SLF001
    # One downsampling process works on a slab whose z-extent is the shard size
    # along the third axis of the full-resolution array.
    slab_bytes = memory_per_slab_process(full_res, chunk_size=full_res.shards[2])
    # NOTE(review): 5/8 is the project's lower-bound scaling factor for a
    # downsample pass relative to a full slab copy — rationale not visible here.
    return math.ceil(5 * slab_bytes / 8)
47+
48+
3849
class MultiScaleGroup:
3950
"""
4051
A class for creating and interacting with a OME-Zarr multi-scale group.

src/stack_to_chunk/tests/test_main.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,12 @@
1313
import zarr.codecs
1414
from pydantic_zarr.v3 import ArraySpec, NamedConfig
1515

16-
from stack_to_chunk import MultiScaleGroup, memory_per_process, open_multiscale_group
16+
from stack_to_chunk import (
17+
MultiScaleGroup,
18+
memory_per_slab_process,
19+
open_multiscale_group,
20+
)
21+
from stack_to_chunk.main import memory_per_downsample_process
1722

1823

1924
def check_zattrs(zarr_path: Path, expected: dict[str, Any]) -> None:
@@ -153,7 +158,7 @@ def test_workflow(tmp_path: Path, arr: da.Array) -> None:
153158
},
154159
)
155160

156-
assert memory_per_process(arr, chunk_size=chunk_size) == 18282880
161+
assert memory_per_slab_process(arr, chunk_size=chunk_size) == 18282880
157162
group.add_full_res_data(
158163
arr,
159164
n_processes=1,
@@ -172,6 +177,7 @@ def test_workflow(tmp_path: Path, arr: da.Array) -> None:
172177
group = open_multiscale_group(zarr_path)
173178
assert group.levels == [0]
174179

180+
assert memory_per_downsample_process(group) == 11426800
175181
group.add_downsample_level(1, n_processes=2)
176182
assert group.levels == [0, 1]
177183

0 commit comments

Comments
 (0)