Skip to content

Commit d0a0bdf

Browse files
committed
Cleanup, mocking, and relocation of autochunking
1 parent 78fe55d commit d0a0bdf

File tree

3 files changed

+50
-160
lines changed

3 files changed

+50
-160
lines changed

src/mdio/converters/segy.py

Lines changed: 1 addition & 154 deletions
Original file line numberDiff line numberDiff line change
@@ -17,20 +17,17 @@
1717
from segy.schema import HeaderField
1818

1919
from mdio.api.io_utils import process_url
20-
from mdio.constants import INT32_MAX
2120
from mdio.converters.exceptions import EnvironmentFormatError
2221
from mdio.converters.exceptions import GridTraceCountError
2322
from mdio.converters.exceptions import GridTraceSparsityError
2423
from mdio.core import Grid
24+
from mdio.core.grid import _calculate_live_mask_chunksize
2525
from mdio.core.utils_write import write_attribute
2626
from mdio.segy import blocked_io
2727
from mdio.segy.compat import mdio_segy_spec
2828
from mdio.segy.helpers_segy import create_zarr_hierarchy
2929
from mdio.segy.utilities import get_grid_plan
3030

31-
from dask.array.core import normalize_chunks
32-
from dask.array.rechunk import _balance_chunksizes
33-
3431
logger = logging.getLogger(__name__)
3532

3633
try:
@@ -499,153 +496,3 @@ def segy_to_mdio( # noqa: C901
499496
)
500497

501498
zarr.consolidate_metadata(store_nocache)
502-
503-
504-
def _calculate_live_mask_chunksize(grid: Grid) -> Sequence[int]:
505-
"""Calculate the optimal chunksize for the live mask.
506-
507-
Args:
508-
grid: The grid to calculate the chunksize for.
509-
510-
Returns:
511-
A sequence of integers representing the optimal chunk size for each dimension
512-
of the grid.
513-
"""
514-
return _calculate_optimal_chunksize(grid.live_mask, INT32_MAX//4)
515-
516-
517-
def _calculate_optimal_chunksize( # noqa: C901
518-
volume: np.ndarray | zarr.Array, n_bytes: int
519-
) -> Sequence[int]:
520-
"""Calculate the optimal chunksize for an N-dimensional data volume.
521-
522-
Args:
523-
volume: The volume to calculate the chunksize for.
524-
n_bytes: The maximum allowed number of bytes per chunk.
525-
526-
Returns:
527-
A sequence of integers representing the optimal chunk size for each dimension
528-
of the grid.
529-
"""
530-
shape = volume.shape
531-
chunks = normalize_chunks(
532-
"auto",
533-
shape,
534-
dtype=volume.dtype,
535-
limit=n_bytes,
536-
)
537-
return tuple(_balance_chunksizes(chunk)[0] for chunk in chunks)
538-
539-
540-
541-
# 0. The product of the chunk dimensions multiplied by the element size does not
542-
# exceed n_bytes.
543-
# 1. The chunk shape is "regular" – each chunk dimension is a divisor of the
544-
# overall volume shape.
545-
# 2. If an exact match is impossible, the chunk shape chosen maximizes the number of
546-
# elements (minimizing the unused bytes).
547-
# 3. The computation is efficient.
548-
549-
# The computation efficiency is broken down as follows:
550-
551-
# - Divisor Computation: For each of the N dimensions (assume size ~ n), it checks
552-
# up to n numbers, so this part is roughly O(N * n).
553-
# For example, if you have a 3D array where each dimension is about 100,
554-
# it does around 3*100 = 300 steps.
555-
# - DFS Search: In the worst-case, the DFS explores about D choices per dimension
556-
# (D = average number of divisors) leading to O(D^N) combinations.
557-
# In practice, D is small (often < 10), so for a 2D array this is around 10^2
558-
# (about 100 combinations) and for a 3D array about 10^3 (roughly 1,000 combinations).
559-
# Since N is typically small (often <6), this exponential term behaves like a
560-
# constant factor.
561-
562-
# Args:
563-
# volume : np.ndarray | zarr.Array
564-
# An N-dimensional array-like object (e.g. np.ndarray or zarr array).
565-
# n_bytes : int
566-
# Maximum allowed number of bytes per chunk (>= 1).
567-
568-
# Returns:
569-
# Sequence[int]
570-
# A tuple representing the optimal chunk shape (number of elements along each axis).
571-
572-
# Raises:
573-
# ValueError if n_bytes is less than the number of bytes of one element.
574-
# """
575-
# # Get volume shape and element size.
576-
# shape = volume.shape
577-
578-
# if volume.size == 0:
579-
# logging.warning("Chunking calculation received empty volume shape...")
580-
# return volume.shape
581-
582-
# itemsize = volume.dtype.itemsize
583-
584-
# # Maximum number of elements that can fit in a chunk
585-
# # (we ignore any extra bytes; must not exceed n_bytes).
586-
# max_elements_allowed = n_bytes // itemsize
587-
# if max_elements_allowed < 1:
588-
# raise ValueError("n_bytes is too small to hold even one element of the volume.")
589-
590-
# n_dims = len(shape)
591-
592-
# def get_divisors(n: int) -> list[int]:
593-
# """Return a sorted list of all positive divisors of n.
594-
595-
# Args:
596-
# n: The number to compute the divisors of.
597-
598-
# Returns:
599-
# A sorted list of all positive divisors of n.
600-
# """
601-
# divs = []
602-
# # It is efficient enough for typical dimension sizes.
603-
# for i in range(1, n + 1):
604-
# if n % i == 0:
605-
# divs.append(i)
606-
# return sorted(divs)
607-
608-
# # For each dimension, compute the list of allowed chunk sizes (divisors).
609-
# divisors_list = [get_divisors(d) for d in shape]
610-
611-
# # For pruning: precompute the maximum possible product achievable from axis i to N-1.
612-
# # This is the product of the maximum divisors for each remaining axis.
613-
# max_possible = [1] * (n_dims + 1)
614-
# for i in range(n_dims - 1, -1, -1):
615-
# max_possible[i] = max(divisors_list[i]) * max_possible[i + 1]
616-
617-
# best_product = 0
618-
# best_combination = [None] * n_dims
619-
# current_chunk = [None] * n_dims
620-
621-
# def dfs(dim: int, current_product: int) -> None:
622-
# """Depth-first search to find the optimal chunk shape.
623-
624-
# Args:
625-
# dim: The current dimension to process.
626-
# current_product: The current product of the chunk dimensions.
627-
# """
628-
# nonlocal best_product
629-
# # If all dimensions have been processed, update best combination if needed.
630-
# if dim == n_dims:
631-
# if current_product > best_product:
632-
# best_product = current_product
633-
# best_combination[:] = current_chunk[:]
634-
# return
635-
636-
# # Prune branches: even if we take the maximum allowed for all remaining dimensions,
637-
# # if we cannot exceed best_product, then skip.
638-
# if current_product * max_possible[dim] < best_product:
639-
# return
640-
641-
# # Iterate over allowed divisors for the current axis,
642-
# # trying larger candidates first so that high products are found early.
643-
# for candidate in sorted(divisors_list[dim], reverse=True):
644-
# new_product = current_product * candidate
645-
# if new_product > max_elements_allowed:
646-
# continue # This candidate would exceed the byte restriction.
647-
# current_chunk[dim] = candidate
648-
# dfs(dim + 1, new_product)
649-
650-
# dfs(0, 1)
651-
# return tuple(best_combination)

src/mdio/core/grid.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
from __future__ import annotations
44

5+
from collections.abc import Sequence
6+
57
import inspect
68
import logging
79
from dataclasses import dataclass
@@ -11,6 +13,11 @@
1113

1214
from mdio.constants import UINT32_MAX
1315
from mdio.constants import UINT64_MAX
16+
from mdio.constants import INT32_MAX
17+
18+
from dask.array.core import normalize_chunks
19+
from dask.array.rechunk import _balance_chunksizes
20+
1421
from mdio.core import Dimension
1522
from mdio.core.serialization import Serializer
1623

@@ -134,3 +141,40 @@ def deserialize(self, stream: str) -> Grid:
134141
payload = self.validate_payload(payload, signature)
135142

136143
return Grid(**payload)
144+
145+
146+
def _calculate_live_mask_chunksize(grid: Grid) -> Sequence[int]:
    """Calculate the optimal chunksize for the live mask.

    Delegates to ``_calculate_optimal_chunksize`` with a byte budget of
    ``INT32_MAX // 4`` (~512 MiB) per chunk — presumably to stay well below
    the common 2 GiB per-chunk limit (see the related test-suite comments);
    confirm if the divisor ever needs tuning.

    Args:
        grid: The grid to calculate the chunksize for.

    Returns:
        A sequence of integers representing the optimal chunk size for each
        dimension of the grid.
    """
    return _calculate_optimal_chunksize(grid.live_mask, INT32_MAX // 4)
158+
159+
def _calculate_optimal_chunksize(
    volume: np.ndarray | zarr.Array,
    max_bytes: int,
) -> Sequence[int]:
    """Calculate the optimal chunksize for an N-dimensional data volume.

    Args:
        volume: The volume to calculate the chunksize for.
        max_bytes: The maximum allowed number of bytes per chunk.

    Returns:
        A sequence of integers representing the optimal chunk size for each
        dimension of the grid.
    """
    shape = volume.shape
    # Let dask's "auto" chunking pick per-dimension extents that respect the
    # byte budget for this dtype.
    chunks = normalize_chunks(
        "auto",
        shape,
        dtype=volume.dtype,
        limit=max_bytes,
    )
    # Balance each dimension's chunk extents toward uniform sizes and take
    # the first (representative) extent per dimension as the chunk size.
    return tuple(_balance_chunksizes(chunk)[0] for chunk in chunks)

tests/unit/test_live_mask_chunksize.py renamed to tests/unit/test_auto_chunking.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33
import numpy as np
44
import pytest
55

6-
from mdio.converters.segy import _calculate_live_mask_chunksize
7-
from mdio.converters.segy import _calculate_optimal_chunksize
86
from mdio.core import Dimension
97
from mdio.core import Grid
8+
from mdio.core.grid import _calculate_live_mask_chunksize
9+
from mdio.core.grid import _calculate_optimal_chunksize
1010

1111

1212
class MockArray:
@@ -222,9 +222,7 @@ def test_altay():
222222
grid = Grid(dims=dims)
223223
grid.live_mask = MockArray(shape, bool)
224224

225-
# Calculate chunk size using the live mask function
226225
result = _calculate_live_mask_chunksize(grid)
227-
print(f"{kind}: {result}")
228226

229227
# Verify that the chunk size is valid
230228
assert all(chunk > 0 for chunk in result), f"Invalid chunk size for {kind}"
@@ -235,5 +233,6 @@ def test_altay():
235233
if kind in ["right_above_2G", "above_2G_v2", "above_2G_v2_asym", "above_4G_v2_asym", "above_3G_4D_asym"]:
236234
# TODO(BrianMichell): Our implementation is taking "limit" pretty liberally.
237235
# This is not overtly an issue because we are well below the 2GiB limit, but it's indicative of an underlying issue.
238-
continue
239-
assert chunk_elements <= INT32_MAX // 4, f"Chunk too large for {kind}"
236+
assert chunk_elements <= (INT32_MAX // 4) * 1.5, f"Chunk too large for {kind}"
237+
else:
238+
assert chunk_elements <= INT32_MAX // 4, f"Chunk too large for {kind}"

0 commit comments

Comments
 (0)