Automatic chunking of live_mask for grids that exceed blosc's maximum elements.

BrianMichell · BrianMichell · commit ac21d1c8e1c5 · 2025-04-08T15:51:12.000Z
diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py
@@ -17,6 +17,7 @@
 from segy.schema import HeaderField
 
 from mdio.api.io_utils import process_url
+from mdio.constants import INT32_MAX
 from mdio.converters.exceptions import EnvironmentFormatError
 from mdio.converters.exceptions import GridTraceCountError
 from mdio.converters.exceptions import GridTraceSparsityError
@@ -116,7 +117,6 @@ def segy_to_mdio(  # noqa: C901
     storage_options_output: dict[str, Any] | None = None,
     overwrite: bool = False,
     grid_overrides: dict | None = None,
-    live_mask_chunksize: Sequence[int] | None = None,
 ) -> None:
     """Convert SEG-Y file to MDIO format.
 
@@ -171,16 +171,10 @@ def segy_to_mdio(  # noqa: C901
             Default is `None` (will assume anonymous)
         overwrite: Toggle for overwriting existing store
         grid_overrides: Option to add grid overrides. See examples.
-        live_mask_chunksize: Chunk size for live mask. This has limited
-            support across the MDIO api.
-            Default is `None` (will do no chunking)
-
     Raises:
         GridTraceCountError: Raised if grid won't hold all traces in the
             SEG-Y file.
-        ValueError: If length of chunk sizes don't match number of dimensions
-            or live_mask_chunksize is not None and lenght of live_mask_chunksize
-            is not equal to number of dimensions minus one.
+        ValueError: If length of chunk sizes don't match number of dimensions.
         NotImplementedError: If can't determine chunking automatically for 4D+.
 
     Examples:
@@ -348,20 +342,6 @@ def segy_to_mdio(  # noqa: C901
         ...     chunksize=(8, 2, 256, 512),
         ...     grid_overrides={"HasDuplicates": True},
         ... )
-
-        >>> segy_to_mdio(
-        ...     segy_path="prefix/shot_file.segy",
-        ...     mdio_path_or_buffer="s3://bucket/shot_file.mdio",
-        ...     index_bytes=(133, 171, 17, 137, 13),
-        ...     index_lengths=(2, 2, 4, 2, 4),
-        ...     index_names=("shot_line", "gun", "shot_point", "cable", "channel"),
-        ...     chunksize=(1, 1, 8, 1, 128, 1024),
-        ...     grid_overrides={
-        ...         "AutoShotWrap": True,
-        ...         "AutoChannelWrap": True,
-        ...         "AutoChannelTraceQC":  1000000
-        ...     },
-        ...     live_mask_chunksize=(1, 1, 8, 1, 128),
     """
     if index_names is None:
         index_names = [f"dim_{i}" for i in range(len(index_bytes))]
@@ -377,14 +357,6 @@ def segy_to_mdio(  # noqa: C901
             )
             raise ValueError(message)
 
-    if live_mask_chunksize is not None:
-        if len(live_mask_chunksize) != len(index_bytes):
-            message = (
-                f"Length of live_mask_chunksize={len(live_mask_chunksize)} must be ",
-                f"equal to array dimensions={len(index_bytes)}",
-            )
-            raise ValueError(message)
-
     # Handle storage options and check permissions etc
     if storage_options_input is None:
         storage_options_input = {}
@@ -452,8 +424,7 @@ def segy_to_mdio(  # noqa: C901
     trace_count = np.count_nonzero(grid.live_mask)
     write_attribute(name="trace_count", zarr_group=zarr_root, attribute=trace_count)
 
-    if live_mask_chunksize is None:
-        live_mask_chunksize = -1
+    live_mask_chunksize = _calculate_live_mask_chunksize(grid)
 
-    # Note, live mask is not chunked since it's bool and small.
+    # Note, live mask stays in one chunk (-1) unless a chunk shape was calculated above.
     zarr_root["metadata"].create_dataset(
@@ -525,3 +496,39 @@ def segy_to_mdio(  # noqa: C901
     )
 
     zarr.consolidate_metadata(store_nocache)
+
+
+def _calculate_live_mask_chunksize(grid: Grid) -> Sequence[int] | int:
+    """Calculate the chunksize for the live mask.
+
+    The decision is based on the full extent of the grid without its last
+    (sample) dimension, not on how many traces are live: a sparse mask still
+    stores every element.
+
+    When one chunk would be too large, the largest chunk dimension is halved
+    (rounding up) until a chunk holds fewer than `INT32_MAX` elements.
+
+    Args:
+        grid: The grid to calculate the chunksize for.
+
+    Returns:
+        `-1` (no chunking) when the whole mask has fewer than `INT32_MAX`
+        elements, otherwise a tuple with one chunk size per mask dimension.
+    """
+    shape = grid.shape[:-1]  # Exclude sample dimension
+    chunks = [int(dim_size) for dim_size in shape]
+
+    # Force int64 so the element count cannot overflow where NumPy's default
+    # integer is 32-bit.
+    if np.prod(chunks, dtype=np.int64) < INT32_MAX:
+        # Base case where we don't need to chunk the live mask
+        return -1
+
+    # Shrink the largest dimension first so chunks stay roughly balanced. The
+    # loop always ends: a chunk of all ones has a single element.
+    # Ties go to the first dimension because list.index returns the first match.
+    while np.prod(chunks, dtype=np.int64) >= INT32_MAX:
+        largest = chunks.index(max(chunks))
+        chunks[largest] = -(-chunks[largest] // 2)  # Ceiling division
+
+    return tuple(chunks)
diff --git a/tests/unit/test_live_mask_chunksize.py b/tests/unit/test_live_mask_chunksize.py
@@ -0,0 +1,106 @@
+"""Test live mask chunk size calculation."""
+
+import numpy as np
+import pytest
+
+from mdio.converters.segy import _calculate_live_mask_chunksize
+from mdio.core import Grid, Dimension
+from mdio.constants import INT32_MAX
+
+
+def _make_grid(*sizes: int, live: bool = True) -> Grid:
+    """Build a grid with the given trace dimension sizes plus a sample dimension.
+
+    The live mask is a broadcast view of a single boolean, so masks with
+    billions of elements can be described without allocating them.
+    """
+    dims = [
+        Dimension(coords=range(0, size, 1), name=f"dim{axis}")
+        for axis, size in enumerate(sizes, start=1)
+    ]
+    dims.append(Dimension(coords=range(0, 100, 1), name="sample"))
+    grid = Grid(dims=dims)
+    grid.live_mask = np.broadcast_to(np.bool_(live), sizes)
+    return grid
+
+
+def _assert_chunks_fit(chunks, sizes):
+    """Check that chunks fit the grid and stay below INT32_MAX elements."""
+    assert len(chunks) == len(sizes)
+    assert all(1 <= chunk <= size for chunk, size in zip(chunks, sizes))
+    assert np.prod(chunks, dtype=np.int64) < INT32_MAX
+
+
+def test_small_grid_no_chunking():
+    """Test that small grids return -1 (no chunking needed)."""
+    grid = _make_grid(100, 100)
+
+    assert _calculate_live_mask_chunksize(grid) == -1
+
+
+def test_large_2d_grid_chunking():
+    """Test exact chunk size calculation for a 2D grid that exceeds INT32_MAX."""
+    # 50,000 x 50,000 = 2,500,000,000 elements. Halving the first dimension
+    # once leaves 25,000 x 50,000 = 1,250,000,000 elements per chunk.
+    sizes = (50000, 50000)
+
+    result = _calculate_live_mask_chunksize(_make_grid(*sizes))
+
+    assert result == (25000, 50000)
+    _assert_chunks_fit(result, sizes)
+
+
+def test_large_3d_grid_chunking():
+    """Test exact chunk size calculation for a 3D grid that exceeds INT32_MAX."""
+    # 1500 x 1500 x 1500 = 3,375,000,000 elements. Halving the first dimension
+    # once leaves 750 x 1500 x 1500 = 1,687,500,000 elements per chunk.
+    sizes = (1500, 1500, 1500)
+
+    result = _calculate_live_mask_chunksize(_make_grid(*sizes))
+
+    assert result == (750, 1500, 1500)
+    _assert_chunks_fit(result, sizes)
+
+
+def test_uneven_dimensions_chunking():
+    """Test exact chunk size calculation for uneven dimensions."""
+    # 100,000 x 80,000 = 8,000,000,000 elements. The largest chunk dimension
+    # is halved each round: (50000, 80000) still has 4,000,000,000 elements,
+    # (50000, 40000) has 2,000,000,000 and fits.
+    sizes = (100000, 80000)
+
+    result = _calculate_live_mask_chunksize(_make_grid(*sizes))
+
+    assert result == (50000, 40000)
+    _assert_chunks_fit(result, sizes)
+
+
+def test_prestack_land_survey_chunking():
+    """Test exact chunk size calculation for a dense pre-stack land survey grid."""
+    # Realistic dimensions: 1000 shot points, 1000 receiver points,
+    # 100 offsets and 36 azimuths = 3,600,000,000 elements. Halving the shot
+    # point dimension once leaves 1,800,000,000 elements per chunk.
+    sizes = (1000, 1000, 100, 36)
+
+    result = _calculate_live_mask_chunksize(_make_grid(*sizes))
+
+    assert result == (500, 1000, 100, 36)
+    _assert_chunks_fit(result, sizes)
+
+
+def test_sparse_large_grid_is_chunked():
+    """Test that chunking depends on the grid size, not on the live trace count."""
+    sizes = (50000, 50000)
+
+    result = _calculate_live_mask_chunksize(_make_grid(*sizes, live=False))
+
+    assert result == (25000, 50000)
+    _assert_chunks_fit(result, sizes)
+
+
+def test_edge_case_empty_grid():
+    """Test empty grid edge case."""
+    grid = _make_grid(0, 0)
+
+    # Empty grid shouldn't need chunking
+    assert _calculate_live_mask_chunksize(grid) == -1