Merge branch '497_ingestion_memory' into v1_ingestion_YOLO

BrianMichell · BrianMichell · commit 5e910426665d · 2025-06-04T15:41:26.000Z
diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py
@@ -131,7 +131,7 @@ def get_compressor(lossless: bool, compression_tolerance: float = -1) -> Blosc |
     return compressor
 
 
-def segy_to_mdio(  # noqa: PLR0913, PLR0915
+def segy_to_mdio(  # noqa: PLR0913, PLR0915, PLR0912
     segy_path: str | Path,
     mdio_path_or_buffer: str | Path,
     index_bytes: Sequence[int],
@@ -394,13 +394,24 @@ def segy_to_mdio(  # noqa: PLR0913, PLR0915
     grid_density_qc(grid, num_traces)
     grid.build_map(index_headers)
 
-    # Check grid validity by comparing trace numbers
-    if np.sum(grid.live_mask) != num_traces:
+    # Check grid validity by ensuring every trace's header-index is within dimension bounds
+    valid_mask = np.ones(grid.num_traces, dtype=bool)
+    for d_idx in range(len(grid.header_index_arrays)):
+        coords = grid.header_index_arrays[d_idx]
+        valid_mask &= coords < grid.shape[d_idx]
+    valid_count = int(np.count_nonzero(valid_mask))
+    if valid_count != num_traces:
         for dim_name in grid.dim_names:
-            dim_min, dim_max = grid.get_min(dim_name), grid.get_max(dim_name)
+            dim_min = grid.get_min(dim_name)
+            dim_max = grid.get_max(dim_name)
             logger.warning("%s min: %s max: %s", dim_name, dim_min, dim_max)
         logger.warning("Ingestion grid shape: %s.", grid.shape)
-        raise GridTraceCountError(np.sum(grid.live_mask), num_traces)
+        raise GridTraceCountError(valid_count, num_traces)
+
+    import gc
+
+    del valid_mask
+    gc.collect()
 
     if chunksize is None:
         dim_count = len(index_names) + 1
@@ -446,9 +457,66 @@ def segy_to_mdio(  # noqa: PLR0913, PLR0915
     data_array = data_group[f"chunked_{suffix}"]
     header_array = meta_group[f"chunked_{suffix}_trace_headers"]
 
-    # Write actual live mask and metadata to empty MDIO
-    meta_group["live_mask"][:] = grid.live_mask[:]
-    nonzero_count = np.count_nonzero(grid.live_mask)
+    live_mask_array = meta_group["live_mask"]
+    # 'live_mask_array' covers the grid without the sample axis, i.e. 'grid.shape[:-1]'
+    # Build a ChunkIterator over the live_mask (no sample axis)
+    from mdio.core.indexing import ChunkIterator
+
+    chunker = ChunkIterator(live_mask_array, chunk_samples=True)
+    for chunk_indices in chunker:
+        # chunk_indices is a tuple of N–1 slice objects
+        trace_ids = grid.get_traces_for_chunk(chunk_indices)
+        if trace_ids.size == 0:
+            # Free memory immediately for empty chunks
+            del trace_ids
+            continue
+
+        # Build a temporary boolean block of shape = chunk shape
+        block = np.zeros(tuple(sl.stop - sl.start for sl in chunk_indices), dtype=bool)
+
+        # Compute local coords within this block for each trace_id
+        local_coords: list[np.ndarray] = []
+        for dim_idx, sl in enumerate(chunk_indices):
+            hdr_arr = grid.header_index_arrays[dim_idx]
+            # hdr_arr is uint32 (Grid.build_map) and trace_ids is uint32
+            # (Grid.get_traces_for_chunk). Offsets are computed on the per-chunk
+            # subset first and then cast to np.intp for indexing.
+            indexed_coords = hdr_arr[trace_ids]  # uint32 array
+            local_idx = indexed_coords - sl.start  # remains uint32
+            # Free indexed_coords immediately
+            del indexed_coords
+
+            # Only convert dtype if necessary for indexing (numpy requires int for indexing)
+            if local_idx.dtype != np.intp:
+                local_idx = local_idx.astype(np.intp)
+            local_coords.append(local_idx)
+            # local_idx is now owned by local_coords list, safe to continue
+
+        # Free trace_ids as soon as we're done with it
+        del trace_ids
+
+        # Mark live cells in the temporary block
+        block[tuple(local_coords)] = True
+
+        # Free local_coords immediately after use
+        del local_coords
+
+        # Write the entire block to Zarr at once
+        live_mask_array.set_basic_selection(selection=chunk_indices, value=block)
+
+        # Free block immediately after writing
+        del block
+
+        # Collect garbage after every non-empty chunk to free memory aggressively
+        gc.collect()
+
+    # Final cleanup
+    del live_mask_array
+    del chunker
+    gc.collect()
+
+    nonzero_count = grid.num_traces
+
     write_attribute(name="trace_count", zarr_group=root_group, attribute=nonzero_count)
     write_attribute(name="text_header", zarr_group=meta_group, attribute=text_header.split("\n"))
     write_attribute(name="binary_header", zarr_group=meta_group, attribute=binary_header.to_dict())
diff --git a/src/mdio/core/grid.py b/src/mdio/core/grid.py
@@ -7,14 +7,12 @@
 from typing import TYPE_CHECKING
 
 import numpy as np
-import zarr
 
-from mdio.constants import UINT32_MAX
 from mdio.core import Dimension
 from mdio.core.serialization import Serializer
-from mdio.core.utils_write import get_constrained_chunksize
 
 if TYPE_CHECKING:
+    import zarr
     from segy.arrays import HeaderArray
     from zarr import Array as ZarrArray
 
@@ -65,6 +63,9 @@ def __post_init__(self) -> None:
         self.dim_names = tuple(dim.name for dim in self.dims)
         self.shape = tuple(dim.size for dim in self.dims)
         self.ndim = len(self.dims)
+        # Prepare attributes for lazy mapping; they will be set in build_map
+        self.header_index_arrays: tuple[np.ndarray, ...] = ()
+        self.num_traces: int = 0
 
     def __getitem__(self, item: int) -> Dimension:
         """Get a dimension by index."""
@@ -106,47 +107,62 @@ def from_zarr(cls, zarr_root: zarr.Group) -> Grid:
         return cls(dims_list)
 
     def build_map(self, index_headers: HeaderArray) -> None:
-        """Build trace mapping and live mask from header indices.
+        """Compute per-trace grid coordinates (lazy map).
+
+        Instead of allocating a full `self.map` and `self.live_mask`, this computes, for each trace,
+        its integer index along each dimension (excluding the sample dimension) and stores them in
+        `self.header_index_arrays`. The full mapping can then be derived chunkwise when writing.
 
         Args:
-            index_headers: Header array containing dimension indices.
+            index_headers: Header array containing dimension indices (length = number of traces).
+        """
+        # Number of traces in the SEG-Y
+        self.num_traces = int(index_headers.shape[0])
+
+        # For each dimension except the final sample dimension, compute a 1D array of length
+        # `num_traces` giving each trace's integer coordinate along that axis (via np.searchsorted).
+        # Cast to uint32.
+        idx_arrays: list[np.ndarray] = []
+        for dim in self.dims[:-1]:
+            hdr_vals = index_headers[dim.name]  # shape: (num_traces,)
+            coords = np.searchsorted(dim, hdr_vals)  # integer indices
+            coords = coords.astype(np.uint32)
+            idx_arrays.append(coords)
+
+        # Store as a tuple so that header_index_arrays[d][i] is "trace i's index along axis d"
+        self.header_index_arrays = tuple(idx_arrays)
+
+        # We no longer allocate `self.map` or `self.live_mask` here.
+        # The full grid shape is `self.shape`, but mapping is done lazily per chunk.
+
+    def get_traces_for_chunk(self, chunk_slices: tuple[slice, ...]) -> np.ndarray:
+        """Return all trace IDs whose grid-coordinates fall inside the given chunk slices.
+
+        Args:
+            chunk_slices: Tuple of slice objects, one per non-sample grid dimension. For example,
+                          (slice(i0, i1), slice(j0, j1), ...) corresponds to a single Zarr chunk
+                          in index space (excluding the sample axis).
+
+        Returns:
+            A 1D NumPy array of trace indices (0-based) that lie within the hyper-rectangle defined
+            by `chunk_slices`. If no traces fall in this chunk, returns an empty array.
         """
-        # Determine data type for map based on grid size
-        grid_size = np.prod(self.shape[:-1], dtype=np.uint64)
-        map_dtype = np.uint64 if grid_size > UINT32_MAX else np.uint32
-        fill_value = np.iinfo(map_dtype).max
-
-        # Initialize Zarr arrays
-        live_shape = self.shape[:-1]
-        chunks = get_constrained_chunksize(
-            shape=live_shape,
-            dtype=map_dtype,
-            max_bytes=self._INTERNAL_CHUNK_SIZE_TARGET,
-        )
-        self.map = zarr.full(live_shape, fill_value, dtype=map_dtype, chunks=chunks)
-        self.live_mask = zarr.zeros(live_shape, dtype=bool, chunks=chunks)
-
-        # Calculate batch size
-        memory_per_trace_index = index_headers.itemsize
-        batch_size = max(1, int(self._TARGET_MEMORY_PER_BATCH / memory_per_trace_index))
-        total_live_traces = index_headers.size
-
-        # Process headers in batches
-        for start in range(0, total_live_traces, batch_size):
-            end = min(start + batch_size, total_live_traces)
-            live_dim_indices = []
-
-            # Compute indices for the batch
-            for dim in self.dims[:-1]:
-                dim_hdr = index_headers[dim.name][start:end]
-                indices = np.searchsorted(dim, dim_hdr).astype(np.uint32)
-                live_dim_indices.append(indices)
-            live_dim_indices = tuple(live_dim_indices)
-
-            # Assign trace indices
-            trace_indices = np.arange(start, end, dtype=np.uint64)
-            self.map.vindex[live_dim_indices] = trace_indices
-            self.live_mask.vindex[live_dim_indices] = True
+        # Initialize a boolean mask over all traces (shape: (num_traces,))
+        mask = np.ones((self.num_traces,), dtype=bool)
+
+        for dim_idx, sl in enumerate(chunk_slices):
+            arr = self.header_index_arrays[dim_idx]  # shape: (num_traces,)
+            start, stop = sl.start, sl.stop
+            if start is not None:
+                mask &= arr >= start
+            if stop is not None:
+                mask &= arr < stop
+            if not mask.any():
+                # No traces remain after this dimension's filtering
+                return np.empty((0,), dtype=np.uint32)
+
+        # Gather the trace IDs that survived all dimension tests
+        return np.nonzero(mask)[0].astype(np.uint32)
 
 
 class GridSerializer(Serializer):
diff --git a/src/mdio/segy/_workers.py b/src/mdio/segy/_workers.py
@@ -77,45 +77,57 @@ def trace_worker(
     Returns:
         Partial statistics for chunk, or None
     """
-    # Special case where there are no traces inside chunk.
-    live_subset = grid.live_mask[chunk_indices[:-1]]
-
-    if np.count_nonzero(live_subset) == 0:
+    # Determine which trace IDs fall into this chunk
+    trace_ids = grid.get_traces_for_chunk(chunk_indices[:-1])
+    if trace_ids.size == 0:
         return None
 
-    # Let's get trace numbers from grid map using the chunk indices.
-    seq_trace_indices = grid.map[chunk_indices[:-1]]
-
-    tmp_data = np.zeros(seq_trace_indices.shape + (grid.shape[-1],), dtype=data_array.dtype)
-    tmp_metadata = np.zeros(seq_trace_indices.shape, dtype=metadata_array.dtype)
-
-    del grid  # To save some memory
-
-    # Read headers and traces for block
-    valid_indices = seq_trace_indices[live_subset]
-
-    traces = segy_file.trace[valid_indices.tolist()]
+    # Read headers and traces for the selected trace IDs
+    traces = segy_file.trace[trace_ids.tolist()]
     headers, samples = traces["header"], traces["data"]
 
-    tmp_metadata[live_subset] = headers.view(tmp_metadata.dtype)
-    tmp_data[live_subset] = samples
-
-    # Flush metadata to zarr
+    # Build a temporary buffer for data and metadata for this chunk
+    chunk_shape = tuple(sli.stop - sli.start for sli in chunk_indices[:-1]) + (grid.shape[-1],)
+    tmp_data = np.zeros(chunk_shape, dtype=data_array.dtype)
+    meta_shape = tuple(sli.stop - sli.start for sli in chunk_indices[:-1])
+    tmp_metadata = np.zeros(meta_shape, dtype=metadata_array.dtype)
+
+    # Compute local coordinates within the chunk for each trace
+    local_coords: list[np.ndarray] = []
+    for dim_idx, sl in enumerate(chunk_indices[:-1]):
+        hdr_arr = grid.header_index_arrays[dim_idx]
+        # hdr_arr is uint32 (Grid.build_map) and trace_ids is uint32
+        # (Grid.get_traces_for_chunk). Offsets are computed on the per-chunk
+        # subset first and then cast to np.intp for indexing.
+        indexed_coords = hdr_arr[trace_ids]  # uint32 array
+        local_idx = indexed_coords - sl.start  # remains uint32
+        # Only convert dtype if necessary for indexing (numpy requires int for indexing)
+        if local_idx.dtype != np.intp:
+            local_idx = local_idx.astype(np.intp)
+        local_coords.append(local_idx)
+    full_idx = tuple(local_coords) + (slice(None),)
+
+    # Populate the temporary buffers
+    tmp_data[full_idx] = samples
+    tmp_metadata[tuple(local_coords)] = headers.view(tmp_metadata.dtype)
+
+    # Flush metadata to Zarr
     metadata_array.set_basic_selection(selection=chunk_indices[:-1], value=tmp_metadata)
 
+    # Determine nonzero samples and early-exit if none
     nonzero_mask = samples != 0
-    nonzero_count = nonzero_mask.sum(dtype="uint32")
-
+    nonzero_count = int(nonzero_mask.sum())
     if nonzero_count == 0:
         return None
 
+    # Flush data to Zarr
     data_array.set_basic_selection(selection=chunk_indices, value=tmp_data)
 
     # Calculate statistics
-    tmp_data = samples[nonzero_mask]
-    chunk_sum = tmp_data.sum(dtype="float64")
-    chunk_sum_squares = np.square(tmp_data, dtype="float64").sum()
-    min_val = tmp_data.min()
-    max_val = tmp_data.max()
+    flattened_nonzero = samples[nonzero_mask]
+    chunk_sum = float(flattened_nonzero.sum(dtype="float64"))
+    chunk_sum_squares = float(np.square(flattened_nonzero, dtype="float64").sum())
+    min_val = float(flattened_nonzero.min())
+    max_val = float(flattened_nonzero.max())
 
-    return nonzero_count, chunk_sum, chunk_sum_squares, min_val, max_val
+    return (nonzero_count, chunk_sum, chunk_sum_squares, min_val, max_val)
diff --git a/src/mdio/segy/blocked_io.py b/src/mdio/segy/blocked_io.py