Begin testing for lazy compute of grid

BrianMichell · BrianMichell · commit d27879f5fdea · 2025-06-02T16:15:29.000Z
diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py
@@ -354,6 +354,7 @@ def segy_to_mdio(  # noqa: PLR0913, PLR0915
         ...     grid_overrides={"HasDuplicates": True},
         ... )
     """
+    print("Entering segy_to_mdio")
     index_names = index_names or [f"dim_{i}" for i in range(len(index_bytes))]
     index_types = index_types or ["int32"] * len(index_bytes)
 
@@ -368,6 +369,7 @@ def segy_to_mdio(  # noqa: PLR0913, PLR0915
     storage_options_input = storage_options_input or {}
     storage_options_output = storage_options_output or {}
 
+    print("pre-setup")
     # Open SEG-Y with MDIO's SegySpec. Endianness will be inferred.
     mdio_spec = mdio_segy_spec()
     segy_settings = SegySettings(storage_options=storage_options_input)
@@ -377,31 +379,43 @@ def segy_to_mdio(  # noqa: PLR0913, PLR0915
     binary_header = segy.binary_header
     num_traces = segy.num_traces
 
+    print("pre-index")
     # Index the dataset using a spec that interprets the user provided index headers.
-    index_fields = []
+    index_fields: list[HeaderField] = []
     for name, byte, format_ in zip(index_names, index_bytes, index_types, strict=True):
         index_fields.append(HeaderField(name=name, byte=byte, format=format_))
     mdio_spec_grid = mdio_spec.customize(trace_header_fields=index_fields)
     segy_grid = SegyFile(url=segy_path, spec=mdio_spec_grid, settings=segy_settings)
 
+    print("pre-get_grid_plan")
     dimensions, chunksize, index_headers = get_grid_plan(
         segy_file=segy_grid,
         return_headers=True,
         chunksize=chunksize,
         grid_overrides=grid_overrides,
     )
     grid = Grid(dims=dimensions)
+    print("pre-grid_density_qc")
     grid_density_qc(grid, num_traces)
+    print("pre-build_map")
     grid.build_map(index_headers)
 
-    # Check grid validity by comparing trace numbers
-    if np.sum(grid.live_mask) != num_traces:
+    print("pre-valid_mask")
+    # Check grid validity by ensuring every trace's header-index is within dimension bounds
+    valid_mask = np.ones(grid.num_traces, dtype=bool)
+    for d_idx in range(len(grid.header_index_arrays)):
+        coords = grid.header_index_arrays[d_idx]
+        valid_mask &= (coords < grid.shape[d_idx])
+    valid_count = int(np.count_nonzero(valid_mask))
+    if valid_count != num_traces:
         for dim_name in grid.dim_names:
-            dim_min, dim_max = grid.get_min(dim_name), grid.get_max(dim_name)
+            dim_min = grid.get_min(dim_name)
+            dim_max = grid.get_max(dim_name)
             logger.warning("%s min: %s max: %s", dim_name, dim_min, dim_max)
         logger.warning("Ingestion grid shape: %s.", grid.shape)
-        raise GridTraceCountError(np.sum(grid.live_mask), num_traces)
+        raise GridTraceCountError(valid_count, num_traces)
 
+    print("pre-chunksize")
     if chunksize is None:
         dim_count = len(index_names) + 1
         if dim_count == 2:  # noqa: PLR2004
@@ -424,6 +438,7 @@ def segy_to_mdio(  # noqa: PLR0913, PLR0915
         suffix = [str(idx) for idx, value in enumerate(suffix) if value is not None]
         suffix = "".join(suffix)
 
+    print("pre-compressors")
     compressors = get_compressor(lossless, compression_tolerance)
     header_dtype = segy.spec.trace.header.dtype.newbyteorder("=")
     var_conf = MDIOVariableConfig(
@@ -435,6 +450,7 @@ def segy_to_mdio(  # noqa: PLR0913, PLR0915
     )
     config = MDIOCreateConfig(path=mdio_path_or_buffer, grid=grid, variables=[var_conf])
 
+    print("pre-create_empty")
     root_group = create_empty(
         config,
         overwrite=overwrite,
@@ -446,23 +462,61 @@ def segy_to_mdio(  # noqa: PLR0913, PLR0915
     data_array = data_group[f"chunked_{suffix}"]
     header_array = meta_group[f"chunked_{suffix}_trace_headers"]
 
-    # Write actual live mask and metadata to empty MDIO
-    meta_group["live_mask"][:] = grid.live_mask[:]
-    nonzero_count = np.count_nonzero(grid.live_mask)
+    print("pre-live_mask")
+    live_mask_array = meta_group["live_mask"]
+    # 'live_mask_array' has the same first N–1 dims as 'grid.shape[:-1]'
+    # Build a ChunkIterator over the live_mask (no sample axis)
+    from mdio.core.indexing import ChunkIterator
+
+    chunker = ChunkIterator(live_mask_array, chunk_samples=False)
+    for chunk_indices in chunker:
+        # chunk_indices is a tuple of N–1 slice objects
+        trace_ids = grid.get_traces_for_chunk(chunk_indices)
+        if trace_ids.size == 0:
+            continue
+
+        # Build a temporary boolean block of shape = chunk shape
+        block_shape = tuple(sl.stop - sl.start for sl in chunk_indices)
+        block = np.zeros(block_shape, dtype=bool)
+
+        # Compute local coords within this block for each trace_id
+        local_coords: list[np.ndarray] = []
+        for dim_idx, sl in enumerate(chunk_indices):
+            hdr_arr = grid.header_index_arrays[dim_idx]
+            local_idx = (hdr_arr[trace_ids] - sl.start).astype(int)
+            local_coords.append(local_idx)
+
+        # Mark live cells in the temporary block
+        block[tuple(local_coords)] = True
+
+        # Write the entire block to Zarr at once
+        live_mask_array.set_basic_selection(selection=chunk_indices, value=block)
+
+    nonzero_count = grid.num_traces
+
+    print("pre-write_attribute")
     write_attribute(name="trace_count", zarr_group=root_group, attribute=nonzero_count)
     write_attribute(name="text_header", zarr_group=meta_group, attribute=text_header.split("\n"))
     write_attribute(name="binary_header", zarr_group=meta_group, attribute=binary_header.to_dict())
 
+    print("pre-to_zarr")
     # Write traces
+    zarr_root = mdio_path_or_buffer  # the same path you passed earlier to create_empty
+    data_var = f"data/chunked_{suffix}"
+    header_var = f"metadata/chunked_{suffix}_trace_headers"
+
     stats = blocked_io.to_zarr(
         segy_file=segy,
         grid=grid,
-        data_array=data_array,
-        header_array=header_array,
+        zarr_root_path=zarr_root,
+        data_var_path=data_var,
+        header_var_path=header_var,
     )
 
+    print("pre-write_attribute")
     # Write actual stats
     for key, value in stats.items():
         write_attribute(name=key, zarr_group=root_group, attribute=value)
 
-    zarr.consolidate_metadata(root_group.store)
+    print("pre-consolidate_metadata")
+    zarr.consolidate_metadata(root_group.store)
diff --git a/src/mdio/core/grid.py b/src/mdio/core/grid.py
@@ -65,6 +65,9 @@ def __post_init__(self) -> None:
         self.dim_names = tuple(dim.name for dim in self.dims)
         self.shape = tuple(dim.size for dim in self.dims)
         self.ndim = len(self.dims)
+        # Prepare attributes for lazy mapping; they will be set in build_map
+        self.header_index_arrays: tuple[np.ndarray, ...] = ()
+        self.num_traces: int = 0
 
     def __getitem__(self, item: int) -> Dimension:
         """Get a dimension by index."""
@@ -106,47 +109,64 @@ def from_zarr(cls, zarr_root: zarr.Group) -> Grid:
         return cls(dims_list)
 
     def build_map(self, index_headers: HeaderArray) -> None:
-        """Build trace mapping and live mask from header indices.
+        """Compute per-trace grid coordinates (lazy map).
+
+        Instead of allocating a full `self.map` and `self.live_mask`, this stores, for every trace,
+        its integer index along each non-sample dimension in `self.header_index_arrays` and the trace
+        count in `self.num_traces`. The full mapping is then derived chunk-by-chunk when writing.
 
         Args:
-            index_headers: Header array containing dimension indices.
+            index_headers: Header array containing dimension indices (length = number of traces).
+        """
+        # One entry of index_headers per trace; its first-axis length is the trace count.
+        self.num_traces = int(index_headers.shape[0])
+
+        # For each dimension except the final sample dimension, np.searchsorted maps every trace's
+        # header value to an integer position along that axis; the result is cast to uint32.
+        # NOTE(review): searchsorted gives an insertion point; absent values are not detected here.
+        idx_arrays: list[np.ndarray] = []
+        for dim in self.dims[:-1]:
+            hdr_vals = index_headers[dim.name]         # shape: (num_traces,)
+            coords = np.searchsorted(dim, hdr_vals)    # integer indices
+            coords = coords.astype(np.uint32)
+            idx_arrays.append(coords)
+
+        # Store as a tuple so that header_index_arrays[d][i] is "trace i's index along axis d"
+        self.header_index_arrays = tuple(idx_arrays)
+
+        # `self.map` and `self.live_mask` are intentionally not allocated here any more;
+        # per-chunk lookups go through `get_traces_for_chunk` instead.
+        return
+
+    def get_traces_for_chunk(self, chunk_slices: tuple[slice, ...]) -> np.ndarray:
+        """Return all trace IDs whose grid-coordinates fall inside the given chunk slices.
+
+        Args:
+            chunk_slices: Tuple of slice objects, one per non-sample grid dimension, e.g.
+                          (slice(i0, i1), slice(j0, j1), ...) for a single Zarr chunk in index
+                          space. A `None` start or stop leaves that side unbounded.
+
+        Returns:
+            A 1D array (NumPy's native index dtype, `np.intp`) of 0-based trace indices lying inside
+            the hyper-rectangle given by `chunk_slices`; empty when no trace falls in this chunk.
         """
-        # Determine data type for map based on grid size
-        grid_size = np.prod(self.shape[:-1], dtype=np.uint64)
-        map_dtype = np.uint64 if grid_size > UINT32_MAX else np.uint32
-        fill_value = np.iinfo(map_dtype).max
-
-        # Initialize Zarr arrays
-        live_shape = self.shape[:-1]
-        chunks = get_constrained_chunksize(
-            shape=live_shape,
-            dtype=map_dtype,
-            max_bytes=self._INTERNAL_CHUNK_SIZE_TARGET,
-        )
-        self.map = zarr.full(live_shape, fill_value, dtype=map_dtype, chunks=chunks)
-        self.live_mask = zarr.zeros(live_shape, dtype=bool, chunks=chunks)
-
-        # Calculate batch size
-        memory_per_trace_index = index_headers.itemsize
-        batch_size = max(1, int(self._TARGET_MEMORY_PER_BATCH / memory_per_trace_index))
-        total_live_traces = index_headers.size
-
-        # Process headers in batches
-        for start in range(0, total_live_traces, batch_size):
-            end = min(start + batch_size, total_live_traces)
-            live_dim_indices = []
-
-            # Compute indices for the batch
-            for dim in self.dims[:-1]:
-                dim_hdr = index_headers[dim.name][start:end]
-                indices = np.searchsorted(dim, dim_hdr).astype(np.uint32)
-                live_dim_indices.append(indices)
-            live_dim_indices = tuple(live_dim_indices)
-
-            # Assign trace indices
-            trace_indices = np.arange(start, end, dtype=np.uint64)
-            self.map.vindex[live_dim_indices] = trace_indices
-            self.live_mask.vindex[live_dim_indices] = True
+        # Start with every trace selected, then narrow the mask one dimension at a time.
+        mask = np.ones((self.num_traces,), dtype=bool)
+
+        for dim_idx, sl in enumerate(chunk_slices):
+            arr = self.header_index_arrays[dim_idx]  # shape: (num_traces,)
+            start, stop = sl.start, sl.stop
+            if start is not None:
+                mask &= (arr >= start)
+            if stop is not None:
+                mask &= (arr < stop)
+            if not mask.any():
+                # No traces remain after this dimension's filtering
+                return np.empty((0,), dtype=np.intp)
+
+        # Keep NumPy's native index dtype: casting to uint32 would silently wrap trace IDs
+        # >= 2**32, and the extra astype copy is unnecessary for indexing or .tolist().
+        return np.flatnonzero(mask)
 
 
 class GridSerializer(Serializer):
diff --git a/src/mdio/segy/_workers.py b/src/mdio/segy/_workers.py
@@ -77,45 +77,53 @@ def trace_worker(
     Returns:
         Partial statistics for chunk, or None
     """
-    # Special case where there are no traces inside chunk.
-    live_subset = grid.live_mask[chunk_indices[:-1]]
-
-    if np.count_nonzero(live_subset) == 0:
+    from time import time
+    start_time = time()
+    # Determine which trace IDs fall into this chunk
+    trace_ids = grid.get_traces_for_chunk(chunk_indices[:-1])
+    if trace_ids.size == 0:
         return None
 
-    # Let's get trace numbers from grid map using the chunk indices.
-    seq_trace_indices = grid.map[chunk_indices[:-1]]
-
-    tmp_data = np.zeros(seq_trace_indices.shape + (grid.shape[-1],), dtype=data_array.dtype)
-    tmp_metadata = np.zeros(seq_trace_indices.shape, dtype=metadata_array.dtype)
-
-    del grid  # To save some memory
-
-    # Read headers and traces for block
-    valid_indices = seq_trace_indices[live_subset]
-
-    traces = segy_file.trace[valid_indices.tolist()]
+    # Read headers and traces for the selected trace IDs
+    traces = segy_file.trace[trace_ids.tolist()]
     headers, samples = traces["header"], traces["data"]
 
-    tmp_metadata[live_subset] = headers.view(tmp_metadata.dtype)
-    tmp_data[live_subset] = samples
-
-    # Flush metadata to zarr
+    # Build a temporary buffer for data and metadata for this chunk
+    chunk_shape = tuple(sli.stop - sli.start for sli in chunk_indices[:-1]) + (grid.shape[-1],)
+    tmp_data = np.zeros(chunk_shape, dtype=data_array.dtype)
+    meta_shape = tuple(sli.stop - sli.start for sli in chunk_indices[:-1])
+    tmp_metadata = np.zeros(meta_shape, dtype=metadata_array.dtype)
+
+    # Compute local coordinates within the chunk for each trace
+    local_coords: list[np.ndarray] = []
+    for dim_idx, sl in enumerate(chunk_indices[:-1]):
+        hdr_arr = grid.header_index_arrays[dim_idx]
+        local_idx = (hdr_arr[trace_ids] - sl.start).astype(int)
+        local_coords.append(local_idx)
+    full_idx = tuple(local_coords) + (slice(None),)
+
+    # Populate the temporary buffers
+    tmp_data[full_idx] = samples
+    tmp_metadata[tuple(local_coords)] = headers.view(tmp_metadata.dtype)
+
+    # Flush metadata to Zarr
     metadata_array.set_basic_selection(selection=chunk_indices[:-1], value=tmp_metadata)
 
+    # Determine nonzero samples and early-exit if none
     nonzero_mask = samples != 0
-    nonzero_count = nonzero_mask.sum(dtype="uint32")
-
+    nonzero_count = int(nonzero_mask.sum())
     if nonzero_count == 0:
         return None
 
+    # Flush data to Zarr
     data_array.set_basic_selection(selection=chunk_indices, value=tmp_data)
 
     # Calculate statistics
-    tmp_data = samples[nonzero_mask]
-    chunk_sum = tmp_data.sum(dtype="float64")
-    chunk_sum_squares = np.square(tmp_data, dtype="float64").sum()
-    min_val = tmp_data.min()
-    max_val = tmp_data.max()
-
-    return nonzero_count, chunk_sum, chunk_sum_squares, min_val, max_val
+    flattened_nonzero = samples[nonzero_mask]
+    chunk_sum = float(flattened_nonzero.sum(dtype="float64"))
+    chunk_sum_squares = float(np.square(flattened_nonzero, dtype="float64").sum())
+    min_val = float(flattened_nonzero.min())
+    max_val = float(flattened_nonzero.max())
+
+    print(f"Time taken: {time() - start_time} seconds")
+    return (nonzero_count, chunk_sum, chunk_sum_squares, min_val, max_val)
diff --git a/src/mdio/segy/blocked_io.py b/src/mdio/segy/blocked_io.py