Skip to content

Commit 0ceccc1

Browse files
Eager memory allocation fix (TGSAI#609)
* Implement fixes to ensure lazy allocation of data arrays on serialization
* Avoid unnecessary copies of data in memory
* Linting
* Eliminate immediate overwrite of `data` bug
* Remove unused import
* Set appropriate fill value for lazy arrays
* Clean up header value handler
* Resolve data serialization issues
* Ensure all encodings are captured
* Simplify dataset coordinate population logic by removing unused imports and redundant variable handling
* Refactor `_workers.py` to streamline variable handling, replace manual Variable creation with direct assignment, and resolve redundant imports.
* Make better use of grid
* Fix type hint
* fix(regression): make dataset serialization less eager
* Update zarr
* Remove comment

---------

Co-authored-by: Altay Sansal <[email protected]>
1 parent a8b12f3 commit 0ceccc1

File tree

6 files changed

+268
-267
lines changed

6 files changed

+268
-267
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ dependencies = [
2929
"segy (>=0.4.2,<0.5.0)",
3030
"tqdm (>=4.67.1,<5.0.0)",
3131
"xarray>=2025.7.1",
32-
"zarr (>=3.1.1,<4.0.0)",
32+
"zarr (>=3.1.2,<4.0.0)",
3333
]
3434

3535
[project.optional-dependencies]

src/mdio/converters/segy.py

Lines changed: 21 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -163,7 +163,7 @@ def _build_and_check_grid(segy_dimensions: list[Dimension], segy_file: SegyFile,
163163

164164

165165
def _get_coordinates(
166-
segy_dimensions: list[Dimension],
166+
grid: Grid,
167167
segy_headers: SegyHeaderArray,
168168
mdio_template: AbstractDatasetTemplate,
169169
) -> tuple[list[Dimension], dict[str, SegyHeaderArray]]:
@@ -174,7 +174,7 @@ def _get_coordinates(
174174
The last dimension is always the vertical domain dimension
175175
176176
Args:
177-
segy_dimensions: List of of all SEG-Y dimensions.
177+
grid: Inferred MDIO grid for SEG-Y file.
178178
segy_headers: Headers read in from SEG-Y file.
179179
mdio_template: The MDIO template to use for the conversion.
180180
@@ -188,19 +188,15 @@ def _get_coordinates(
188188
- A dict of non-dimension coordinates (str: N-D arrays).
189189
"""
190190
dimensions_coords = []
191-
dim_names = [dim.name for dim in segy_dimensions]
192191
for dim_name in mdio_template.dimension_names:
193-
try:
194-
dim_index = dim_names.index(dim_name)
195-
except ValueError:
192+
if dim_name not in grid.dim_names:
196193
err = f"Dimension '{dim_name}' was not found in SEG-Y dimensions."
197-
raise ValueError(err) from err
198-
dimensions_coords.append(segy_dimensions[dim_index])
194+
raise ValueError(err)
195+
dimensions_coords.append(grid.select_dim(dim_name))
199196

200197
non_dim_coords: dict[str, SegyHeaderArray] = {}
201-
available_headers = segy_headers.dtype.names
202198
for coord_name in mdio_template.coordinate_names:
203-
if coord_name not in available_headers:
199+
if coord_name not in segy_headers.dtype.names:
204200
err = f"Coordinate '{coord_name}' not found in SEG-Y dimensions."
205201
raise ValueError(err)
206202
non_dim_coords[coord_name] = segy_headers[coord_name]
@@ -227,12 +223,14 @@ def populate_non_dim_coordinates(
227223
"""Populate the xarray dataset with coordinate variables."""
228224
not_null = grid.map[:] != UINT32_MAX
229225
for c_name, c_values in coordinates.items():
230-
dataset[c_name].values[not_null] = c_values
226+
c_tmp_array = dataset[c_name].values
227+
c_tmp_array[not_null] = c_values
228+
dataset[c_name][:] = c_tmp_array
231229
drop_vars_delayed.append(c_name)
232230
return dataset, drop_vars_delayed
233231

234232

235-
def _get_horizontal_coordinate_unit(segy_headers: list[Dimension]) -> LengthUnitEnum | None:
233+
def _get_horizontal_coordinate_unit(segy_headers: list[Dimension]) -> AllUnits | None:
236234
"""Get the coordinate unit from the SEG-Y headers."""
237235
name = TraceHeaderFieldsRev0.COORDINATE_UNIT.name.upper()
238236
unit_hdr = next((c for c in segy_headers if c.name.upper() == name), None)
@@ -347,15 +345,17 @@ def segy_to_mdio(
347345

348346
grid = _build_and_check_grid(segy_dimensions, segy_file, segy_headers)
349347

350-
dimensions, non_dim_coords = _get_coordinates(segy_dimensions, segy_headers, mdio_template)
351-
shape = [len(dim.coords) for dim in dimensions]
348+
dimensions, non_dim_coords = _get_coordinates(grid, segy_headers, mdio_template)
352349
# TODO(Altay): Turn this dtype into packed representation
353350
# https://github.com/TGSAI/mdio-python/issues/601
354351
headers = to_structured_type(segy_spec.trace.header.dtype)
355352

356353
horizontal_unit = _get_horizontal_coordinate_unit(segy_dimensions)
357354
mdio_ds: Dataset = mdio_template.build_dataset(
358-
name=mdio_template.name, sizes=shape, horizontal_coord_unit=horizontal_unit, headers=headers
355+
name=mdio_template.name,
356+
sizes=grid.shape,
357+
horizontal_coord_unit=horizontal_unit,
358+
headers=headers,
359359
)
360360

361361
_add_text_binary_headers(dataset=mdio_ds, segy_file=segy_file)
@@ -376,18 +376,12 @@ def segy_to_mdio(
376376
# IMPORTANT: Do not drop the "trace_mask" here, as it will be used later in
377377
# blocked_io.to_zarr() -> _workers.trace_worker()
378378

379-
# Write the xarray dataset to Zarr with as following:
380-
# Populated arrays:
381-
# - 1D dimensional coordinates
382-
# - ND non-dimensional coordinates
383-
# - ND trace_mask
384-
# Empty arrays (will be populated later in chunks):
385-
# - ND+1 traces
386-
# - ND headers (no _FillValue set due to the bug https://github.com/TGSAI/mdio-python/issues/582)
387-
# This will create the Zarr store with the correct structure
388-
# TODO(Dmitriy Repin): do chunked write for non-dimensional coordinates and trace_mask
389-
# https://github.com/TGSAI/mdio-python/issues/587
390-
xr_dataset.to_zarr(store=output_location.uri, mode="w", write_empty_chunks=False, zarr_format=2, compute=True)
379+
# This will create the Zarr store with the correct structure but with empty arrays
380+
xr_dataset.to_zarr(store=output_location.uri, mode="w", write_empty_chunks=False, zarr_format=2, compute=False)
381+
382+
# This will write the non-dimension coordinates and trace mask
383+
meta_ds = xr_dataset[drop_vars_delayed + ["trace_mask"]]
384+
meta_ds.to_zarr(store=output_location.uri, mode="r+", write_empty_chunks=False, zarr_format=2, compute=True)
391385

392386
# Now we can drop them to simplify chunked write of the data variable
393387
xr_dataset = xr_dataset.drop_vars(drop_vars_delayed)

src/mdio/schemas/v1/dataset_serializer.py

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,10 @@
11
"""Convert MDIO v1 schema Dataset to Xarray DataSet and write it in Zarr."""
22

33
import numpy as np
4+
from dask import array as dask_array
45
from numcodecs import Blosc as nc_Blosc
56
from xarray import DataArray as xr_DataArray
67
from xarray import Dataset as xr_Dataset
7-
from zarr import zeros as zarr_zeros
88
from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding
99

1010
from mdio.converters.type_converter import to_numpy_dtype
@@ -177,8 +177,8 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr_Dataset: # noqa: PLR0912
177177
mdio_ds: The source MDIO dataset to construct from.
178178
179179
Notes:
180-
- We can't use Dask (e.g., dask_array.zeros) because of the problems with
181-
structured type support. We will uze zarr.zeros instead
180+
- Using dask.array.zeros for lazy evaluation to prevent eager memory allocation
181+
while maintaining support for structured dtypes
182182
183183
Returns:
184184
The constructed dataset with proper MDIO structure and metadata.
@@ -195,9 +195,8 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr_Dataset: # noqa: PLR0912
195195
dtype = to_numpy_dtype(v.data_type)
196196
chunks = _get_zarr_chunks(v, all_named_dims=all_named_dims)
197197

198-
# Use zarr.zeros to create an empty array with the specified shape and dtype
199-
# NOTE: zarr_format=2 is essential, to_zarr() will fail if zarr_format=2 is used
200-
data = zarr_zeros(shape=shape, dtype=dtype, zarr_format=2)
198+
# Use dask.array.zeros to create a lazy array
199+
data = dask_array.full(shape=shape, dtype=dtype, chunks=chunks, fill_value=_get_fill_value(v.data_type))
201200
# Create a DataArray for the variable. We will set coords in the second pass
202201
dim_names = _get_dimension_names(v)
203202
data_array = xr_DataArray(data, dims=dim_names)

src/mdio/segy/_workers.py

Lines changed: 24 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,8 @@
1010
import numpy as np
1111
from segy import SegyFile
1212

13+
from mdio.schemas import ScalarType
14+
1315
if TYPE_CHECKING:
1416
from segy.arrays import HeaderArray
1517
from segy.config import SegySettings
@@ -19,7 +21,9 @@
1921

2022
from mdio.core.storage_location import StorageLocation
2123

24+
2225
from mdio.constants import UINT32_MAX
26+
from mdio.schemas.v1.dataset_serializer import _get_fill_value
2327
from mdio.schemas.v1.stats import CenteredBinHistogram
2428
from mdio.schemas.v1.stats import SummaryStatistics
2529

@@ -109,33 +113,37 @@ def trace_worker( # noqa: PLR0913
109113
live_trace_indexes = grid_map[not_null].tolist()
110114
traces = segy_file.trace[live_trace_indexes]
111115

116+
header_key = "headers"
117+
112118
# Get subset of the dataset that has not yet been saved
113119
# The headers might not be present in the dataset
114-
# TODO(Dmitriy Repin): Check, should we overwrite the 'dataset' instead to save the memory
115-
# https://github.com/TGSAI/mdio-python/issues/584
116-
if "headers" in dataset.data_vars:
117-
ds_to_write = dataset[[data_variable_name, "headers"]]
118-
ds_to_write = ds_to_write.reset_coords()
119-
120-
ds_to_write["headers"].data[not_null] = traces.header
121-
ds_to_write["headers"].data[~not_null] = 0
122-
else:
123-
ds_to_write = dataset[[data_variable_name]]
124-
ds_to_write = ds_to_write.reset_coords()
120+
worker_variables = [data_variable_name]
121+
if header_key in dataset.data_vars: # Keeping the `if` here to allow for more worker configurations
122+
worker_variables.append(header_key)
123+
124+
ds_to_write = dataset[worker_variables]
125+
126+
if header_key in worker_variables:
127+
# Create temporary array for headers with the correct shape
128+
# TODO(BrianMichell): Implement this better so that we can enable fill values without changing the code. #noqa: TD003
129+
tmp_headers = np.zeros_like(dataset[header_key])
130+
tmp_headers[not_null] = traces.header
131+
ds_to_write[header_key][:] = tmp_headers
125132

126-
ds_to_write[data_variable_name].data[not_null] = traces.sample
133+
data_variable = ds_to_write[data_variable_name]
134+
fill_value = _get_fill_value(ScalarType(data_variable.dtype.name))
135+
tmp_samples = np.full_like(data_variable, fill_value=fill_value)
136+
tmp_samples[not_null] = traces.sample
137+
ds_to_write[data_variable_name][:] = tmp_samples
127138

128-
out_path = output_location.uri
129-
ds_to_write.to_zarr(out_path, region=region, mode="r+", write_empty_chunks=False, zarr_format=2)
139+
ds_to_write.to_zarr(output_location.uri, region=region, mode="r+", write_empty_chunks=False, zarr_format=2)
130140

131141
histogram = CenteredBinHistogram(bin_centers=[], counts=[])
132142
return SummaryStatistics(
133143
count=traces.sample.size,
134144
min=traces.sample.min(),
135145
max=traces.sample.max(),
136146
sum=traces.sample.sum(),
137-
# TODO(Altay): Look at how to do the sum squares statistic correctly
138-
# https://github.com/TGSAI/mdio-python/issues/581
139147
sum_squares=(traces.sample**2).sum(),
140148
histogram=histogram,
141149
)

src/mdio/segy/blocked_io.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -98,7 +98,6 @@ def to_zarr( # noqa: PLR0913, PLR0915
9898
num_workers = min(num_chunks, num_cpus)
9999
context = mp.get_context("spawn")
100100
executor = ProcessPoolExecutor(max_workers=num_workers, mp_context=context)
101-
# return executor
102101

103102
segy_kw = {
104103
"url": segy_file.fs.unstrip_protocol(segy_file.url),

0 commit comments

Comments (0)