
Commit bdebc7e

Use native zarr for trace worker instead of xarray for memory efficiency. (TGSAI#710)
* Ingestion with zarr instead of xarray
* Optimizations
* Pre-commit
* Flag patch version change for debugging and testing
* Use sensible variable names
* revert version change: this will trigger CI/CD to make a new release, so it is better not done here and must be done from `main`
* remove unnecessary I/O calls by reading existing values

---------

Co-authored-by: Altay Sansal <[email protected]>
Co-authored-by: Altay Sansal <[email protected]>
1 parent 768368c commit bdebc7e
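At a high level, the commit stops handing each worker an xarray `Dataset` slice and instead has the worker open the Zarr store itself and assign numpy buffers directly into its target region. A rough sketch of that write pattern, with illustrative paths and variable names rather than the exact mdio internals:

```python
import numpy as np
import zarr

# Open the existing output store for in-place writes ("r+"); the path is a placeholder.
group = zarr.open_group("output.mdio", mode="r+")
data = group["amplitude"]  # hypothetical data variable name

# The region of the array this worker owns, expressed as slices.
region = (slice(0, 128), slice(0, 128), slice(0, 1024))

# Start from a buffer pre-filled with the array's fill value, scatter the
# live traces into it, then assign the whole region in one call.
shape = tuple(s.stop - s.start for s in region)
buffer = np.full(shape, data.fill_value, dtype=data.dtype)
# buffer[live_mask] = traces.sample  # live_mask would come from the grid map
data[region] = buffer
```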

File tree

2 files changed: +31 −55 lines changed

src/mdio/segy/_workers.py

Lines changed: 30 additions & 54 deletions
@@ -12,8 +12,7 @@
 from segy import SegyFile
 from segy.arrays import HeaderArray
 
-from mdio.api.io import to_mdio
-from mdio.builder.schemas.dtype import ScalarType
+from mdio.api.io import _normalize_storage_options
 from mdio.segy._raw_trace_wrapper import SegyFileRawTraceWrapper
 from mdio.segy.scalar import _get_coordinate_scalar
 
@@ -22,15 +21,13 @@
 from segy.config import SegyHeaderOverrides
 from segy.schema import SegySpec
 from upath import UPath
-from xarray import Dataset as xr_Dataset
 from zarr import Array as zarr_Array
 
-from xarray import Variable
+from zarr import open_group as zarr_open_group
 from zarr.core.config import config as zarr_config
 
 from mdio.builder.schemas.v1.stats import CenteredBinHistogram
 from mdio.builder.schemas.v1.stats import SummaryStatistics
-from mdio.builder.xarray_builder import _get_fill_value
 from mdio.constants import fill_value_map
 
 if TYPE_CHECKING:
@@ -100,7 +97,6 @@ def trace_worker( # noqa: PLR0913
     data_variable_name: str,
     region: dict[str, slice],
     grid_map: zarr_Array,
-    dataset: xr_Dataset,
 ) -> SummaryStatistics | None:
     """Writes a subset of traces from a region of the dataset of Zarr file.
 
@@ -112,7 +108,6 @@ def trace_worker( # noqa: PLR0913
         data_variable_name: Name of the data variable to write.
         region: Region of the dataset to write to.
         grid_map: Zarr array mapping live traces to their positions in the dataset.
-        dataset: Xarray dataset containing the data to write.
 
     Returns:
        SummaryStatistics object containing statistics about the written traces.
@@ -135,16 +130,15 @@ def trace_worker( # noqa: PLR0913
 
     live_trace_indexes = local_grid_map[not_null].tolist()
 
+    # Open the zarr group to write directly
+    storage_options = _normalize_storage_options(output_path)
+    zarr_group = zarr_open_group(output_path.as_posix(), mode="r+", storage_options=storage_options)
+
     header_key = "headers"
     raw_header_key = "raw_headers"
 
-    # Get subset of the dataset that has not yet been saved
-    # The headers might not be present in the dataset
-    worker_variables = [data_variable_name]
-    if header_key in dataset.data_vars:  # Keeping the `if` here to allow for more worker configurations
-        worker_variables.append(header_key)
-    if raw_header_key in dataset.data_vars:
-        worker_variables.append(raw_header_key)
+    # Check which variables exist in the zarr store
+    available_arrays = list(zarr_group.array_keys())
 
     # traces = segy_file.trace[live_trace_indexes]
     # Raw headers are not intended to remain as a feature of the SEGY ingestion.
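For context, the public zarr API the new worker leans on here (opening an existing group read-write, forwarding fsspec storage options, and listing its arrays) can be exercised on its own roughly as below. The bucket path and options are placeholders, and `_normalize_storage_options` is an mdio-internal helper not reproduced here:

```python
import zarr

# Open an existing Zarr/MDIO store for read-write access. For remote stores
# (e.g. S3), storage_options are forwarded to fsspec; the values are placeholders.
group = zarr.open_group(
    "s3://my-bucket/survey.mdio",
    mode="r+",
    storage_options={"anon": False},
)

# Discover which variables the store actually contains before writing.
available_arrays = list(group.array_keys())
for key in ("headers", "raw_headers"):
    if key in available_arrays:
        print(key, group[key].shape, group[key].dtype)
```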
@@ -153,51 +147,33 @@
     # NOTE: The `raw_header_key` code block should be removed in full as it will become dead code.
     traces = SegyFileRawTraceWrapper(segy_file, live_trace_indexes)
 
-    ds_to_write = dataset[worker_variables]
+    # Compute slices once (headers exclude sample dimension)
+    header_region_slices = region_slices[:-1]  # Exclude sample dimension
+
+    full_shape = tuple(s.stop - s.start for s in region_slices)
+    header_shape = tuple(s.stop - s.start for s in header_region_slices)
 
-    if raw_header_key in worker_variables:
-        tmp_raw_headers = np.zeros_like(dataset[raw_header_key])
+    # Write raw headers if they exist
+    # Headers only have spatial dimensions (no sample dimension)
+    if raw_header_key in available_arrays:
+        raw_header_array = zarr_group[raw_header_key]
+        tmp_raw_headers = np.full(header_shape, raw_header_array.fill_value)
         tmp_raw_headers[not_null] = traces.raw_header
+        raw_header_array[header_region_slices] = tmp_raw_headers
 
-        ds_to_write[raw_header_key] = Variable(
-            ds_to_write[raw_header_key].dims,
-            tmp_raw_headers,
-            attrs=ds_to_write[raw_header_key].attrs,
-            encoding=ds_to_write[raw_header_key].encoding,  # Not strictly necessary, but safer than not doing it.
-        )
-
-    if header_key in worker_variables:
-        # TODO(BrianMichell): Implement this better so that we can enable fill values without changing the code
-        # https://github.com/TGSAI/mdio-python/issues/584
-        tmp_headers = np.zeros_like(dataset[header_key])
+    # Write headers if they exist
+    # Headers only have spatial dimensions (no sample dimension)
+    if header_key in available_arrays:
+        header_array = zarr_group[header_key]
+        tmp_headers = np.full(header_shape, header_array.fill_value)
         tmp_headers[not_null] = traces.header
-        # Create a new Variable object to avoid copying the temporary array
-        # The ideal solution is to use `ds_to_write[header_key][:] = tmp_headers`
-        # but Xarray appears to be copying memory instead of doing direct assignment.
-        # TODO(BrianMichell): #614 Look into this further.
-        # https://github.com/TGSAI/mdio-python/issues/584
-        ds_to_write[header_key] = Variable(
-            ds_to_write[header_key].dims,
-            tmp_headers,
-            attrs=ds_to_write[header_key].attrs,
-            encoding=ds_to_write[header_key].encoding,  # Not strictly necessary, but safer than not doing it.
-        )
-
-    data_variable = ds_to_write[data_variable_name]
-    fill_value = _get_fill_value(ScalarType(data_variable.dtype.name))
-    tmp_samples = np.full_like(data_variable, fill_value=fill_value)
-    tmp_samples[not_null] = traces.sample
+        header_array[header_region_slices] = tmp_headers
 
-    # TODO(BrianMichell): #614 Look into this further.
-    # https://github.com/TGSAI/mdio-python/issues/584
-    ds_to_write[data_variable_name] = Variable(
-        ds_to_write[data_variable_name].dims,
-        tmp_samples,
-        attrs=ds_to_write[data_variable_name].attrs,
-        encoding=ds_to_write[data_variable_name].encoding,  # Not strictly necessary, but safer than not doing it.
-    )
-
-    to_mdio(ds_to_write, output_path=output_path, region=region, mode="r+")
+    # Write the data variable
+    data_array = zarr_group[data_variable_name]
+    tmp_samples = np.full(full_shape, data_array.fill_value)
+    tmp_samples[not_null] = traces.sample
+    data_array[region_slices] = tmp_samples
 
     nonzero_samples = np.ma.masked_values(traces.sample, 0, copy=False)
     histogram = CenteredBinHistogram(bin_centers=[], counts=[])
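The shape bookkeeping in the new worker is the only subtle part: trace headers live on the spatial grid only, so the sample dimension is dropped from their region before the buffers are built. A self-contained sketch of that logic, assuming (as the diff does) that `region_slices` puts the sample dimension last:

```python
import numpy as np

# Example region covering a 2 x 3 spatial patch with 4 samples per trace.
region_slices = (slice(0, 2), slice(10, 13), slice(0, 4))

# Headers have no sample dimension, so drop the last slice.
header_region_slices = region_slices[:-1]

full_shape = tuple(s.stop - s.start for s in region_slices)           # (2, 3, 4)
header_shape = tuple(s.stop - s.start for s in header_region_slices)  # (2, 3)

# Buffers start at the array's fill value; only live traces are overwritten.
not_null = np.array([[True, False, True], [False, True, True]])  # illustrative live-trace mask
samples = np.full(full_shape, np.nan)
samples[not_null] = 1.0  # stand-in for traces.sample
print(full_shape, header_shape, int(not_null.sum()))
```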

src/mdio/segy/blocked_io.py

Lines changed: 1 addition & 1 deletion
@@ -92,7 +92,7 @@ def to_zarr( # noqa: PLR0913, PLR0915
     futures = []
     common_args = (segy_file_kwargs, output_path, data_variable_name)
     for region in chunk_iter:
-        subset_args = (region, grid_map, dataset.isel(region))
+        subset_args = (region, grid_map)
         future = executor.submit(trace_worker, *common_args, *subset_args)
         futures.append(future)
 
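Since the dataset subset is no longer passed along, each submitted task now carries only the region and the grid map, and the worker opens the store itself. A simplified sketch of the submission loop, wrapped in a hypothetical `submit_regions` helper for illustration (the argument order follows the diff; the executor setup is assumed):

```python
from concurrent.futures import ProcessPoolExecutor


def submit_regions(executor: ProcessPoolExecutor, chunk_iter, trace_worker,
                   segy_file_kwargs, output_path, data_variable_name, grid_map):
    """Submit one trace_worker task per region; workers open the Zarr store themselves."""
    futures = []
    common_args = (segy_file_kwargs, output_path, data_variable_name)
    for region in chunk_iter:
        subset_args = (region, grid_map)
        futures.append(executor.submit(trace_worker, *common_args, *subset_args))
    return futures
```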
