Implement fixes to ensure lazy allocation of data arrays on serialization

BrianMichell · BrianMichell · commit a19c6b9d96f1 · 2025-08-19T16:26:55.000Z
diff --git a/src/mdio/schemas/v1/dataset_serializer.py b/src/mdio/schemas/v1/dataset_serializer.py
@@ -1,10 +1,11 @@
 """Convert MDIO v1 schema Dataset to Xarray DataSet and write it in Zarr."""
 
 import numpy as np
+from dask import array as dask_array
+from zarr import zeros as zarr_zeros
 from numcodecs import Blosc as nc_Blosc
 from xarray import DataArray as xr_DataArray
 from xarray import Dataset as xr_Dataset
-from zarr import zeros as zarr_zeros
 from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding
 
 from mdio.converters.type_converter import to_numpy_dtype
@@ -177,8 +178,8 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr_Dataset:  # noqa: PLR0912
         mdio_ds: The source MDIO dataset to construct from.
 
     Notes:
-        - We can't use Dask (e.g., dask_array.zeros) because of the problems with
-          structured type support. We will uze zarr.zeros instead
+        - Placeholder data is created with dask.array.zeros, which is lazy: it prevents
+          eager memory allocation while maintaining support for structured dtypes.
 
     Returns:
         The constructed dataset with proper MDIO structure and metadata.
@@ -195,9 +196,14 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr_Dataset:  # noqa: PLR0912
         dtype = to_numpy_dtype(v.data_type)
         chunks = _get_zarr_chunks(v, all_named_dims=all_named_dims)
 
-        # Use zarr.zeros to create an empty array with the specified shape and dtype
-        # NOTE: zarr_format=2 is essential, to_zarr() will fail if zarr_format=2 is used
-        data = zarr_zeros(shape=shape, dtype=dtype, zarr_format=2)
+        if hasattr(dtype, "fields"):
+            data = zarr_zeros(shape=shape, dtype=dtype, zarr_format=2)
+        else:
+            data = dask_array.zeros(shape=shape, dtype=dtype, chunks=chunks)
+
+        # Use dask.array.zeros to create a lazy array with the specified shape and dtype
+        # This prevents eager memory allocation while maintaining support for structured dtypes
+        data = dask_array.zeros(shape=shape, dtype=dtype, chunks=chunks)
         # Create a DataArray for the variable. We will set coords in the second pass
         dim_names = _get_dimension_names(v)
         data_array = xr_DataArray(data, dims=dim_names)
diff --git a/src/mdio/segy/_workers.py b/src/mdio/segy/_workers.py
@@ -117,13 +117,40 @@ def trace_worker(  # noqa: PLR0913
         ds_to_write = dataset[[data_variable_name, "headers"]]
         ds_to_write = ds_to_write.reset_coords()
 
-        ds_to_write["headers"].data[not_null] = traces.header
-        ds_to_write["headers"].data[~not_null] = 0
+        try:
+            # Create temporary array for headers with the correct shape
+            tmp_headers = np.zeros(not_null.shape, dtype=ds_to_write["headers"].dtype)
+            tmp_headers[not_null] = traces.header
+            ds_to_write["headers"].data[:] = tmp_headers
+        except Exception as e:
+            print(f"Error writing headers: {e}")
+            print(f"not_null.shape: {not_null.shape}")
+            print(f"traces.header.shape: {traces.header.shape}")
+            print(f"ds_to_write['headers'].data.shape: {ds_to_write['headers'].data.shape}")
+            raise e
+
     else:
         ds_to_write = dataset[[data_variable_name]]
         ds_to_write = ds_to_write.reset_coords()
 
-    ds_to_write[data_variable_name].data[not_null] = traces.sample
+    try:
+        # Get the sample dimension size from the data variable itself
+        sample_dim_size = ds_to_write[data_variable_name].shape[-1]
+        tmp_samples = np.zeros(not_null.shape + (sample_dim_size,), dtype=ds_to_write[data_variable_name].dtype)
+        
+        # Assign the trace samples to the positions where not_null is True.
+        # traces.sample has shape (num_traces, num_samples), so boolean indexing
+        # with not_null writes one row of samples per True position.
+        tmp_samples[not_null] = traces.sample
+        ds_to_write[data_variable_name].data[:] = tmp_samples
+    except Exception as e:
+        print(f"Error writing samples: {e}")
+        print(f"not_null.shape: {not_null.shape}")
+        print(f"traces.sample.shape: {traces.sample.shape}")
+        print(f"ds_to_write[data_variable_name].data.shape: {ds_to_write[data_variable_name].data.shape}")
+        print(f"not_null.sum(): {not_null.sum()}")
+        print(f"len(traces.sample): {len(traces.sample)}")
+        raise e
 
     out_path = output_location.uri
     ds_to_write.to_zarr(out_path, region=region, mode="r+", write_empty_chunks=False, zarr_format=2)