11"""Convert MDIO v1 schema Dataset to Xarray Dataset and write it in Zarr."""
22
33import numpy as np
4+ from dask import array as dask_array
5+ from zarr import zeros as zarr_zeros
46from numcodecs import Blosc as nc_Blosc
57from xarray import DataArray as xr_DataArray
68from xarray import Dataset as xr_Dataset
7- from zarr import zeros as zarr_zeros
89from zarr .core .chunk_key_encodings import V2ChunkKeyEncoding
910
1011from mdio .converters .type_converter import to_numpy_dtype
@@ -177,8 +178,8 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr_Dataset: # noqa: PLR0912
177178 mdio_ds: The source MDIO dataset to construct from.
178179
179180 Notes:
180- - We can't use Dask (e.g., dask_array.zeros) because of the problems with
181- structured type support. We will use zarr.zeros instead
181+ - Using dask.array.zeros for lazy evaluation to prevent eager memory allocation
182+ while maintaining support for structured dtypes
182183
183184 Returns:
184185 The constructed dataset with proper MDIO structure and metadata.
@@ -195,9 +196,14 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr_Dataset: # noqa: PLR0912
195196 dtype = to_numpy_dtype (v .data_type )
196197 chunks = _get_zarr_chunks (v , all_named_dims = all_named_dims )
197198
198- # Use zarr.zeros to create an empty array with the specified shape and dtype
199- # NOTE: zarr_format=2 is essential, to_zarr() will fail if zarr_format=2 is used
200- data = zarr_zeros (shape = shape , dtype = dtype , zarr_format = 2 )
199+ if hasattr (dtype , "fields" ):
200+ data = zarr_zeros (shape = shape , dtype = dtype , zarr_format = 2 )
201+ else :
202+ data = dask_array .zeros (shape = shape , dtype = dtype , chunks = chunks )
203+
204+ # Use dask.array.zeros to create a lazy array with the specified shape and dtype
205+ # This prevents eager memory allocation while maintaining support for structured dtypes
206+ data = dask_array .zeros (shape = shape , dtype = dtype , chunks = chunks )
201207 # Create a DataArray for the variable. We will set coords in the second pass
202208 dim_names = _get_dimension_names (v )
203209 data_array = xr_DataArray (data , dims = dim_names )
0 commit comments