diff --git a/docs/releases.md b/docs/releases.md index d0c33624..7b7e3f38 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -8,6 +8,8 @@ ### Bug fixes +- Fix handling of big-endian data in Icechunk by making sure that non-default zarr serializers are included in the zarr array metadata [#766](https://github.com/zarr-developers/VirtualiZarr/issues/766). By [Max Jones](https://github.com/maxrjones) + ### Documentation ### Internal changes diff --git a/virtualizarr/manifests/store.py b/virtualizarr/manifests/store.py index 0076ce2c..7d774dd3 100644 --- a/virtualizarr/manifests/store.py +++ b/virtualizarr/manifests/store.py @@ -184,7 +184,6 @@ async def get( return None offset = manifest._offsets[chunk_indexes] length = manifest._lengths[chunk_indexes] - # Get the configured object store instance that matches the path store, path_after_prefix = self._registry.resolve(path) if not store: diff --git a/virtualizarr/tests/test_writers/test_icechunk.py b/virtualizarr/tests/test_writers/test_icechunk.py index 2e2cb9fc..0521c90d 100644 --- a/virtualizarr/tests/test_writers/test_icechunk.py +++ b/virtualizarr/tests/test_writers/test_icechunk.py @@ -5,12 +5,15 @@ import numpy as np import numpy.testing as npt +import obstore import pytest import xarray as xr import xarray.testing as xrt import zarr +from zarr.codecs import BytesCodec from zarr.core.buffer import default_buffer_prototype from zarr.core.metadata import ArrayV3Metadata +from zarr.dtype import parse_data_type from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.tests.utils import PYTEST_TMP_DIRECTORY_URL_PREFIX @@ -57,6 +60,48 @@ def icechunk_filestore(icechunk_repo: "Repository") -> "IcechunkStore": return session.store +@pytest.fixture() +def big_endian_synthetic_vds(tmpdir: Path): + filepath = f"{tmpdir}/data_chunk" + store = obstore.store.LocalStore() + arr = np.array([1, 2, 3, 4, 5, 6], dtype=">i4").reshape(3, 2) + shape = arr.shape + dtype = arr.dtype + buf = arr.tobytes() + obstore.put( + store, + filepath, + buf, + ) + manifest = ChunkManifest( + {"0.0": {"path": filepath, "offset": 0, "length": len(buf)}} + ) + zdtype = parse_data_type(dtype, zarr_format=3) + metadata = ArrayV3Metadata( + shape=shape, + data_type=zdtype, + chunk_grid={ + "name": "regular", + "configuration": {"chunk_shape": shape}, + }, + chunk_key_encoding={"name": "default"}, + fill_value=zdtype.default_scalar(), + codecs=[BytesCodec(endian="big")], + attributes={}, + dimension_names=("y", "x"), + storage_transformers=None, + ) + ma = ManifestArray( + chunkmanifest=manifest, + metadata=metadata, + ) + foo = xr.Variable(data=ma, dims=["y", "x"], encoding={"scale_factor": 2}) + vds = xr.Dataset( + {"foo": foo}, + ) + return vds, arr + + @pytest.mark.parametrize("kwarg", [("group", {}), ("append_dim", {})]) def test_invalid_kwarg_type( icechunk_filestore: "IcechunkStore", @@ -287,6 +332,20 @@ def test_set_grid_virtual_refs( ) +def test_write_big_endian_value(icechunk_repo: "Repository", big_endian_synthetic_vds): + vds, arr = big_endian_synthetic_vds + vds = vds.drop_encoding() + # Commit the first virtual dataset + writable_session = icechunk_repo.writable_session("main") + vds.vz.to_icechunk(writable_session.store) + writable_session.commit("test commit") + read_session = icechunk_repo.readonly_session(branch="main") + with ( + xr.open_zarr(read_session.store, consolidated=False, zarr_format=3) as ds, + ): + np.testing.assert_equal(ds["foo"].data, arr) + + def test_write_loadable_variable( icechunk_filestore: "IcechunkStore", simple_netcdf4: Path, diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py index d2e11918..219ec1f9 100644 --- a/virtualizarr/writers/icechunk.py +++ b/virtualizarr/writers/icechunk.py @@ -339,7 +339,7 @@ def write_virtual_variable_to_icechunk( else: append_axis = None # TODO: Should codecs be an argument to zarr's AsyncrGroup.create_array? - filters, _, compressors = extract_codecs(metadata.codecs) + filters, serializer, compressors = extract_codecs(metadata.codecs) arr = group.require_array( name=name, shape=metadata.shape, @@ -347,6 +347,7 @@ def write_virtual_variable_to_icechunk( dtype=metadata.data_type.to_native_dtype(), filters=filters, compressors=compressors, + serializer=serializer, dimension_names=var.dims, fill_value=metadata.fill_value, )