From 7bf60c0562e3c2b2207dafef82d8ef633c0c7f81 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 9 Aug 2025 18:35:20 -0400 Subject: [PATCH 1/5] Include serializer from ManifestStore in Icechunk virtual references --- virtualizarr/manifests/store.py | 4 +++- virtualizarr/writers/icechunk.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/virtualizarr/manifests/store.py b/virtualizarr/manifests/store.py index 0076ce2c..634e749b 100644 --- a/virtualizarr/manifests/store.py +++ b/virtualizarr/manifests/store.py @@ -179,12 +179,14 @@ async def get( key, marr.metadata.chunk_key_encoding.separator ) + if manifest._paths.shape == (1,) and chunk_indexes == (): + # TODO: Kerchunk parsers (e.g., NetCDF3) are returning 1 dimensional arrays for scalar arrays, this should be addressed at the source of the issue rather than as a workaround + chunk_indexes = (0,) path = manifest._paths[chunk_indexes] if path == "": return None offset = manifest._offsets[chunk_indexes] length = manifest._lengths[chunk_indexes] - # Get the configured object store instance that matches the path store, path_after_prefix = self._registry.resolve(path) if not store: diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py index d2e11918..219ec1f9 100644 --- a/virtualizarr/writers/icechunk.py +++ b/virtualizarr/writers/icechunk.py @@ -339,7 +339,7 @@ def write_virtual_variable_to_icechunk( else: append_axis = None # TODO: Should codecs be an argument to zarr's AsyncrGroup.create_array? - filters, _, compressors = extract_codecs(metadata.codecs) + filters, serializer, compressors = extract_codecs(metadata.codecs) arr = group.require_array( name=name, shape=metadata.shape, @@ -347,6 +347,7 @@ def write_virtual_variable_to_icechunk( dtype=metadata.data_type.to_native_dtype(), filters=filters, compressors=compressors, + serializer=serializer, dimension_names=var.dims, fill_value=metadata.fill_value, ) From 2433372006a385cb6cd4a34f407c2643f46c6232 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 9 Aug 2025 18:48:48 -0400 Subject: [PATCH 2/5] Add a test --- .../tests/test_writers/test_icechunk.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/virtualizarr/tests/test_writers/test_icechunk.py b/virtualizarr/tests/test_writers/test_icechunk.py index 2e2cb9fc..0521c90d 100644 --- a/virtualizarr/tests/test_writers/test_icechunk.py +++ b/virtualizarr/tests/test_writers/test_icechunk.py @@ -5,12 +5,15 @@ import numpy as np import numpy.testing as npt +import obstore import pytest import xarray as xr import xarray.testing as xrt import zarr +from zarr.codecs import BytesCodec from zarr.core.buffer import default_buffer_prototype from zarr.core.metadata import ArrayV3Metadata +from zarr.dtype import parse_data_type from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.tests.utils import PYTEST_TMP_DIRECTORY_URL_PREFIX @@ -57,6 +60,48 @@ def icechunk_filestore(icechunk_repo: "Repository") -> "IcechunkStore": return session.store +@pytest.fixture() +def big_endian_synthetic_vds(tmpdir: Path): + filepath = f"{tmpdir}/data_chunk" + store = obstore.store.LocalStore() + arr = np.array([1, 2, 3, 4, 5, 6], dtype=">i4").reshape(3, 2) + shape = arr.shape + dtype = arr.dtype + buf = arr.tobytes() + obstore.put( + store, + filepath, + buf, + ) + manifest = ChunkManifest( + {"0.0": {"path": filepath, "offset": 0, "length": len(buf)}} + ) + zdtype = parse_data_type(dtype, zarr_format=3) + metadata = ArrayV3Metadata( + shape=shape, + data_type=zdtype, + chunk_grid={ + "name": "regular", + "configuration": {"chunk_shape": shape}, + }, + chunk_key_encoding={"name": "default"}, + fill_value=zdtype.default_scalar(), + codecs=[BytesCodec(endian="big")], + attributes={}, + dimension_names=("y", "x"), + storage_transformers=None, + ) + ma = ManifestArray( + chunkmanifest=manifest, + metadata=metadata, + ) + foo = xr.Variable(data=ma, dims=["y", "x"], encoding={"scale_factor": 2}) + vds = xr.Dataset( + {"foo": foo}, + ) + return vds, arr + + @pytest.mark.parametrize("kwarg", [("group", {}), ("append_dim", {})]) def test_invalid_kwarg_type( icechunk_filestore: "IcechunkStore", @@ -287,6 +332,20 @@ def test_set_grid_virtual_refs( ) +def test_write_big_endian_value(icechunk_repo: "Repository", big_endian_synthetic_vds): + vds, arr = big_endian_synthetic_vds + vds = vds.drop_encoding() + # Commit the first virtual dataset + writable_session = icechunk_repo.writable_session("main") + vds.vz.to_icechunk(writable_session.store) + writable_session.commit("test commit") + read_session = icechunk_repo.readonly_session(branch="main") + with ( + xr.open_zarr(read_session.store, consolidated=False, zarr_format=3) as ds, + ): + np.testing.assert_equal(ds["foo"].data, arr) + + def test_write_loadable_variable( icechunk_filestore: "IcechunkStore", simple_netcdf4: Path, From e3b7c2940a56f63eaf9c87f5fdad661e81468d28 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Tue, 12 Aug 2025 16:39:44 -0400 Subject: [PATCH 3/5] Remove unrelated fix --- virtualizarr/manifests/store.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/virtualizarr/manifests/store.py b/virtualizarr/manifests/store.py index 634e749b..7d774dd3 100644 --- a/virtualizarr/manifests/store.py +++ b/virtualizarr/manifests/store.py @@ -179,9 +179,6 @@ async def get( key, marr.metadata.chunk_key_encoding.separator ) - if manifest._paths.shape == (1,) and chunk_indexes == (): - # TODO: Kerchunk parsers (e.g., NetCDF3) are returning 1 dimensional arrays for scalar arrays, this should be addressed at the source of the issue rather than as a workaround - chunk_indexes = (0,) path = manifest._paths[chunk_indexes] if path == "": return None From a19c8830e2c28997aa5a9887f5158c236683ee4b Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 13 Aug 2025 21:39:43 +0100 Subject: [PATCH 4/5] release note --- docs/releases.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/releases.md b/docs/releases.md index d0c33624..fdcaf906 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -8,6 +8,8 @@ ### Bug fixes +- Fix handling of big-endian data in Icechunk by making sure that non-default zarr serializers are included in the zarr array metadata [#766](https://github.com/zarr-developers/VirtualiZarr/issues/766)). By [Max Jones](https://github.com/maxrjones) + ### Documentation ### Internal changes From 38873e8a7058361a2da94b1991b84abfcfdf5865 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 13 Aug 2025 21:41:46 +0100 Subject: [PATCH 5/5] remove trailing ) --- docs/releases.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/releases.md b/docs/releases.md index fdcaf906..7b7e3f38 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -8,7 +8,7 @@ ### Bug fixes -- Fix handling of big-endian data in Icechunk by making sure that non-default zarr serializers are included in the zarr array metadata [#766](https://github.com/zarr-developers/VirtualiZarr/issues/766)). By [Max Jones](https://github.com/maxrjones) +- Fix handling of big-endian data in Icechunk by making sure that non-default zarr serializers are included in the zarr array metadata [#766](https://github.com/zarr-developers/VirtualiZarr/issues/766). By [Max Jones](https://github.com/maxrjones) ### Documentation