From 732f104fad58a5d6970694bc81cba091ec532b85 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sun, 29 Jun 2025 16:20:12 -0400 Subject: [PATCH 1/2] Support big-endian HDF5 and NetCDF3 files --- virtualizarr/codecs.py | 7 ++-- virtualizarr/manifests/utils.py | 2 +- virtualizarr/tests/test_parsers/conftest.py | 2 ++ .../tests/test_parsers/test_hdf/test_hdf.py | 19 +++++++---- .../tests/test_parsers/test_netcdf3.py | 33 +++++-------------- 5 files changed, 29 insertions(+), 34 deletions(-) diff --git a/virtualizarr/codecs.py b/virtualizarr/codecs.py index 164d0ae7e..343ea65fe 100644 --- a/virtualizarr/codecs.py +++ b/virtualizarr/codecs.py @@ -4,6 +4,7 @@ import zarr from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.abc.codec import Codec as ZarrCodec +from zarr.codecs import BytesCodec from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.metadata.v3 import ArrayV3Metadata @@ -64,7 +65,6 @@ def convert_to_codec_pipeline( ------- BatchedCodecPipeline """ - from zarr.core.array import _get_default_chunk_encoding_v3 from zarr.registry import get_codec_class zarr_codecs: tuple[ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec, ...] = () @@ -78,7 +78,10 @@ def convert_to_codec_pipeline( arrayarray_codecs, arraybytes_codec, bytesbytes_codecs = extract_codecs(zarr_codecs) if arraybytes_codec is None: - arraybytes_codec = _get_default_chunk_encoding_v3(dtype)[1] + if dtype.byteorder == ">": + arraybytes_codec = BytesCodec(endian="big") + else: + arraybytes_codec = BytesCodec() codec_pipeline = BatchedCodecPipeline( array_array_codecs=arrayarray_codecs, diff --git a/virtualizarr/manifests/utils.py b/virtualizarr/manifests/utils.py index 7fb5ae616..95f4533a7 100644 --- a/virtualizarr/manifests/utils.py +++ b/virtualizarr/manifests/utils.py @@ -55,7 +55,7 @@ def create_v3_array_metadata( """ return ArrayV3Metadata( shape=shape, - data_type=data_type, + data_type=data_type.name if hasattr(data_type, "name") else data_type, chunk_grid={ "name": "regular", "configuration": {"chunk_shape": chunk_shape}, diff --git a/virtualizarr/tests/test_parsers/conftest.py b/virtualizarr/tests/test_parsers/conftest.py index 81e6154fa..a761f9cdf 100644 --- a/virtualizarr/tests/test_parsers/conftest.py +++ b/virtualizarr/tests/test_parsers/conftest.py @@ -457,4 +457,6 @@ def big_endian_dtype_hdf5_file(tmpdir): filepath = f"{tmpdir}/big_endian.nc" f = h5py.File(filepath, "w") f.create_dataset("data", shape=(10,), dtype=">f4") + dset = f["data"] + dset[...] = 10 return filepath diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py index 4a48572d7..e4f35b32c 100644 --- a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py +++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py @@ -1,6 +1,7 @@ import h5py # type: ignore import numpy as np import pytest +import xarray as xr from virtualizarr import open_virtual_dataset from virtualizarr.parsers import HDFParser @@ -221,19 +222,23 @@ def test_coord_names( ) as vds: assert set(vds.coords) == {"lat", "lon"} - @pytest.mark.xfail(reason="Requires Zarr v3 big endian dtype support") def test_big_endian( self, big_endian_dtype_hdf5_file, ): store = obstore_local(file_url=big_endian_dtype_hdf5_file) parser = HDFParser() - with open_virtual_dataset( - file_url=big_endian_dtype_hdf5_file, - object_store=store, - parser=parser, - ) as vds: - print(vds) + with ( + parser( + file_url=big_endian_dtype_hdf5_file, object_store=store + ) as manifest_store, + xr.open_dataset(big_endian_dtype_hdf5_file) as expected, + ): + observed = xr.open_dataset( + manifest_store, engine="zarr", consolidated=False, zarr_format=3 + ) + assert isinstance(observed, xr.Dataset) + xr.testing.assert_identical(observed.load(), expected.load()) @requires_hdf5plugin diff --git a/virtualizarr/tests/test_parsers/test_netcdf3.py b/virtualizarr/tests/test_parsers/test_netcdf3.py index 1b7b25fb1..0a84a3f86 100644 --- a/virtualizarr/tests/test_parsers/test_netcdf3.py +++ b/virtualizarr/tests/test_parsers/test_netcdf3.py @@ -1,45 +1,30 @@ -import pytest import xarray as xr import xarray.testing as xrt from virtualizarr import open_virtual_dataset -from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.parsers import NetCDF3Parser from virtualizarr.tests import requires_network, requires_scipy from virtualizarr.tests.utils import obstore_http, obstore_local @requires_scipy -@pytest.mark.xfail( - reason="Big endian not yet supported by zarr-python 3.0" -) # https://github.com/zarr-developers/zarr-python/issues/2324 def test_read_netcdf3(netcdf3_file, array_v3_metadata): filepath = str(netcdf3_file) store = obstore_local(file_url=filepath) parser = NetCDF3Parser() - with open_virtual_dataset( - file_url=filepath, - parser=parser, - object_store=store, - ) as vds: - assert isinstance(vds, xr.Dataset) - assert list(vds.variables.keys()) == ["foo"] - assert isinstance(vds["foo"].data, ManifestArray) - - expected_manifest = ChunkManifest( - entries={"0": {"path": filepath, "offset": 80, "length": 12}} + with ( + parser(file_url=filepath, object_store=store) as manifest_store, + xr.open_dataset(filepath) as expected, + ): + observed = xr.open_dataset( + manifest_store, engine="zarr", consolidated=False, zarr_format=3 ) - metadata = array_v3_metadata(shape=(3,), chunks=(3,)) - expected_ma = ManifestArray(chunkmanifest=expected_manifest, metadata=metadata) - expected_vds = xr.Dataset({"foo": xr.Variable(data=expected_ma, dims=["x"])}) - - xrt.assert_identical(vds, expected_vds) + assert isinstance(observed, xr.Dataset) + assert list(observed.variables.keys()) == ["foo"] + xrt.assert_identical(observed.load(), expected.load()) @requires_network -@pytest.mark.xfail( - reason="Big endian not yet supported by zarr-python 3.0" -) # https://github.com/zarr-developers/zarr-python/issues/2324 def test_read_http_netcdf3(array_v3_metadata): file_url = "https://github.com/pydata/xarray-data/raw/master/air_temperature.nc" store = obstore_http(file_url=file_url) From 81200ba0cdbc5a013f40e73546ebda3049fe04c4 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sun, 29 Jun 2025 16:47:43 -0400 Subject: [PATCH 2/2] Mark test as requires_kerchunk --- virtualizarr/tests/test_parsers/test_netcdf3.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_parsers/test_netcdf3.py b/virtualizarr/tests/test_parsers/test_netcdf3.py index 0a84a3f86..b3e67f2bf 100644 --- a/virtualizarr/tests/test_parsers/test_netcdf3.py +++ b/virtualizarr/tests/test_parsers/test_netcdf3.py @@ -3,7 +3,7 @@ from virtualizarr import open_virtual_dataset from virtualizarr.parsers import NetCDF3Parser -from virtualizarr.tests import requires_network, requires_scipy +from virtualizarr.tests import requires_kerchunk, requires_network, requires_scipy from virtualizarr.tests.utils import obstore_http, obstore_local @@ -24,6 +24,7 @@ def test_read_netcdf3(netcdf3_file, array_v3_metadata): xrt.assert_identical(observed.load(), expected.load()) +@requires_kerchunk @requires_network def test_read_http_netcdf3(array_v3_metadata): file_url = "https://github.com/pydata/xarray-data/raw/master/air_temperature.nc"