Skip to content

Commit 83a6f10

Browse files
Update to zarr 3 and main kerchunk (#406)
* Use open_dataset_kerchunk in roundtrip tests that don't otherwise require kerchunk * Make it clear that integration tests require zarr-python * Add in-memory icechunk tests to existing roundtrip tests * Playing around with icechunk / zarr / xarray upgrade * Passing icechunk tests * Update tests to latest kerchunk * Remove icechunk roundtripping * Fixed some warnings * Fixed codec test * Fix warnings in test_backend.py * Tests passing * Remove obsolete comment * Add fill value to fixture * Remove obsolete conditional to ds.close() * Reset workflows with --cov * Reset conftest.py fixtures (air encoding) * Reset contributiong (--cov) removed * Remove context manager from readers/common.py * Reset test_backend with ds.dims * Reset test_icechunk (air encoding) * Fix change that snuck in on #395 --------- Co-authored-by: Aimee Barciauskas <[email protected]>
1 parent 61847e9 commit 83a6f10

File tree

12 files changed

+101
-92
lines changed

12 files changed

+101
-92
lines changed

ci/upstream.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ channels:
33
- conda-forge
44
- nodefaults
55
dependencies:
6-
- xarray>=2024.10.0,<2025.0.0
6+
- xarray>=2025.1.1
77
- h5netcdf
88
- h5py
99
- hdf5
@@ -29,6 +29,6 @@ dependencies:
2929
- fsspec
3030
- pip
3131
- pip:
32-
- icechunk==0.1.0a8 # Installs zarr v3 beta 3 as dependency
33-
# - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516)
32+
- icechunk>=0.1.0a12 # Installs python-zarr v3 as dependency
33+
- git+https://github.com/fsspec/kerchunk.git@main
3434
- imagecodecs-numcodecs==2024.6.1

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ classifiers = [
2121
requires-python = ">=3.10"
2222
dynamic = ["version"]
2323
dependencies = [
24-
"xarray>=2024.10.0,<2025.0.0",
24+
"xarray>=2025.1.1",
2525
"numpy>=2.0.0",
2626
"packaging",
2727
"universal-pathlib",
@@ -39,7 +39,7 @@ hdf_reader = [
3939
"numcodecs"
4040
]
4141
icechunk = [
42-
"icechunk==0.1.0a8",
42+
"icechunk>=0.1.0a12",
4343
]
4444
test = [
4545
"codecov",

virtualizarr/codecs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def _get_manifestarray_codecs(
5555
) -> Union[Codec, tuple["ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec", ...]]:
5656
"""Get codecs for a ManifestArray based on its zarr_format."""
5757
if normalize_to_zarr_v3 or array.zarray.zarr_format == 3:
58-
return array.zarray._v3_codec_pipeline()
58+
return (array.zarray.serializer(),) + array.zarray._v3_codec_pipeline()
5959
elif array.zarray.zarr_format == 2:
6060
return array.zarray.codec
6161
else:

virtualizarr/tests/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,9 @@ def _importorskip(
3535

3636

3737
has_astropy, requires_astropy = _importorskip("astropy")
38+
has_icechunk, requires_icechunk = _importorskip("icechunk")
3839
has_kerchunk, requires_kerchunk = _importorskip("kerchunk")
40+
has_fastparquet, requires_fastparquet = _importorskip("fastparquet")
3941
has_s3fs, requires_s3fs = _importorskip("s3fs")
4042
has_scipy, requires_scipy = _importorskip("scipy")
4143
has_tifffile, requires_tifffile = _importorskip("tifffile")

virtualizarr/tests/test_codecs.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,9 @@ def test_manifest_array_zarr_v2_normalized(self):
5858

5959
# Get codecs and verify
6060
actual_codecs = get_codecs(manifest_array, normalize_to_zarr_v3=True)
61-
expected_codecs = manifest_array.zarray._v3_codec_pipeline()
61+
expected_codecs = (
62+
manifest_array.zarray.serializer(),
63+
) + manifest_array.zarray._v3_codec_pipeline()
6264
assert actual_codecs == expected_codecs
6365

6466
@requires_zarr_python_v3

virtualizarr/tests/test_integration.py

Lines changed: 57 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,13 @@
88

99
from virtualizarr import open_virtual_dataset
1010
from virtualizarr.manifests import ChunkManifest, ManifestArray
11-
from virtualizarr.tests import parametrize_over_hdf_backends, requires_kerchunk
11+
from virtualizarr.tests import (
12+
has_fastparquet,
13+
has_kerchunk,
14+
parametrize_over_hdf_backends,
15+
requires_kerchunk,
16+
requires_zarr_python,
17+
)
1218
from virtualizarr.translators.kerchunk import (
1319
dataset_from_kerchunk_refs,
1420
)
@@ -34,16 +40,16 @@ def test_kerchunk_roundtrip_in_memory_no_concat():
3440
),
3541
chunkmanifest=manifest,
3642
)
37-
ds = xr.Dataset({"a": (["x", "y"], marr)})
43+
vds = xr.Dataset({"a": (["x", "y"], marr)})
3844

3945
# Use accessor to write it out to kerchunk reference dict
40-
ds_refs = ds.virtualize.to_kerchunk(format="dict")
46+
ds_refs = vds.virtualize.to_kerchunk(format="dict")
4147

4248
# Use dataset_from_kerchunk_refs to reconstruct the dataset
4349
roundtrip = dataset_from_kerchunk_refs(ds_refs)
4450

4551
# Assert equal to original dataset
46-
xrt.assert_equal(roundtrip, ds)
52+
xrt.assert_equal(roundtrip, vds)
4753

4854

4955
@requires_kerchunk
@@ -84,11 +90,45 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(
8490
assert refs["refs"]["time/0"] == expected["refs"]["time/0"]
8591

8692

87-
@requires_kerchunk
88-
@pytest.mark.parametrize("format", ["dict", "json", "parquet"])
89-
class TestKerchunkRoundtrip:
93+
def roundtrip_as_kerchunk_dict(vds: xr.Dataset, tmpdir, **kwargs):
94+
# write those references to an in-memory kerchunk-formatted references dictionary
95+
ds_refs = vds.virtualize.to_kerchunk(format="dict")
96+
97+
# use fsspec to read the dataset from the kerchunk references dict
98+
return xr.open_dataset(ds_refs, engine="kerchunk", **kwargs)
99+
100+
101+
def roundtrip_as_kerchunk_json(vds: xr.Dataset, tmpdir, **kwargs):
102+
# write those references to disk as kerchunk references format
103+
vds.virtualize.to_kerchunk(f"{tmpdir}/refs.json", format="json")
104+
105+
# use fsspec to read the dataset from disk via the kerchunk references
106+
return xr.open_dataset(f"{tmpdir}/refs.json", engine="kerchunk", **kwargs)
107+
108+
109+
def roundtrip_as_kerchunk_parquet(vds: xr.Dataset, tmpdir, **kwargs):
110+
# write those references to disk as kerchunk references format
111+
vds.virtualize.to_kerchunk(f"{tmpdir}/refs.parquet", format="parquet")
112+
113+
# use fsspec to read the dataset from disk via the kerchunk references
114+
return xr.open_dataset(f"{tmpdir}/refs.parquet", engine="kerchunk", **kwargs)
115+
116+
117+
@requires_zarr_python
118+
@pytest.mark.parametrize(
119+
"roundtrip_func",
120+
[
121+
*(
122+
[roundtrip_as_kerchunk_dict, roundtrip_as_kerchunk_json]
123+
if has_kerchunk
124+
else []
125+
),
126+
*([roundtrip_as_kerchunk_parquet] if has_kerchunk and has_fastparquet else []),
127+
],
128+
)
129+
class TestRoundtrip:
90130
@parametrize_over_hdf_backends
91-
def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
131+
def test_roundtrip_no_concat(self, tmpdir, roundtrip_func, hdf_backend):
92132
# set up example xarray dataset
93133
ds = xr.tutorial.open_dataset("air_temperature", decode_times=False)
94134

@@ -98,20 +138,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
98138
# use open_dataset_via_kerchunk to read it as references
99139
vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={}, backend=hdf_backend)
100140

101-
if format == "dict":
102-
# write those references to an in-memory kerchunk-formatted references dictionary
103-
ds_refs = vds.virtualize.to_kerchunk(format=format)
104-
105-
# use fsspec to read the dataset from the kerchunk references dict
106-
roundtrip = xr.open_dataset(ds_refs, engine="kerchunk", decode_times=False)
107-
else:
108-
# write those references to disk as kerchunk references format
109-
vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
110-
111-
# use fsspec to read the dataset from disk via the kerchunk references
112-
roundtrip = xr.open_dataset(
113-
f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False
114-
)
141+
roundtrip = roundtrip_func(vds, tmpdir, decode_times=False)
115142

116143
# assert all_close to original dataset
117144
xrt.assert_allclose(roundtrip, ds)
@@ -123,7 +150,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
123150
@parametrize_over_hdf_backends
124151
@pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])])
125152
def test_kerchunk_roundtrip_concat(
126-
self, tmpdir, format, hdf_backend, decode_times, time_vars
153+
self, tmpdir, roundtrip_func, hdf_backend, decode_times, time_vars
127154
):
128155
# set up example xarray dataset
129156
ds = xr.tutorial.open_dataset("air_temperature", decode_times=decode_times)
@@ -159,22 +186,7 @@ def test_kerchunk_roundtrip_concat(
159186
# concatenate virtually along time
160187
vds = xr.concat([vds1, vds2], dim="time", coords="minimal", compat="override")
161188

162-
if format == "dict":
163-
# write those references to an in-memory kerchunk-formatted references dictionary
164-
ds_refs = vds.virtualize.to_kerchunk(format=format)
165-
166-
# use fsspec to read the dataset from the kerchunk references dict
167-
roundtrip = xr.open_dataset(
168-
ds_refs, engine="kerchunk", decode_times=decode_times
169-
)
170-
else:
171-
# write those references to disk as kerchunk references format
172-
vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
173-
174-
# use fsspec to read the dataset from disk via the kerchunk references
175-
roundtrip = xr.open_dataset(
176-
f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=decode_times
177-
)
189+
roundtrip = roundtrip_func(vds, tmpdir, decode_times=decode_times)
178190

179191
if decode_times is False:
180192
# assert all_close to original dataset
@@ -191,7 +203,7 @@ def test_kerchunk_roundtrip_concat(
191203
assert roundtrip.time.encoding["calendar"] == ds.time.encoding["calendar"]
192204

193205
@parametrize_over_hdf_backends
194-
def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
206+
def test_non_dimension_coordinates(self, tmpdir, roundtrip_func, hdf_backend):
195207
# regression test for GH issue #105
196208

197209
if hdf_backend:
@@ -209,20 +221,7 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
209221
assert "lat" in vds.coords
210222
assert "coordinates" not in vds.attrs
211223

212-
if format == "dict":
213-
# write those references to an in-memory kerchunk-formatted references dictionary
214-
ds_refs = vds.virtualize.to_kerchunk(format=format)
215-
216-
# use fsspec to read the dataset from the kerchunk references dict
217-
roundtrip = xr.open_dataset(ds_refs, engine="kerchunk", decode_times=False)
218-
else:
219-
# write those references to disk as kerchunk references format
220-
vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
221-
222-
# use fsspec to read the dataset from disk via the kerchunk references
223-
roundtrip = xr.open_dataset(
224-
f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False
225-
)
224+
roundtrip = roundtrip_func(vds, tmpdir)
226225

227226
# assert equal to original dataset
228227
xrt.assert_allclose(roundtrip, ds)
@@ -231,7 +230,7 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
231230
for coord in ds.coords:
232231
assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs
233232

234-
def test_datetime64_dtype_fill_value(self, tmpdir, format):
233+
def test_datetime64_dtype_fill_value(self, tmpdir, roundtrip_func):
235234
chunks_dict = {
236235
"0.0.0": {"path": "/foo.nc", "offset": 100, "length": 100},
237236
}
@@ -249,7 +248,7 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):
249248
zarr_format=2,
250249
)
251250
marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest)
252-
ds = xr.Dataset(
251+
vds = xr.Dataset(
253252
{
254253
"a": xr.DataArray(
255254
marr1,
@@ -260,20 +259,9 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):
260259
}
261260
)
262261

263-
if format == "dict":
264-
# write those references to an in-memory kerchunk-formatted references dictionary
265-
ds_refs = ds.virtualize.to_kerchunk(format=format)
266-
267-
# use fsspec to read the dataset from the kerchunk references dict
268-
roundtrip = xr.open_dataset(ds_refs, engine="kerchunk")
269-
else:
270-
# write those references to disk as kerchunk references format
271-
ds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
272-
273-
# use fsspec to read the dataset from disk via the kerchunk references
274-
roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk")
262+
roundtrip = roundtrip_func(vds, tmpdir)
275263

276-
assert roundtrip.a.attrs == ds.a.attrs
264+
assert roundtrip.a.attrs == vds.a.attrs
277265

278266

279267
@parametrize_over_hdf_backends

virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,22 @@
44

55
import virtualizarr
66
from virtualizarr.readers.hdf import HDFVirtualBackend
7-
from virtualizarr.tests import requires_kerchunk
7+
from virtualizarr.tests import (
8+
requires_hdf5plugin,
9+
requires_imagecodecs,
10+
requires_kerchunk,
11+
)
812

913

1014
@requires_kerchunk
15+
@requires_hdf5plugin
16+
@requires_imagecodecs
1117
class TestIntegration:
1218
@pytest.mark.xfail(
1319
reason="0 time start is being interpreted as fillvalue see issues/280"
1420
)
1521
def test_filters_h5netcdf_roundtrip(
16-
self, tmpdir, filter_encoded_roundtrip_hdf5_file, backend=HDFVirtualBackend
22+
self, tmpdir, filter_encoded_roundtrip_hdf5_file
1723
):
1824
ds = xr.open_dataset(filter_encoded_roundtrip_hdf5_file, decode_times=True)
1925
vds = virtualizarr.open_virtual_dataset(

virtualizarr/tests/test_readers/test_kerchunk.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from virtualizarr.backend import open_virtual_dataset
99
from virtualizarr.manifests import ManifestArray
10-
from virtualizarr.tests import requires_kerchunk
10+
from virtualizarr.tests import has_fastparquet, requires_kerchunk
1111

1212

1313
def gen_ds_refs(
@@ -177,7 +177,7 @@ def test_handle_relative_paths(refs_file_factory):
177177
@requires_kerchunk
178178
@pytest.mark.parametrize(
179179
"reference_format",
180-
["json", "parquet", "invalid"],
180+
["json", "invalid", *(["parquet"] if has_fastparquet else [])],
181181
)
182182
def test_open_virtual_dataset_existing_kerchunk_refs(
183183
tmp_path, netcdf4_virtual_dataset, reference_format

virtualizarr/tests/test_writers/test_kerchunk.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from xarray import Dataset
44

55
from virtualizarr.manifests import ChunkManifest, ManifestArray
6-
from virtualizarr.tests import requires_kerchunk
6+
from virtualizarr.tests import requires_fastparquet, requires_kerchunk
77

88

99
@requires_kerchunk
@@ -108,6 +108,7 @@ def test_accessor_to_kerchunk_json(self, tmp_path):
108108
}
109109
assert loaded_refs == expected_ds_refs
110110

111+
@requires_fastparquet
111112
def test_accessor_to_kerchunk_parquet(self, tmp_path):
112113
import ujson
113114

virtualizarr/tests/test_writers/test_zarr.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: Dataset)
4242
assert isinstance(metadata["fill_value"], (bool, int, float, str, list))
4343
assert (
4444
isinstance(metadata["codecs"], list)
45-
and len(metadata["codecs"]) > 1
45+
and len(metadata["codecs"]) == 1
4646
and all(isconfigurable(codec) for codec in metadata["codecs"])
4747
)
4848

0 commit comments

Comments (0)