Skip to content

Commit 83a6f10

Browse files
Update to zarr 3 and main kerchunk (#406)
* Use open_dataset_kerchunk in roundtrip tests that don't otherwise require kerchunk * Make it clear that integration tests require zarr-python * Add in-memory icechunk tests to existing roundtrip tests * Playing around with icechunk / zarr / xarray upgrade * Passing icechunk tests * Update tests to latest kerchunk * Remove icechunk roundtripping * Fixed some warnings * Fixed codec test * Fix warnings in test_backend.py * Tests passing * Remove obsolete comment * Add fill value to fixture * Remove obsolete conditional to ds.close() * Reset workflows with --cov * Reset conftest.py fixtures (air encoding) * Reset contributiong (--cov) removed * Remove context manager from readers/common.py * Reset test_backend with ds.dims * Reset test_icechunk (air encoding) * Fix change that snuck in on #395 --------- Co-authored-by: Aimee Barciauskas <[email protected]>
1 parent 61847e9 commit 83a6f10

File tree

12 files changed

+101
-92
lines changed

12 files changed

+101
-92
lines changed

ci/upstream.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ channels:
33
- conda-forge
44
- nodefaults
55
dependencies:
6-
- xarray>=2024.10.0,<2025.0.0
6+
- xarray>=2025.1.1
77
- h5netcdf
88
- h5py
99
- hdf5
@@ -29,6 +29,6 @@ dependencies:
2929
- fsspec
3030
- pip
3131
- pip:
32-
- icechunk==0.1.0a8 # Installs zarr v3 beta 3 as dependency
33-
# - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516)
32+
- icechunk>=0.1.0a12 # Installs python-zarr v3 as dependency
33+
- git+https://github.com/fsspec/kerchunk.git@main
3434
- imagecodecs-numcodecs==2024.6.1

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ classifiers = [
2121
requires-python = ">=3.10"
2222
dynamic = ["version"]
2323
dependencies = [
24-
"xarray>=2024.10.0,<2025.0.0",
24+
"xarray>=2025.1.1",
2525
"numpy>=2.0.0",
2626
"packaging",
2727
"universal-pathlib",
@@ -39,7 +39,7 @@ hdf_reader = [
3939
"numcodecs"
4040
]
4141
icechunk = [
42-
"icechunk==0.1.0a8",
42+
"icechunk>=0.1.0a12",
4343
]
4444
test = [
4545
"codecov",

virtualizarr/codecs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def _get_manifestarray_codecs(
5555
) -> Union[Codec, tuple["ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec", ...]]:
5656
"""Get codecs for a ManifestArray based on its zarr_format."""
5757
if normalize_to_zarr_v3 or array.zarray.zarr_format == 3:
58-
return array.zarray._v3_codec_pipeline()
58+
return (array.zarray.serializer(),) + array.zarray._v3_codec_pipeline()
5959
elif array.zarray.zarr_format == 2:
6060
return array.zarray.codec
6161
else:

virtualizarr/tests/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,9 @@ def _importorskip(
3535

3636

3737
has_astropy, requires_astropy = _importorskip("astropy")
38+
has_icechunk, requires_icechunk = _importorskip("icechunk")
3839
has_kerchunk, requires_kerchunk = _importorskip("kerchunk")
40+
has_fastparquet, requires_fastparquet = _importorskip("fastparquet")
3941
has_s3fs, requires_s3fs = _importorskip("s3fs")
4042
has_scipy, requires_scipy = _importorskip("scipy")
4143
has_tifffile, requires_tifffile = _importorskip("tifffile")

virtualizarr/tests/test_codecs.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,9 @@ def test_manifest_array_zarr_v2_normalized(self):
5858

5959
# Get codecs and verify
6060
actual_codecs = get_codecs(manifest_array, normalize_to_zarr_v3=True)
61-
expected_codecs = manifest_array.zarray._v3_codec_pipeline()
61+
expected_codecs = (
62+
manifest_array.zarray.serializer(),
63+
) + manifest_array.zarray._v3_codec_pipeline()
6264
assert actual_codecs == expected_codecs
6365

6466
@requires_zarr_python_v3

virtualizarr/tests/test_integration.py

Lines changed: 57 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,13 @@
88

99
from virtualizarr import open_virtual_dataset
1010
from virtualizarr.manifests import ChunkManifest, ManifestArray
11-
from virtualizarr.tests import parametrize_over_hdf_backends, requires_kerchunk
11+
from virtualizarr.tests import (
12+
has_fastparquet,
13+
has_kerchunk,
14+
parametrize_over_hdf_backends,
15+
requires_kerchunk,
16+
requires_zarr_python,
17+
)
1218
from virtualizarr.translators.kerchunk import (
1319
dataset_from_kerchunk_refs,
1420
)
@@ -34,16 +40,16 @@ def test_kerchunk_roundtrip_in_memory_no_concat():
3440
),
3541
chunkmanifest=manifest,
3642
)
37-
ds = xr.Dataset({"a": (["x", "y"], marr)})
43+
vds = xr.Dataset({"a": (["x", "y"], marr)})
3844

3945
# Use accessor to write it out to kerchunk reference dict
40-
ds_refs = ds.virtualize.to_kerchunk(format="dict")
46+
ds_refs = vds.virtualize.to_kerchunk(format="dict")
4147

4248
# Use dataset_from_kerchunk_refs to reconstruct the dataset
4349
roundtrip = dataset_from_kerchunk_refs(ds_refs)
4450

4551
# Assert equal to original dataset
46-
xrt.assert_equal(roundtrip, ds)
52+
xrt.assert_equal(roundtrip, vds)
4753

4854

4955
@requires_kerchunk
@@ -84,11 +90,45 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(
8490
assert refs["refs"]["time/0"] == expected["refs"]["time/0"]
8591

8692

87-
@requires_kerchunk
88-
@pytest.mark.parametrize("format", ["dict", "json", "parquet"])
89-
class TestKerchunkRoundtrip:
93+
def roundtrip_as_kerchunk_dict(vds: xr.Dataset, tmpdir, **kwargs):
94+
# write those references to an in-memory kerchunk-formatted references dictionary
95+
ds_refs = vds.virtualize.to_kerchunk(format="dict")
96+
97+
# use fsspec to read the dataset from the kerchunk references dict
98+
return xr.open_dataset(ds_refs, engine="kerchunk", **kwargs)
99+
100+
101+
def roundtrip_as_kerchunk_json(vds: xr.Dataset, tmpdir, **kwargs):
102+
# write those references to disk as kerchunk references format
103+
vds.virtualize.to_kerchunk(f"{tmpdir}/refs.json", format="json")
104+
105+
# use fsspec to read the dataset from disk via the kerchunk references
106+
return xr.open_dataset(f"{tmpdir}/refs.json", engine="kerchunk", **kwargs)
107+
108+
109+
def roundtrip_as_kerchunk_parquet(vds: xr.Dataset, tmpdir, **kwargs):
110+
# write those references to disk as kerchunk references format
111+
vds.virtualize.to_kerchunk(f"{tmpdir}/refs.parquet", format="parquet")
112+
113+
# use fsspec to read the dataset from disk via the kerchunk references
114+
return xr.open_dataset(f"{tmpdir}/refs.parquet", engine="kerchunk", **kwargs)
115+
116+
117+
@requires_zarr_python
118+
@pytest.mark.parametrize(
119+
"roundtrip_func",
120+
[
121+
*(
122+
[roundtrip_as_kerchunk_dict, roundtrip_as_kerchunk_json]
123+
if has_kerchunk
124+
else []
125+
),
126+
*([roundtrip_as_kerchunk_parquet] if has_kerchunk and has_fastparquet else []),
127+
],
128+
)
129+
class TestRoundtrip:
90130
@parametrize_over_hdf_backends
91-
def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
131+
def test_roundtrip_no_concat(self, tmpdir, roundtrip_func, hdf_backend):
92132
# set up example xarray dataset
93133
ds = xr.tutorial.open_dataset("air_temperature", decode_times=False)
94134

@@ -98,20 +138,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
98138
# use open_dataset_via_kerchunk to read it as references
99139
vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={}, backend=hdf_backend)
100140

101-
if format == "dict":
102-
# write those references to an in-memory kerchunk-formatted references dictionary
103-
ds_refs = vds.virtualize.to_kerchunk(format=format)
104-
105-
# use fsspec to read the dataset from the kerchunk references dict
106-
roundtrip = xr.open_dataset(ds_refs, engine="kerchunk", decode_times=False)
107-
else:
108-
# write those references to disk as kerchunk references format
109-
vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
110-
111-
# use fsspec to read the dataset from disk via the kerchunk references
112-
roundtrip = xr.open_dataset(
113-
f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False
114-
)
141+
roundtrip = roundtrip_func(vds, tmpdir, decode_times=False)
115142

116143
# assert all_close to original dataset
117144
xrt.assert_allclose(roundtrip, ds)
@@ -123,7 +150,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
123150
@parametrize_over_hdf_backends
124151
@pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])])
125152
def test_kerchunk_roundtrip_concat(
126-
self, tmpdir, format, hdf_backend, decode_times, time_vars
153+
self, tmpdir, roundtrip_func, hdf_backend, decode_times, time_vars
127154
):
128155
# set up example xarray dataset
129156
ds = xr.tutorial.open_dataset("air_temperature", decode_times=decode_times)
@@ -159,22 +186,7 @@ def test_kerchunk_roundtrip_concat(
159186
# concatenate virtually along time
160187
vds = xr.concat([vds1, vds2], dim="time", coords="minimal", compat="override")
161188

162-
if format == "dict":
163-
# write those references to an in-memory kerchunk-formatted references dictionary
164-
ds_refs = vds.virtualize.to_kerchunk(format=format)
165-
166-
# use fsspec to read the dataset from the kerchunk references dict
167-
roundtrip = xr.open_dataset(
168-
ds_refs, engine="kerchunk", decode_times=decode_times
169-
)
170-
else:
171-
# write those references to disk as kerchunk references format
172-
vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
173-
174-
# use fsspec to read the dataset from disk via the kerchunk references
175-
roundtrip = xr.open_dataset(
176-
f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=decode_times
177-
)
189+
roundtrip = roundtrip_func(vds, tmpdir, decode_times=decode_times)
178190

179191
if decode_times is False:
180192
# assert all_close to original dataset
@@ -191,7 +203,7 @@ def test_kerchunk_roundtrip_concat(
191203
assert roundtrip.time.encoding["calendar"] == ds.time.encoding["calendar"]
192204

193205
@parametrize_over_hdf_backends
194-
def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
206+
def test_non_dimension_coordinates(self, tmpdir, roundtrip_func, hdf_backend):
195207
# regression test for GH issue #105
196208

197209
if hdf_backend:
@@ -209,20 +221,7 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
209221
assert "lat" in vds.coords
210222
assert "coordinates" not in vds.attrs
211223

212-
if format == "dict":
213-
# write those references to an in-memory kerchunk-formatted references dictionary
214-
ds_refs = vds.virtualize.to_kerchunk(format=format)
215-
216-
# use fsspec to read the dataset from the kerchunk references dict
217-
roundtrip = xr.open_dataset(ds_refs, engine="kerchunk", decode_times=False)
218-
else:
219-
# write those references to disk as kerchunk references format
220-
vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
221-
222-
# use fsspec to read the dataset from disk via the kerchunk references
223-
roundtrip = xr.open_dataset(
224-
f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False
225-
)
224+
roundtrip = roundtrip_func(vds, tmpdir)
226225

227226
# assert equal to original dataset
228227
xrt.assert_allclose(roundtrip, ds)
@@ -231,7 +230,7 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
231230
for coord in ds.coords:
232231
assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs
233232

234-
def test_datetime64_dtype_fill_value(self, tmpdir, format):
233+
def test_datetime64_dtype_fill_value(self, tmpdir, roundtrip_func):
235234
chunks_dict = {
236235
"0.0.0": {"path": "/foo.nc", "offset": 100, "length": 100},
237236
}
@@ -249,7 +248,7 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):
249248
zarr_format=2,
250249
)
251250
marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest)
252-
ds = xr.Dataset(
251+
vds = xr.Dataset(
253252
{
254253
"a": xr.DataArray(
255254
marr1,
@@ -260,20 +259,9 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):
260259
}
261260
)
262261

263-
if format == "dict":
264-
# write those references to an in-memory kerchunk-formatted references dictionary
265-
ds_refs = ds.virtualize.to_kerchunk(format=format)
266-
267-
# use fsspec to read the dataset from the kerchunk references dict
268-
roundtrip = xr.open_dataset(ds_refs, engine="kerchunk")
269-
else:
270-
# write those references to disk as kerchunk references format
271-
ds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
272-
273-
# use fsspec to read the dataset from disk via the kerchunk references
274-
roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk")
262+
roundtrip = roundtrip_func(vds, tmpdir)
275263

276-
assert roundtrip.a.attrs == ds.a.attrs
264+
assert roundtrip.a.attrs == vds.a.attrs
277265

278266

279267
@parametrize_over_hdf_backends

virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,22 @@
44

55
import virtualizarr
66
from virtualizarr.readers.hdf import HDFVirtualBackend
7-
from virtualizarr.tests import requires_kerchunk
7+
from virtualizarr.tests import (
8+
requires_hdf5plugin,
9+
requires_imagecodecs,
10+
requires_kerchunk,
11+
)
812

913

1014
@requires_kerchunk
15+
@requires_hdf5plugin
16+
@requires_imagecodecs
1117
class TestIntegration:
1218
@pytest.mark.xfail(
1319
reason="0 time start is being interpreted as fillvalue see issues/280"
1420
)
1521
def test_filters_h5netcdf_roundtrip(
16-
self, tmpdir, filter_encoded_roundtrip_hdf5_file, backend=HDFVirtualBackend
22+
self, tmpdir, filter_encoded_roundtrip_hdf5_file
1723
):
1824
ds = xr.open_dataset(filter_encoded_roundtrip_hdf5_file, decode_times=True)
1925
vds = virtualizarr.open_virtual_dataset(

virtualizarr/tests/test_readers/test_kerchunk.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from virtualizarr.backend import open_virtual_dataset
99
from virtualizarr.manifests import ManifestArray
10-
from virtualizarr.tests import requires_kerchunk
10+
from virtualizarr.tests import has_fastparquet, requires_kerchunk
1111

1212

1313
def gen_ds_refs(
@@ -177,7 +177,7 @@ def test_handle_relative_paths(refs_file_factory):
177177
@requires_kerchunk
178178
@pytest.mark.parametrize(
179179
"reference_format",
180-
["json", "parquet", "invalid"],
180+
["json", "invalid", *(["parquet"] if has_fastparquet else [])],
181181
)
182182
def test_open_virtual_dataset_existing_kerchunk_refs(
183183
tmp_path, netcdf4_virtual_dataset, reference_format

virtualizarr/tests/test_writers/test_kerchunk.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from xarray import Dataset
44

55
from virtualizarr.manifests import ChunkManifest, ManifestArray
6-
from virtualizarr.tests import requires_kerchunk
6+
from virtualizarr.tests import requires_fastparquet, requires_kerchunk
77

88

99
@requires_kerchunk
@@ -108,6 +108,7 @@ def test_accessor_to_kerchunk_json(self, tmp_path):
108108
}
109109
assert loaded_refs == expected_ds_refs
110110

111+
@requires_fastparquet
111112
def test_accessor_to_kerchunk_parquet(self, tmp_path):
112113
import ujson
113114

virtualizarr/tests/test_writers/test_zarr.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: Dataset)
4242
assert isinstance(metadata["fill_value"], (bool, int, float, str, list))
4343
assert (
4444
isinstance(metadata["codecs"], list)
45-
and len(metadata["codecs"]) > 1
45+
and len(metadata["codecs"]) == 1
4646
and all(isconfigurable(codec) for codec in metadata["codecs"])
4747
)
4848

0 commit comments

Comments (0)