Skip to content

Commit 81a76f0

Browse files
Switch to custom netcdf4/hdf5 backend (#395)

* Switch to custom netcdf4/hdf5 backend
* Switches autodetected backend selection
* Updates tests to require kerchunk less often
* Only test kerchunk hdf reader if kerchunk is available
* Allow for kerchunk-based backend
* Rename to parametrize_over_hdf_backends
* Run group tests
* Respect dimensions without coordinates
* Fix #402 so that nested groups are ignored
* Encode #401 behavior in tests
* Fix min deps tests
* Make mypy happy
* Add to release notes
* Combine two tests into one

---------

Co-authored-by: TomNicholas <[email protected]>
1 parent 95fce11 commit 81a76f0

File tree

8 files changed

+107
-98
lines changed

8 files changed

+107
-98
lines changed

docs/releases.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ Breaking changes
3535
rather than positional or keyword. This change is breaking _only_ where arguments for
3636
these parameters are currently given positionally. (:issue:`341`) By
3737
`Chuck Daniels <https://github.com/chuckwondo>`_.
38+
- The default backend for netCDF4 and HDF5 is now the custom ``HDFVirtualBackend`` replacing
39+
the previous default which was a wrapper around the kerchunk backend.
40+
(:issue:`374`, :pull:`395`) By `Julia Signell <https://github.com/jsignell>`_.
3841

3942
Deprecations
4043
~~~~~~~~~~~~

virtualizarr/backend.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from virtualizarr.readers import (
1414
DMRPPVirtualBackend,
1515
FITSVirtualBackend,
16-
HDF5VirtualBackend,
16+
HDFVirtualBackend,
1717
KerchunkVirtualBackend,
1818
NetCDF3VirtualBackend,
1919
TIFFVirtualBackend,
@@ -27,9 +27,9 @@
2727
"kerchunk": KerchunkVirtualBackend,
2828
"zarr_v3": ZarrV3VirtualBackend,
2929
"dmrpp": DMRPPVirtualBackend,
30+
"hdf5": HDFVirtualBackend,
31+
"netcdf4": HDFVirtualBackend, # note this is the same as for hdf5
3032
# all the below call one of the kerchunk backends internally (https://fsspec.github.io/kerchunk/reference.html#file-format-backends)
31-
"hdf5": HDF5VirtualBackend,
32-
"netcdf4": HDF5VirtualBackend, # note this is the same as for hdf5
3333
"netcdf3": NetCDF3VirtualBackend,
3434
"tiff": TIFFVirtualBackend,
3535
"fits": FITSVirtualBackend,

virtualizarr/readers/hdf/hdf.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -361,14 +361,14 @@ def _virtual_vars_from_hdf(
361361
).open_file()
362362
f = h5py.File(open_file, mode="r")
363363

364-
if group is not None:
364+
if group is not None and group != "":
365365
g = f[group]
366366
group_name = group
367367
if not isinstance(g, h5py.Group):
368368
raise ValueError("The provided group is not an HDF group")
369369
else:
370-
g = f
371-
group_name = ""
370+
g = f["/"]
371+
group_name = "/"
372372

373373
variables = {}
374374
for key in g.keys():
@@ -381,9 +381,6 @@ def _virtual_vars_from_hdf(
381381
)
382382
if variable is not None:
383383
variables[key] = variable
384-
else:
385-
raise NotImplementedError("Nested groups are not yet supported")
386-
387384
return variables
388385

389386
@staticmethod

virtualizarr/tests/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77

88
from virtualizarr.manifests import ChunkManifest, ManifestArray
99
from virtualizarr.manifests.manifest import join
10+
from virtualizarr.readers import HDF5VirtualBackend
11+
from virtualizarr.readers.hdf import HDFVirtualBackend
1012
from virtualizarr.zarr import ZArray, ceildiv
1113

1214
requires_network = pytest.mark.network
@@ -42,6 +44,11 @@ def _importorskip(
4244
has_zarr_python, requires_zarr_python = _importorskip("zarr")
4345
has_zarr_python_v3, requires_zarr_python_v3 = _importorskip("zarr", "3.0.0b")
4446

47+
parametrize_over_hdf_backends = pytest.mark.parametrize(
48+
"hdf_backend",
49+
[HDF5VirtualBackend, HDFVirtualBackend] if has_kerchunk else [HDFVirtualBackend],
50+
)
51+
4552

4653
def create_manifestarray(
4754
shape: tuple[int, ...], chunks: tuple[int, ...]

virtualizarr/tests/test_backend.py

Lines changed: 62 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@
1515
from virtualizarr.readers.hdf import HDFVirtualBackend
1616
from virtualizarr.tests import (
1717
has_astropy,
18-
requires_kerchunk,
18+
parametrize_over_hdf_backends,
19+
requires_hdf5plugin,
20+
requires_imagecodecs,
1921
requires_network,
2022
requires_s3fs,
2123
requires_scipy,
@@ -82,13 +84,14 @@ def test_FileType():
8284
FileType(None)
8385

8486

85-
@requires_kerchunk
86-
@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
87+
@parametrize_over_hdf_backends
8788
class TestOpenVirtualDatasetIndexes:
8889
def test_no_indexes(self, netcdf4_file, hdf_backend):
8990
vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
9091
assert vds.indexes == {}
9192

93+
@requires_hdf5plugin
94+
@requires_imagecodecs
9295
def test_create_default_indexes_for_loadable_variables(
9396
self, netcdf4_file, hdf_backend
9497
):
@@ -122,8 +125,9 @@ def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, I
122125
return True
123126

124127

125-
@requires_kerchunk
126-
@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
128+
@requires_hdf5plugin
129+
@requires_imagecodecs
130+
@parametrize_over_hdf_backends
127131
def test_cftime_index(tmpdir, hdf_backend):
128132
"""Ensure a virtual dataset contains the same indexes as an Xarray dataset"""
129133
# Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168
@@ -152,8 +156,7 @@ def test_cftime_index(tmpdir, hdf_backend):
152156
assert vds.attrs == ds.attrs
153157

154158

155-
@requires_kerchunk
156-
@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
159+
@parametrize_over_hdf_backends
157160
class TestOpenVirtualDatasetAttrs:
158161
def test_drop_array_dimensions(self, netcdf4_file, hdf_backend):
159162
# regression test for GH issue #150
@@ -171,14 +174,16 @@ def test_coordinate_variable_attrs_preserved(self, netcdf4_file, hdf_backend):
171174
}
172175

173176

174-
@requires_kerchunk
177+
@parametrize_over_hdf_backends
175178
class TestDetermineCoords:
176-
def test_infer_one_dimensional_coords(self, netcdf4_file):
177-
vds = open_virtual_dataset(netcdf4_file, indexes={})
179+
def test_infer_one_dimensional_coords(self, netcdf4_file, hdf_backend):
180+
vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
178181
assert set(vds.coords) == {"time", "lat", "lon"}
179182

180-
def test_var_attr_coords(self, netcdf4_file_with_2d_coords):
181-
vds = open_virtual_dataset(netcdf4_file_with_2d_coords, indexes={})
183+
def test_var_attr_coords(self, netcdf4_file_with_2d_coords, hdf_backend):
184+
vds = open_virtual_dataset(
185+
netcdf4_file_with_2d_coords, indexes={}, backend=hdf_backend
186+
)
182187

183188
expected_dimension_coords = ["ocean_time", "s_rho"]
184189
expected_2d_coords = ["lon_rho", "lat_rho", "h"]
@@ -189,6 +194,8 @@ def test_var_attr_coords(self, netcdf4_file_with_2d_coords):
189194
+ expected_2d_coords
190195
+ expected_1d_non_dimension_coords
191196
+ expected_scalar_coords
197+
# These should not be included in coords see #401 for more information
198+
+ (["xi_rho", "eta_rho"] if hdf_backend == HDFVirtualBackend else [])
192199
)
193200
assert set(vds.coords) == set(expected_coords)
194201

@@ -199,7 +206,7 @@ class TestReadFromS3:
199206
@pytest.mark.parametrize(
200207
"indexes", [None, {}], ids=["None index", "empty dict index"]
201208
)
202-
@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
209+
@parametrize_over_hdf_backends
203210
def test_anon_read_s3(self, indexes, hdf_backend):
204211
"""Parameterized tests for empty vs supplied indexes and filetypes."""
205212
# TODO: Switch away from this s3 url after minIO is implemented.
@@ -217,7 +224,7 @@ def test_anon_read_s3(self, indexes, hdf_backend):
217224

218225

219226
@requires_network
220-
@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
227+
@parametrize_over_hdf_backends
221228
class TestReadFromURL:
222229
@pytest.mark.parametrize(
223230
"filetype, url",
@@ -320,46 +327,55 @@ def test_virtualizarr_vs_local_nisar(self, hdf_backend):
320327
xrt.assert_equal(dsXR, dsV)
321328

322329

323-
@requires_kerchunk
324-
def test_open_empty_group(empty_netcdf4_file):
325-
vds = open_virtual_dataset(empty_netcdf4_file, indexes={})
326-
assert isinstance(vds, xr.Dataset)
327-
expected = Dataset()
328-
xrt.assert_identical(vds, expected)
329-
330-
331-
@requires_kerchunk
330+
@parametrize_over_hdf_backends
332331
class TestOpenVirtualDatasetHDFGroup:
333-
def test_open_subgroup(self, netcdf4_file_with_data_in_multiple_groups):
332+
def test_open_empty_group(self, empty_netcdf4_file, hdf_backend):
333+
vds = open_virtual_dataset(empty_netcdf4_file, indexes={}, backend=hdf_backend)
334+
assert isinstance(vds, xr.Dataset)
335+
expected = Dataset()
336+
xrt.assert_identical(vds, expected)
337+
338+
def test_open_subgroup(
339+
self, netcdf4_file_with_data_in_multiple_groups, hdf_backend
340+
):
334341
vds = open_virtual_dataset(
335-
netcdf4_file_with_data_in_multiple_groups, group="subgroup", indexes={}
342+
netcdf4_file_with_data_in_multiple_groups,
343+
group="subgroup",
344+
indexes={},
345+
backend=hdf_backend,
346+
)
347+
# This should just be ["bar"] see #401 for more information
348+
assert list(vds.variables) == (
349+
["bar", "dim_0"] if hdf_backend == HDFVirtualBackend else ["bar"]
336350
)
337-
assert list(vds.variables) == ["bar"]
338351
assert isinstance(vds["bar"].data, ManifestArray)
339352
assert vds["bar"].shape == (2,)
340353

341-
def test_open_root_group_manually(self, netcdf4_file_with_data_in_multiple_groups):
342-
vds = open_virtual_dataset(
343-
netcdf4_file_with_data_in_multiple_groups, group="", indexes={}
344-
)
345-
assert list(vds.variables) == ["foo"]
346-
assert isinstance(vds["foo"].data, ManifestArray)
347-
assert vds["foo"].shape == (3,)
348-
349-
def test_open_root_group_by_default(
350-
self, netcdf4_file_with_data_in_multiple_groups
354+
@pytest.mark.parametrize("group", ["", None])
355+
def test_open_root_group(
356+
self,
357+
netcdf4_file_with_data_in_multiple_groups,
358+
hdf_backend,
359+
group,
351360
):
352361
vds = open_virtual_dataset(
353-
netcdf4_file_with_data_in_multiple_groups, indexes={}
362+
netcdf4_file_with_data_in_multiple_groups,
363+
group=group,
364+
indexes={},
365+
backend=hdf_backend,
366+
)
367+
# This should just be ["foo"] see #401 for more information
368+
assert list(vds.variables) == (
369+
["foo", "dim_0"] if hdf_backend == HDFVirtualBackend else ["foo"]
354370
)
355-
assert list(vds.variables) == ["foo"]
356371
assert isinstance(vds["foo"].data, ManifestArray)
357372
assert vds["foo"].shape == (3,)
358373

359374

360-
@requires_kerchunk
375+
@requires_hdf5plugin
376+
@requires_imagecodecs
361377
class TestLoadVirtualDataset:
362-
@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
378+
@parametrize_over_hdf_backends
363379
def test_loadable_variables(self, netcdf4_file, hdf_backend):
364380
vars_to_load = ["air", "time"]
365381
vds = open_virtual_dataset(
@@ -399,18 +415,18 @@ def test_explicit_filetype_and_backend(self, netcdf4_file):
399415
netcdf4_file, filetype="hdf", backend=HDFVirtualBackend
400416
)
401417

402-
@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
418+
@parametrize_over_hdf_backends
403419
def test_group_kwarg(self, hdf5_groups_file, hdf_backend):
404420
if hdf_backend == HDFVirtualBackend:
405-
with pytest.raises(NotImplementedError, match="Nested groups"):
406-
open_virtual_dataset(hdf5_groups_file, backend=hdf_backend)
407421
with pytest.raises(KeyError, match="doesn't exist"):
408422
open_virtual_dataset(
409423
hdf5_groups_file, group="doesnt_exist", backend=hdf_backend
410424
)
411425
if hdf_backend == HDF5VirtualBackend:
412426
with pytest.raises(ValueError, match="not found in"):
413-
open_virtual_dataset(hdf5_groups_file, group="doesnt_exist")
427+
open_virtual_dataset(
428+
hdf5_groups_file, group="doesnt_exist", backend=hdf_backend
429+
)
414430

415431
vars_to_load = ["air", "time"]
416432
vds = open_virtual_dataset(
@@ -443,13 +459,13 @@ def test_open_virtual_dataset_passes_expected_args(
443459
}
444460
mock_read_kerchunk.assert_called_once_with(**args)
445461

446-
@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
462+
@parametrize_over_hdf_backends
447463
def test_open_dataset_with_empty(self, hdf5_empty, hdf_backend):
448464
vds = open_virtual_dataset(hdf5_empty, backend=hdf_backend)
449465
assert vds.empty.dims == ()
450466
assert vds.empty.attrs == {"empty": "true"}
451467

452-
@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
468+
@parametrize_over_hdf_backends
453469
def test_open_dataset_with_scalar(self, hdf5_scalar, hdf_backend):
454470
vds = open_virtual_dataset(hdf5_scalar, backend=hdf_backend)
455471
assert vds.scalar.dims == ()

virtualizarr/tests/test_integration.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,7 @@
88

99
from virtualizarr import open_virtual_dataset
1010
from virtualizarr.manifests import ChunkManifest, ManifestArray
11-
from virtualizarr.readers import HDF5VirtualBackend
12-
from virtualizarr.readers.hdf import HDFVirtualBackend
13-
from virtualizarr.tests import requires_kerchunk
11+
from virtualizarr.tests import parametrize_over_hdf_backends, requires_kerchunk
1412
from virtualizarr.translators.kerchunk import (
1513
dataset_from_kerchunk_refs,
1614
)
@@ -61,7 +59,7 @@ def test_kerchunk_roundtrip_in_memory_no_concat():
6159
),
6260
],
6361
)
64-
@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
62+
@parametrize_over_hdf_backends
6563
def test_numpy_arrays_to_inlined_kerchunk_refs(
6664
netcdf4_file, inline_threshold, vars_to_inline, hdf_backend
6765
):
@@ -89,7 +87,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(
8987
@requires_kerchunk
9088
@pytest.mark.parametrize("format", ["dict", "json", "parquet"])
9189
class TestKerchunkRoundtrip:
92-
@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
90+
@parametrize_over_hdf_backends
9391
def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
9492
# set up example xarray dataset
9593
ds = xr.tutorial.open_dataset("air_temperature", decode_times=False)
@@ -122,7 +120,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
122120
for coord in ds.coords:
123121
assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs
124122

125-
@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
123+
@parametrize_over_hdf_backends
126124
@pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])])
127125
def test_kerchunk_roundtrip_concat(
128126
self, tmpdir, format, hdf_backend, decode_times, time_vars
@@ -192,7 +190,7 @@ def test_kerchunk_roundtrip_concat(
192190
assert roundtrip.time.encoding["units"] == ds.time.encoding["units"]
193191
assert roundtrip.time.encoding["calendar"] == ds.time.encoding["calendar"]
194192

195-
@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
193+
@parametrize_over_hdf_backends
196194
def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
197195
# regression test for GH issue #105
198196

@@ -278,8 +276,7 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):
278276
assert roundtrip.a.attrs == ds.a.attrs
279277

280278

281-
@requires_kerchunk
282-
@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
279+
@parametrize_over_hdf_backends
283280
def test_open_scalar_variable(tmpdir, hdf_backend):
284281
# regression test for GH issue #100
285282

@@ -290,9 +287,8 @@ def test_open_scalar_variable(tmpdir, hdf_backend):
290287
assert vds["a"].shape == ()
291288

292289

290+
@parametrize_over_hdf_backends
293291
class TestPathsToURIs:
294-
@requires_kerchunk
295-
@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
296292
def test_convert_absolute_paths_to_uris(self, netcdf4_file, hdf_backend):
297293
vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
298294

@@ -302,8 +298,6 @@ def test_convert_absolute_paths_to_uris(self, netcdf4_file, hdf_backend):
302298
path = manifest["0.0.0"]["path"]
303299
assert path == expected_path
304300

305-
@requires_kerchunk
306-
@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
307301
def test_convert_relative_paths_to_uris(self, netcdf4_file, hdf_backend):
308302
relative_path = relpath(netcdf4_file)
309303
vds = open_virtual_dataset(relative_path, indexes={}, backend=hdf_backend)

0 commit comments

Comments
 (0)