Skip to content

Commit 61847e9

Browse files
Fix HDFVirtualBackend handling of non coordinate dimension HDF datasets. (#410)
* Do not create variables for non-coordinate dimension HDF datasets.
* Revert test changes to avoid HDFVirtualBackend errors from #395.
* Re-enable xfailed roundtrip integration test.
* Fix HDF5 type usage.
* Fix indent error for scanning HDF5 items.
1 parent 9c3d0f9 commit 61847e9

File tree

5 files changed

+42
-16
lines changed

5 files changed

+42
-16
lines changed

virtualizarr/readers/hdf/hdf.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,10 @@ def _virtual_vars_from_hdf(
371371
group_name = "/"
372372

373373
variables = {}
374+
non_coordinate_dimesion_vars = HDFVirtualBackend._find_non_coord_dimension_vars(
375+
group=g
376+
)
377+
drop_variables = list(set(drop_variables + non_coordinate_dimesion_vars))
374378
for key in g.keys():
375379
if key not in drop_variables:
376380
if isinstance(g[key], h5py.Dataset):
@@ -403,3 +407,17 @@ def _get_group_attrs(
403407
g = f
404408
attrs = HDFVirtualBackend._extract_attrs(g)
405409
return attrs
410+
411+
@staticmethod
412+
def _find_non_coord_dimension_vars(group: H5Group) -> List[str]:
413+
dimension_names = []
414+
non_coordinate_dimension_variables = []
415+
for name, obj in group.items():
416+
if "_Netcdf4Dimid" in obj.attrs:
417+
dimension_names.append(name)
418+
for name, obj in group.items():
419+
if type(obj) is h5py.Dataset:
420+
if obj.id.get_storage_size() == 0 and name in dimension_names:
421+
non_coordinate_dimension_variables.append(name)
422+
423+
return non_coordinate_dimension_variables

virtualizarr/tests/test_backend.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -194,8 +194,6 @@ def test_var_attr_coords(self, netcdf4_file_with_2d_coords, hdf_backend):
194194
+ expected_2d_coords
195195
+ expected_1d_non_dimension_coords
196196
+ expected_scalar_coords
197-
# These should not be included in coords see #401 for more information
198-
+ (["xi_rho", "eta_rho"] if hdf_backend == HDFVirtualBackend else [])
199197
)
200198
assert set(vds.coords) == set(expected_coords)
201199

@@ -344,10 +342,7 @@ def test_open_subgroup(
344342
indexes={},
345343
backend=hdf_backend,
346344
)
347-
# This should just be ["bar"] see #401 for more information
348-
assert list(vds.variables) == (
349-
["bar", "dim_0"] if hdf_backend == HDFVirtualBackend else ["bar"]
350-
)
345+
assert list(vds.variables) == ["bar"]
351346
assert isinstance(vds["bar"].data, ManifestArray)
352347
assert vds["bar"].shape == (2,)
353348

@@ -364,10 +359,7 @@ def test_open_root_group(
364359
indexes={},
365360
backend=hdf_backend,
366361
)
367-
# This should just be ["foo"] see #401 for more information
368-
assert list(vds.variables) == (
369-
["foo", "dim_0"] if hdf_backend == HDFVirtualBackend else ["foo"]
370-
)
362+
assert list(vds.variables) == ["foo"]
371363
assert isinstance(vds["foo"].data, ManifestArray)
372364
assert vds["foo"].shape == (3,)
373365

virtualizarr/tests/test_readers/conftest.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,3 +333,12 @@ def netcdf3_file(tmp_path: pathlib.Path) -> pathlib.Path:
333333
ds.to_netcdf(filepath, format="NETCDF3_CLASSIC")
334334

335335
return filepath
336+
337+
338+
@pytest.fixture
339+
def non_coord_dim(tmpdir):
340+
filepath = f"{tmpdir}/non_coord_dim.nc"
341+
ds = create_test_data(dim_sizes=(20, 80, 10))
342+
ds = ds.drop_dims("dim3")
343+
ds.to_netcdf(filepath, engine="netcdf4")
344+
return filepath

virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,6 @@ def test_filters_h5netcdf_roundtrip(
2727
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True)
2828
xrt.assert_allclose(ds, roundtrip)
2929

30-
@pytest.mark.xfail(
31-
reason="Coordinate issue affecting only hdf reader see pull/#260"
32-
)
3330
def test_filters_netcdf4_roundtrip(
3431
self, tmpdir, filter_encoded_roundtrip_netcdf4_file
3532
):
@@ -50,3 +47,13 @@ def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file
5047
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
5148
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
5249
xrt.assert_allclose(ds, roundtrip)
50+
51+
def test_non_coord_dim(self, tmpdir, non_coord_dim):
52+
ds = xr.open_dataset(non_coord_dim)
53+
vds = virtualizarr.open_virtual_dataset(
54+
non_coord_dim, backend=HDFVirtualBackend
55+
)
56+
kerchunk_file = f"{tmpdir}/kerchunk.json"
57+
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
58+
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
59+
xrt.assert_equal(ds, roundtrip)

virtualizarr/tests/test_xarray.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -322,11 +322,11 @@ def test_mixture_of_manifestarrays_and_numpy_arrays(
322322
@requires_imagecodecs
323323
def test_nbytes(simple_netcdf4):
324324
vds = open_virtual_dataset(simple_netcdf4)
325-
assert vds.virtualize.nbytes == 88
326-
assert vds.nbytes == 104
325+
assert vds.virtualize.nbytes == 32
326+
assert vds.nbytes == 48
327327

328328
vds = open_virtual_dataset(simple_netcdf4, loadable_variables=["foo"])
329-
assert vds.virtualize.nbytes == 104
329+
assert vds.virtualize.nbytes == 48
330330

331331
ds = open_dataset(simple_netcdf4)
332332
assert ds.virtualize.nbytes == ds.nbytes

0 commit comments

Comments (0)