Skip to content

Commit 8f64962

Browse files
authored
Fix subgroup dims in HDF reader (#366)
* regression test
* pass regression test
* fix other tests of internal methods
* clean up minor nits
* test that opening an empty group raises a NotImplementedError
* change incorrect-looking return type
* release note
* fix bad merge
1 parent 0d2d6ab commit 8f64962

File tree

3 files changed

+66
-16
lines changed

3 files changed

+66
-16
lines changed

docs/releases.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ Bug fixes
3535

3636
- Fix bug preventing generating references for the root group of a file when a subgroup exists.
3737
(:issue:`336`, :pull:`338`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
38+
- Fix bug in HDF reader where dimension names of dimensions in a subgroup would be incorrect.
39+
(:issue:`364`, :pull:`366`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
3840
- Fix bug in dmrpp reader so _FillValue is included in variables' encodings.
3941
(:pull:`369`) By `Aimee Barciauskas <https://github.com/abarciauskas-bgse>`_.
4042
- Fix bug passing arguments to FITS reader, and test it on Hubble Space Telescope data.

virtualizarr/readers/hdf/hdf.py

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ def add_chunk_info(blob):
169169
return chunk_manifest
170170

171171
@staticmethod
172-
def _dataset_dims(dataset: H5Dataset) -> Union[List[str], List[None]]:
172+
def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:
173173
"""
174174
Get a list of dimension scale names attached to input HDF5 dataset.
175175
@@ -181,10 +181,12 @@ def _dataset_dims(dataset: H5Dataset) -> Union[List[str], List[None]]:
181181
----------
182182
dataset : h5py.Dataset
183183
An h5py dataset.
184+
group : str
185+
Name of the group we are pulling these dimensions from. Required for potentially removing subgroup prefixes.
184186
185187
Returns
186188
-------
187-
list
189+
list[str]
188190
List with HDF5 path names of dimension scales attached to input
189191
dataset.
190192
"""
@@ -208,7 +210,11 @@ def _dataset_dims(dataset: H5Dataset) -> Union[List[str], List[None]]:
208210
# In this case, we mimic netCDF4 and assign phony dimension names.
209211
# See https://github.com/fsspec/kerchunk/issues/41
210212
dims.append(f"phony_dim_{n}")
211-
return dims
213+
214+
if not group.endswith("/"):
215+
group += "/"
216+
217+
return [dim.removeprefix(group) for dim in dims]
212218

213219
@staticmethod
214220
def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
@@ -257,20 +263,25 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
257263
return attrs
258264

259265
@staticmethod
260-
def _dataset_to_variable(path: str, dataset: H5Dataset) -> Optional[xr.Variable]:
266+
def _dataset_to_variable(
267+
path: str,
268+
dataset: H5Dataset,
269+
group: str,
270+
) -> Optional[xr.Variable]:
261271
"""
262272
Extract an xarray Variable with ManifestArray data from an h5py dataset
263273
264274
Parameters
265275
----------
266276
dataset : h5py.Dataset
267277
An h5py dataset.
278+
group : str
279+
Name of the group containing this h5py.Dataset.
268280
269281
Returns
270282
-------
271283
xarray.Variable or None
272284
An xarray variable, or None.
273-
274285
"""
275286
# This chunk determination logic mirrors zarr-python's create
276287
# https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66
@@ -305,7 +316,7 @@ def _dataset_to_variable(path: str, dataset: H5Dataset) -> Optional[xr.Variable]
305316
shape=dataset.shape,
306317
zarr_format=2,
307318
)
308-
dims = HDFVirtualBackend._dataset_dims(dataset)
319+
dims = HDFVirtualBackend._dataset_dims(dataset, group=group)
309320
manifest = HDFVirtualBackend._dataset_chunk_manifest(path, dataset)
310321
if manifest:
311322
marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
@@ -330,37 +341,44 @@ def _virtual_vars_from_hdf(
330341
----------
331342
path: str
332343
The path of the hdf5 file.
333-
group: str
334-
The name of the group for which to extract variables.
344+
group: str, optional
345+
The name of the group for which to extract variables. None refers to the root group.
335346
drop_variables: list of str
336347
A list of variable names to skip extracting.
337348
reader_options: dict
338-
A dictionary of reader options passed to fsspec when opening the
339-
file.
349+
A dictionary of reader options passed to fsspec when opening the file.
340350
341351
Returns
342352
-------
343353
dict
344354
A dictionary of Xarray Variables with the variable names as keys.
345-
346355
"""
347356
if drop_variables is None:
348357
drop_variables = []
358+
349359
open_file = _FsspecFSFromFilepath(
350360
filepath=path, reader_options=reader_options
351361
).open_file()
352362
f = h5py.File(open_file, mode="r")
353-
if group:
363+
364+
if group is not None:
354365
g = f[group]
366+
group_name = group
355367
if not isinstance(g, h5py.Group):
356368
raise ValueError("The provided group is not an HDF group")
357369
else:
358370
g = f
371+
group_name = ""
372+
359373
variables = {}
360374
for key in g.keys():
361375
if key not in drop_variables:
362376
if isinstance(g[key], h5py.Dataset):
363-
variable = HDFVirtualBackend._dataset_to_variable(path, g[key])
377+
variable = HDFVirtualBackend._dataset_to_variable(
378+
path=path,
379+
dataset=g[key],
380+
group=group_name,
381+
)
364382
if variable is not None:
365383
variables[key] = variable
366384
else:

virtualizarr/tests/test_readers/test_hdf/test_hdf.py

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import h5py # type: ignore
44
import pytest
55

6+
from virtualizarr import open_virtual_dataset
67
from virtualizarr.readers.hdf import HDFVirtualBackend
78
from virtualizarr.tests import (
89
requires_hdf5plugin,
@@ -90,22 +91,24 @@ def test_chunked_dataset(self, chunked_dimensions_netcdf4_file):
9091
f = h5py.File(chunked_dimensions_netcdf4_file)
9192
ds = f["data"]
9293
var = HDFVirtualBackend._dataset_to_variable(
93-
chunked_dimensions_netcdf4_file, ds
94+
chunked_dimensions_netcdf4_file, ds, group=""
9495
)
9596
assert var.chunks == (50, 50)
9697

9798
def test_not_chunked_dataset(self, single_dimension_scale_hdf5_file):
9899
f = h5py.File(single_dimension_scale_hdf5_file)
99100
ds = f["data"]
100101
var = HDFVirtualBackend._dataset_to_variable(
101-
single_dimension_scale_hdf5_file, ds
102+
single_dimension_scale_hdf5_file, ds, group=""
102103
)
103104
assert var.chunks == (2,)
104105

105106
def test_dataset_attributes(self, string_attributes_hdf5_file):
106107
f = h5py.File(string_attributes_hdf5_file)
107108
ds = f["data"]
108-
var = HDFVirtualBackend._dataset_to_variable(string_attributes_hdf5_file, ds)
109+
var = HDFVirtualBackend._dataset_to_variable(
110+
string_attributes_hdf5_file, ds, group=""
111+
)
109112
assert var.attrs["attribute_name"] == "attribute_name"
110113

111114

@@ -178,3 +181,30 @@ def test_coord_names(
178181
maybe_open_loadable_vars_and_indexes.return_value = (0, 0)
179182
HDFVirtualBackend.open_virtual_dataset(root_coordinates_hdf5_file)
180183
assert construct_virtual_dataset.call_args[1]["coord_names"] == ["lat", "lon"]
184+
185+
186+
@requires_hdf5plugin
187+
@requires_imagecodecs
188+
@pytest.mark.parametrize("group", ["subgroup", "subgroup/"])
189+
def test_subgroup_variable_names(netcdf4_file_with_data_in_multiple_groups, group):
190+
# regression test for GH issue #364
191+
vds = open_virtual_dataset(
192+
netcdf4_file_with_data_in_multiple_groups,
193+
group=group,
194+
backend=HDFVirtualBackend,
195+
)
196+
assert list(vds.dims) == ["dim_0"]
197+
198+
199+
@requires_hdf5plugin
200+
@requires_imagecodecs
201+
def test_nested_groups(hdf5_groups_file):
202+
# try to open an empty group
203+
with pytest.raises(
204+
NotImplementedError, match="Nested groups are not yet supported"
205+
):
206+
open_virtual_dataset(
207+
hdf5_groups_file,
208+
group="/",
209+
backend=HDFVirtualBackend,
210+
)

0 commit comments

Comments
 (0)