Skip to content

Commit 1cdc2f0

Browse files
authored
Reapply HDF reader fillvalue (#460)
* Reapply "Align HDF reader CF _FillValue with Zarr v3 semantics (#420)" (#457). This reverts commit dbd3566. * Reapply HDF fill value changes: use a context manager in tests to ensure resources are released.
1 parent 19caf2f commit 1cdc2f0

File tree

4 files changed

+214
-48
lines changed

4 files changed

+214
-48
lines changed

virtualizarr/readers/hdf/hdf.py

Lines changed: 49 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,13 @@
88
List,
99
Mapping,
1010
Optional,
11+
Tuple,
1112
Union,
1213
)
1314

1415
import numpy as np
1516
import xarray as xr
17+
from xarray.backends.zarr import FillValueCoder
1618

1719
from virtualizarr.manifests import (
1820
ChunkEntry,
@@ -40,6 +42,20 @@
4042
H5Dataset: Any = None
4143
H5Group: Any = None
4244

45+
FillValueType = Union[
46+
int,
47+
float,
48+
bool,
49+
complex,
50+
str,
51+
np.integer,
52+
np.floating,
53+
np.bool_,
54+
np.complexfloating,
55+
bytes, # For fixed-length string storage
56+
Tuple[bytes, int], # Structured type
57+
]
58+
4359

4460
class HDFVirtualBackend(VirtualBackend):
4561
@staticmethod
@@ -216,6 +232,29 @@ def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:
216232

217233
return [dim.removeprefix(group) for dim in dims]
218234

235+
@staticmethod
236+
def _extract_cf_fill_value(
237+
h5obj: Union[H5Dataset, H5Group],
238+
) -> Optional[FillValueType]:
239+
"""
240+
Convert the _FillValue attribute from an HDF5 group or dataset into
241+
encoding.
242+
243+
Parameters
244+
----------
245+
h5obj : h5py.Group or h5py.Dataset
246+
An h5py group or dataset.
247+
"""
248+
fillvalue = None
249+
for n, v in h5obj.attrs.items():
250+
if n == "_FillValue":
251+
if isinstance(v, np.ndarray) and v.size == 1:
252+
fillvalue = v.item()
253+
else:
254+
fillvalue = v
255+
fillvalue = FillValueCoder.encode(fillvalue, h5obj.dtype) # type: ignore[arg-type]
256+
return fillvalue
257+
219258
@staticmethod
220259
def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
221260
"""
@@ -240,14 +279,14 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
240279
for n, v in h5obj.attrs.items():
241280
if n in _HIDDEN_ATTRS:
242281
continue
282+
if n == "_FillValue":
283+
continue
243284
# Fix some attribute values to avoid JSON encoding exceptions...
244285
if isinstance(v, bytes):
245286
v = v.decode("utf-8") or " "
246287
elif isinstance(v, (np.ndarray, np.number, np.bool_)):
247288
if v.dtype.kind == "S":
248289
v = v.astype(str)
249-
if n == "_FillValue":
250-
continue
251290
elif v.size == 1:
252291
v = v.flatten()[0]
253292
if isinstance(v, (np.ndarray, np.number, np.bool_)):
@@ -258,7 +297,6 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
258297
v = ""
259298
if v == "DIMENSION_SCALE":
260299
continue
261-
262300
attrs[n] = v
263301
return attrs
264302

@@ -290,21 +328,19 @@ def _dataset_to_variable(
290328
codecs = codecs_from_dataset(dataset)
291329
cfcodec = cfcodec_from_dataset(dataset)
292330
attrs = HDFVirtualBackend._extract_attrs(dataset)
331+
cf_fill_value = HDFVirtualBackend._extract_cf_fill_value(dataset)
332+
attrs.pop("_FillValue", None)
333+
293334
if cfcodec:
294335
codecs.insert(0, cfcodec["codec"])
295336
dtype = cfcodec["target_dtype"]
296337
attrs.pop("scale_factor", None)
297338
attrs.pop("add_offset", None)
298-
fill_value = cfcodec["codec"].decode(dataset.fillvalue)
299339
else:
300340
dtype = dataset.dtype
301-
fill_value = dataset.fillvalue
302-
if isinstance(fill_value, np.ndarray):
303-
fill_value = fill_value[0]
304-
if np.isnan(fill_value):
305-
fill_value = float("nan")
306-
if isinstance(fill_value, np.generic):
307-
fill_value = fill_value.item()
341+
342+
fill_value = dataset.fillvalue.item()
343+
308344
filters = [codec.get_config() for codec in codecs]
309345
zarray = ZArray(
310346
chunks=chunks, # type: ignore
@@ -323,6 +359,8 @@ def _dataset_to_variable(
323359
variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
324360
else:
325361
variable = xr.Variable(data=np.empty(dataset.shape), dims=dims, attrs=attrs)
362+
if cf_fill_value is not None:
363+
variable.encoding["_FillValue"] = cf_fill_value
326364
return variable
327365

328366
@staticmethod

virtualizarr/tests/test_readers/conftest.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,3 +342,64 @@ def non_coord_dim(tmpdir):
342342
ds = ds.drop_dims("dim3")
343343
ds.to_netcdf(filepath, engine="netcdf4")
344344
return filepath
345+
346+
347+
@pytest.fixture
348+
def scalar_fill_value_hdf5_file(tmpdir):
349+
filepath = f"{tmpdir}/scalar_fill_value.nc"
350+
f = h5py.File(filepath, "w")
351+
data = np.random.randint(0, 10, size=(5))
352+
fill_value = 42
353+
f.create_dataset(name="data", data=data, chunks=True, fillvalue=fill_value)
354+
return filepath
355+
356+
357+
compound_dtype = np.dtype(
358+
[
359+
("id", "i4"), # 4-byte integer
360+
("temperature", "f4"), # 4-byte float
361+
]
362+
)
363+
364+
compound_data = np.array(
365+
[
366+
(1, 98.6),
367+
(2, 101.3),
368+
],
369+
dtype=compound_dtype,
370+
)
371+
372+
compound_fill = (-9999, -9999.0)
373+
374+
fill_values = [
375+
{"fill_value": -9999, "data": np.random.randint(0, 10, size=(5))},
376+
{"fill_value": -9999.0, "data": np.random.random(5)},
377+
{"fill_value": np.nan, "data": np.random.random(5)},
378+
{"fill_value": False, "data": np.array([True, False, False, True, True])},
379+
{"fill_value": "NaN", "data": np.array(["three"], dtype="S10")},
380+
{"fill_value": compound_fill, "data": compound_data},
381+
]
382+
383+
384+
@pytest.fixture(params=fill_values)
385+
def cf_fill_value_hdf5_file(tmpdir, request):
386+
filepath = f"{tmpdir}/cf_fill_value.nc"
387+
f = h5py.File(filepath, "w")
388+
dset = f.create_dataset(name="data", data=request.param["data"], chunks=True)
389+
dim_scale = f.create_dataset(
390+
name="dim_scale", data=request.param["data"], chunks=True
391+
)
392+
dim_scale.make_scale()
393+
dset.dims[0].attach_scale(dim_scale)
394+
dset.attrs["_FillValue"] = request.param["fill_value"]
395+
return filepath
396+
397+
398+
@pytest.fixture
399+
def cf_array_fill_value_hdf5_file(tmpdir):
400+
filepath = f"{tmpdir}/cf_array_fill_value.nc"
401+
f = h5py.File(filepath, "w")
402+
data = np.random.random(5)
403+
dset = f.create_dataset(name="data", data=data, chunks=True)
404+
dset.attrs["_FillValue"] = np.array([np.nan])
405+
return filepath

virtualizarr/tests/test_readers/test_hdf/test_hdf.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from unittest.mock import patch
22

33
import h5py # type: ignore
4+
import numpy as np
45
import pytest
56

67
from virtualizarr import open_virtual_dataset
@@ -111,6 +112,36 @@ def test_dataset_attributes(self, string_attributes_hdf5_file):
111112
)
112113
assert var.attrs["attribute_name"] == "attribute_name"
113114

115+
def test_scalar_fill_value(self, scalar_fill_value_hdf5_file):
116+
f = h5py.File(scalar_fill_value_hdf5_file)
117+
ds = f["data"]
118+
var = HDFVirtualBackend._dataset_to_variable(
119+
scalar_fill_value_hdf5_file, ds, group=""
120+
)
121+
assert var.data.zarray.fill_value == 42
122+
123+
def test_cf_fill_value(self, cf_fill_value_hdf5_file):
124+
f = h5py.File(cf_fill_value_hdf5_file)
125+
ds = f["data"]
126+
if ds.dtype.kind in "S":
127+
pytest.xfail("Investigate fixed-length binary encoding in Zarr v3")
128+
if ds.dtype.names:
129+
pytest.xfail(
130+
"To fix, structured dtype fill value encoding for Zarr backend"
131+
)
132+
var = HDFVirtualBackend._dataset_to_variable(
133+
cf_fill_value_hdf5_file, ds, group=""
134+
)
135+
assert "_FillValue" in var.encoding
136+
137+
def test_cf_array_fill_value(self, cf_array_fill_value_hdf5_file):
138+
f = h5py.File(cf_array_fill_value_hdf5_file)
139+
ds = f["data"]
140+
var = HDFVirtualBackend._dataset_to_variable(
141+
cf_array_fill_value_hdf5_file, ds, group=""
142+
)
143+
assert not isinstance(var.encoding["_FillValue"], np.ndarray)
144+
114145

115146
@requires_hdf5plugin
116147
@requires_imagecodecs

virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py

Lines changed: 73 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@
66
from virtualizarr.readers.hdf import HDFVirtualBackend
77
from virtualizarr.tests import (
88
requires_hdf5plugin,
9+
requires_icechunk,
910
requires_imagecodecs,
1011
requires_kerchunk,
1112
)
13+
from virtualizarr.tests.test_integration import roundtrip_as_in_memory_icechunk
1214

1315

1416
@requires_kerchunk
@@ -19,47 +21,81 @@ class TestIntegration:
1921
reason="0 time start is being interpreted as fillvalue see issues/280"
2022
)
2123
def test_filters_h5netcdf_roundtrip(
22-
self, tmpdir, filter_encoded_roundtrip_hdf5_file
24+
self, tmp_path, filter_encoded_roundtrip_hdf5_file
2325
):
24-
ds = xr.open_dataset(filter_encoded_roundtrip_hdf5_file, decode_times=True)
25-
vds = virtualizarr.open_virtual_dataset(
26-
filter_encoded_roundtrip_hdf5_file,
27-
loadable_variables=["time"],
28-
cftime_variables=["time"],
29-
backend=HDFVirtualBackend,
30-
)
31-
kerchunk_file = f"{tmpdir}/kerchunk.json"
32-
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
33-
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True)
34-
xrt.assert_allclose(ds, roundtrip)
26+
with (
27+
xr.open_dataset(
28+
filter_encoded_roundtrip_hdf5_file, decode_times=True
29+
) as ds,
30+
virtualizarr.open_virtual_dataset(
31+
filter_encoded_roundtrip_hdf5_file,
32+
loadable_variables=["time"],
33+
cftime_variables=["time"],
34+
backend=HDFVirtualBackend,
35+
) as vds,
36+
):
37+
kerchunk_file = str(tmp_path / "kerchunk.json")
38+
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
39+
roundtrip = xr.open_dataset(
40+
kerchunk_file, engine="kerchunk", decode_times=True
41+
)
42+
xrt.assert_allclose(ds, roundtrip)
3543

3644
def test_filters_netcdf4_roundtrip(
37-
self, tmpdir, filter_encoded_roundtrip_netcdf4_file
45+
self, tmp_path, filter_encoded_roundtrip_netcdf4_file
3846
):
3947
filepath = filter_encoded_roundtrip_netcdf4_file["filepath"]
40-
ds = xr.open_dataset(filepath)
41-
vds = virtualizarr.open_virtual_dataset(filepath, backend=HDFVirtualBackend)
42-
kerchunk_file = f"{tmpdir}/kerchunk.json"
43-
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
44-
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
45-
xrt.assert_equal(ds, roundtrip)
48+
with (
49+
xr.open_dataset(filepath) as ds,
50+
virtualizarr.open_virtual_dataset(
51+
filepath, backend=HDFVirtualBackend
52+
) as vds,
53+
):
54+
kerchunk_file = str(tmp_path / "kerchunk.json")
55+
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
56+
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
57+
xrt.assert_equal(ds, roundtrip)
4658

47-
def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file):
48-
ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file)
49-
vds = virtualizarr.open_virtual_dataset(
50-
filter_and_cf_roundtrip_hdf5_file, backend=HDFVirtualBackend
51-
)
52-
kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json"
53-
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
54-
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
55-
xrt.assert_allclose(ds, roundtrip)
59+
def test_filter_and_cf_roundtrip(self, tmp_path, filter_and_cf_roundtrip_hdf5_file):
60+
with (
61+
xr.open_dataset(filter_and_cf_roundtrip_hdf5_file) as ds,
62+
virtualizarr.open_virtual_dataset(
63+
filter_and_cf_roundtrip_hdf5_file, backend=HDFVirtualBackend
64+
) as vds,
65+
):
66+
kerchunk_file = str(tmp_path / "filter_cf_kerchunk.json")
67+
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
68+
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
69+
xrt.assert_allclose(ds, roundtrip)
70+
assert (
71+
ds["temperature"].encoding["_FillValue"]
72+
== roundtrip["temperature"].encoding["_FillValue"]
73+
)
5674

57-
def test_non_coord_dim(self, tmpdir, non_coord_dim):
58-
ds = xr.open_dataset(non_coord_dim)
59-
vds = virtualizarr.open_virtual_dataset(
60-
non_coord_dim, backend=HDFVirtualBackend
61-
)
62-
kerchunk_file = f"{tmpdir}/kerchunk.json"
63-
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
64-
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
65-
xrt.assert_equal(ds, roundtrip)
75+
def test_non_coord_dim_roundtrip(self, tmp_path, non_coord_dim):
76+
with (
77+
xr.open_dataset(non_coord_dim) as ds,
78+
virtualizarr.open_virtual_dataset(
79+
non_coord_dim, backend=HDFVirtualBackend
80+
) as vds,
81+
):
82+
kerchunk_file = str(tmp_path / "kerchunk.json")
83+
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
84+
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
85+
xrt.assert_equal(ds, roundtrip)
86+
87+
@requires_icechunk
88+
def test_cf_fill_value_roundtrip(self, tmp_path, cf_fill_value_hdf5_file):
89+
with xr.open_dataset(cf_fill_value_hdf5_file, engine="h5netcdf") as ds:
90+
if ds["data"].dtype in [float, object]:
91+
pytest.xfail(
92+
"TODO: fix handling fixed-length and structured type fill value"
93+
" encoding in xarray zarr backend."
94+
)
95+
with virtualizarr.open_virtual_dataset(
96+
cf_fill_value_hdf5_file, backend=HDFVirtualBackend
97+
) as vds:
98+
roundtrip = roundtrip_as_in_memory_icechunk(
99+
vds, tmp_path, decode_times=False
100+
)
101+
xrt.assert_equal(ds, roundtrip)

0 commit comments

Comments (0)