Skip to content

Commit c461b33

Browse files
Align HDF reader CF _FillValue with Zarr v3 semantics (#420)
* Remove unnecessary h5py fillvalue extraction logic. * Convert CF _FillValue attr to variable encoding for HDF files. * Handle xarray zarr backend decoding logic. * Include temporary broken integration tests for _FillValue. * Include _FillValue encoding integration tests for HDFVirtualBackend.
1 parent 001ddc1 commit c461b33

File tree

4 files changed

+163
-12
lines changed

4 files changed

+163
-12
lines changed

virtualizarr/readers/hdf/hdf.py

Lines changed: 49 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,13 @@
88
List,
99
Mapping,
1010
Optional,
11+
Tuple,
1112
Union,
1213
)
1314

1415
import numpy as np
1516
import xarray as xr
17+
from xarray.backends.zarr import FillValueCoder
1618

1719
from virtualizarr.manifests import (
1820
ChunkEntry,
@@ -40,6 +42,20 @@
4042
H5Dataset: Any = None
4143
H5Group: Any = None
4244

45+
FillValueType = Union[
46+
int,
47+
float,
48+
bool,
49+
complex,
50+
str,
51+
np.integer,
52+
np.floating,
53+
np.bool_,
54+
np.complexfloating,
55+
bytes, # For fixed-length string storage
56+
Tuple[bytes, int], # Structured type
57+
]
58+
4359

4460
class HDFVirtualBackend(VirtualBackend):
4561
@staticmethod
@@ -216,6 +232,29 @@ def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:
216232

217233
return [dim.removeprefix(group) for dim in dims]
218234

235+
@staticmethod
236+
def _extract_cf_fill_value(
237+
h5obj: Union[H5Dataset, H5Group],
238+
) -> Optional[FillValueType]:
239+
"""
240+
Convert the _FillValue attribute from an HDF5 group or dataset into
241+
encoding.
242+
243+
Parameters
244+
----------
245+
h5obj : h5py.Group or h5py.Dataset
246+
An h5py group or dataset.
247+
"""
248+
fillvalue = None
249+
for n, v in h5obj.attrs.items():
250+
if n == "_FillValue":
251+
if isinstance(v, np.ndarray) and v.size == 1:
252+
fillvalue = v.item()
253+
else:
254+
fillvalue = v
255+
fillvalue = FillValueCoder.encode(fillvalue, h5obj.dtype) # type: ignore[arg-type]
256+
return fillvalue
257+
219258
@staticmethod
220259
def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
221260
"""
@@ -240,14 +279,14 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
240279
for n, v in h5obj.attrs.items():
241280
if n in _HIDDEN_ATTRS:
242281
continue
282+
if n == "_FillValue":
283+
continue
243284
# Fix some attribute values to avoid JSON encoding exceptions...
244285
if isinstance(v, bytes):
245286
v = v.decode("utf-8") or " "
246287
elif isinstance(v, (np.ndarray, np.number, np.bool_)):
247288
if v.dtype.kind == "S":
248289
v = v.astype(str)
249-
if n == "_FillValue":
250-
continue
251290
elif v.size == 1:
252291
v = v.flatten()[0]
253292
if isinstance(v, (np.ndarray, np.number, np.bool_)):
@@ -258,7 +297,6 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
258297
v = ""
259298
if v == "DIMENSION_SCALE":
260299
continue
261-
262300
attrs[n] = v
263301
return attrs
264302

@@ -290,21 +328,19 @@ def _dataset_to_variable(
290328
codecs = codecs_from_dataset(dataset)
291329
cfcodec = cfcodec_from_dataset(dataset)
292330
attrs = HDFVirtualBackend._extract_attrs(dataset)
331+
cf_fill_value = HDFVirtualBackend._extract_cf_fill_value(dataset)
332+
attrs.pop("_FillValue", None)
333+
293334
if cfcodec:
294335
codecs.insert(0, cfcodec["codec"])
295336
dtype = cfcodec["target_dtype"]
296337
attrs.pop("scale_factor", None)
297338
attrs.pop("add_offset", None)
298-
fill_value = cfcodec["codec"].decode(dataset.fillvalue)
299339
else:
300340
dtype = dataset.dtype
301-
fill_value = dataset.fillvalue
302-
if isinstance(fill_value, np.ndarray):
303-
fill_value = fill_value[0]
304-
if np.isnan(fill_value):
305-
fill_value = float("nan")
306-
if isinstance(fill_value, np.generic):
307-
fill_value = fill_value.item()
341+
342+
fill_value = dataset.fillvalue.item()
343+
308344
filters = [codec.get_config() for codec in codecs]
309345
zarray = ZArray(
310346
chunks=chunks, # type: ignore
@@ -323,6 +359,8 @@ def _dataset_to_variable(
323359
variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
324360
else:
325361
variable = xr.Variable(data=np.empty(dataset.shape), dims=dims, attrs=attrs)
362+
if cf_fill_value is not None:
363+
variable.encoding["_FillValue"] = cf_fill_value
326364
return variable
327365

328366
@staticmethod

virtualizarr/tests/test_readers/conftest.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,3 +342,64 @@ def non_coord_dim(tmpdir):
342342
ds = ds.drop_dims("dim3")
343343
ds.to_netcdf(filepath, engine="netcdf4")
344344
return filepath
345+
346+
347+
@pytest.fixture
348+
def scalar_fill_value_hdf5_file(tmpdir):
349+
filepath = f"{tmpdir}/scalar_fill_value.nc"
350+
f = h5py.File(filepath, "w")
351+
data = np.random.randint(0, 10, size=(5))
352+
fill_value = 42
353+
f.create_dataset(name="data", data=data, chunks=True, fillvalue=fill_value)
354+
return filepath
355+
356+
357+
compound_dtype = np.dtype(
358+
[
359+
("id", "i4"), # 4-byte integer
360+
("temperature", "f4"), # 4-byte float
361+
]
362+
)
363+
364+
compound_data = np.array(
365+
[
366+
(1, 98.6),
367+
(2, 101.3),
368+
],
369+
dtype=compound_dtype,
370+
)
371+
372+
compound_fill = (-9999, -9999.0)
373+
374+
fill_values = [
375+
{"fill_value": -9999, "data": np.random.randint(0, 10, size=(5))},
376+
{"fill_value": -9999.0, "data": np.random.random(5)},
377+
{"fill_value": np.nan, "data": np.random.random(5)},
378+
{"fill_value": False, "data": np.array([True, False, False, True, True])},
379+
{"fill_value": "NaN", "data": np.array(["three"], dtype="S10")},
380+
{"fill_value": compound_fill, "data": compound_data},
381+
]
382+
383+
384+
@pytest.fixture(params=fill_values)
385+
def cf_fill_value_hdf5_file(tmpdir, request):
386+
filepath = f"{tmpdir}/cf_fill_value.nc"
387+
f = h5py.File(filepath, "w")
388+
dset = f.create_dataset(name="data", data=request.param["data"], chunks=True)
389+
dim_scale = f.create_dataset(
390+
name="dim_scale", data=request.param["data"], chunks=True
391+
)
392+
dim_scale.make_scale()
393+
dset.dims[0].attach_scale(dim_scale)
394+
dset.attrs["_FillValue"] = request.param["fill_value"]
395+
return filepath
396+
397+
398+
@pytest.fixture
399+
def cf_array_fill_value_hdf5_file(tmpdir):
400+
filepath = f"{tmpdir}/cf_array_fill_value.nc"
401+
f = h5py.File(filepath, "w")
402+
data = np.random.random(5)
403+
dset = f.create_dataset(name="data", data=data, chunks=True)
404+
dset.attrs["_FillValue"] = np.array([np.nan])
405+
return filepath

virtualizarr/tests/test_readers/test_hdf/test_hdf.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from unittest.mock import patch
22

33
import h5py # type: ignore
4+
import numpy as np
45
import pytest
56

67
from virtualizarr import open_virtual_dataset
@@ -111,6 +112,36 @@ def test_dataset_attributes(self, string_attributes_hdf5_file):
111112
)
112113
assert var.attrs["attribute_name"] == "attribute_name"
113114

115+
def test_scalar_fill_value(self, scalar_fill_value_hdf5_file):
116+
f = h5py.File(scalar_fill_value_hdf5_file)
117+
ds = f["data"]
118+
var = HDFVirtualBackend._dataset_to_variable(
119+
scalar_fill_value_hdf5_file, ds, group=""
120+
)
121+
assert var.data.zarray.fill_value == 42
122+
123+
def test_cf_fill_value(self, cf_fill_value_hdf5_file):
124+
f = h5py.File(cf_fill_value_hdf5_file)
125+
ds = f["data"]
126+
if ds.dtype.kind in "S":
127+
pytest.xfail("Investigate fixed-length binary encoding in Zarr v3")
128+
if ds.dtype.names:
129+
pytest.xfail(
130+
"To fix, structured dtype fill value encoding for Zarr backend"
131+
)
132+
var = HDFVirtualBackend._dataset_to_variable(
133+
cf_fill_value_hdf5_file, ds, group=""
134+
)
135+
assert "_FillValue" in var.encoding
136+
137+
def test_cf_array_fill_value(self, cf_array_fill_value_hdf5_file):
138+
f = h5py.File(cf_array_fill_value_hdf5_file)
139+
ds = f["data"]
140+
var = HDFVirtualBackend._dataset_to_variable(
141+
cf_array_fill_value_hdf5_file, ds, group=""
142+
)
143+
assert not isinstance(var.encoding["_FillValue"], np.ndarray)
144+
114145

115146
@requires_hdf5plugin
116147
@requires_imagecodecs

virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@
66
from virtualizarr.readers.hdf import HDFVirtualBackend
77
from virtualizarr.tests import (
88
requires_hdf5plugin,
9+
requires_icechunk,
910
requires_imagecodecs,
1011
requires_kerchunk,
1112
)
13+
from virtualizarr.tests.test_integration import roundtrip_as_in_memory_icechunk
1214

1315

1416
@requires_kerchunk
@@ -53,8 +55,12 @@ def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file
5355
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
5456
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
5557
xrt.assert_allclose(ds, roundtrip)
58+
assert (
59+
ds["temperature"].encoding["_FillValue"]
60+
== roundtrip["temperature"].encoding["_FillValue"]
61+
)
5662

57-
def test_non_coord_dim(self, tmpdir, non_coord_dim):
63+
def test_non_coord_dim_roundtrip(self, tmpdir, non_coord_dim):
5864
ds = xr.open_dataset(non_coord_dim)
5965
vds = virtualizarr.open_virtual_dataset(
6066
non_coord_dim, backend=HDFVirtualBackend
@@ -63,3 +69,18 @@ def test_non_coord_dim(self, tmpdir, non_coord_dim):
6369
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
6470
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
6571
xrt.assert_equal(ds, roundtrip)
72+
73+
@requires_icechunk
74+
def test_cf_fill_value_roundtrip(self, tmpdir, cf_fill_value_hdf5_file):
75+
ds = xr.open_dataset(cf_fill_value_hdf5_file, engine="h5netcdf")
76+
if ds["data"].dtype in [float, object]:
77+
pytest.xfail(
78+
"To fix handle fixed-length and structured type fill value \
79+
encoding in xarray zarr backend."
80+
)
81+
vds = virtualizarr.open_virtual_dataset(
82+
cf_fill_value_hdf5_file,
83+
backend=HDFVirtualBackend,
84+
)
85+
roundtrip = roundtrip_as_in_memory_icechunk(vds, tmpdir, decode_times=False)
86+
xrt.assert_equal(ds, roundtrip)

0 commit comments

Comments
 (0)