diff --git a/virtualizarr/readers/hdf/hdf.py b/virtualizarr/readers/hdf/hdf.py
index fc55c1bd..2b45cd9a 100644
--- a/virtualizarr/readers/hdf/hdf.py
+++ b/virtualizarr/readers/hdf/hdf.py
@@ -8,13 +8,11 @@
     List,
     Mapping,
     Optional,
-    Tuple,
     Union,
 )
 
 import numpy as np
 import xarray as xr
-from xarray.backends.zarr import FillValueCoder
 
 from virtualizarr.manifests import (
     ChunkEntry,
@@ -42,20 +40,6 @@
     H5Dataset: Any = None
     H5Group: Any = None
 
-FillValueType = Union[
-    int,
-    float,
-    bool,
-    complex,
-    str,
-    np.integer,
-    np.floating,
-    np.bool_,
-    np.complexfloating,
-    bytes,  # For fixed-length string storage
-    Tuple[bytes, int],  # Structured type
-]
-
 
 class HDFVirtualBackend(VirtualBackend):
     @staticmethod
@@ -232,29 +216,6 @@ def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:
 
         return [dim.removeprefix(group) for dim in dims]
 
-    @staticmethod
-    def _extract_cf_fill_value(
-        h5obj: Union[H5Dataset, H5Group],
-    ) -> Optional[FillValueType]:
-        """
-        Convert the _FillValue attribute from an HDF5 group or dataset into
-        encoding.
-
-        Parameters
-        ----------
-        h5obj : h5py.Group or h5py.Dataset
-            An h5py group or dataset.
-        """
-        fillvalue = None
-        for n, v in h5obj.attrs.items():
-            if n == "_FillValue":
-                if isinstance(v, np.ndarray) and v.size == 1:
-                    fillvalue = v.item()
-                else:
-                    fillvalue = v
-                fillvalue = FillValueCoder.encode(fillvalue, h5obj.dtype)  # type: ignore[arg-type]
-        return fillvalue
-
     @staticmethod
     def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
         """
@@ -279,14 +240,14 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
         for n, v in h5obj.attrs.items():
             if n in _HIDDEN_ATTRS:
                 continue
-            if n == "_FillValue":
-                continue
             # Fix some attribute values to avoid JSON encoding exceptions...
             if isinstance(v, bytes):
                 v = v.decode("utf-8") or " "
             elif isinstance(v, (np.ndarray, np.number, np.bool_)):
                 if v.dtype.kind == "S":
                     v = v.astype(str)
+                if n == "_FillValue":
+                    continue
                 elif v.size == 1:
                     v = v.flatten()[0]
                 if isinstance(v, (np.ndarray, np.number, np.bool_)):
@@ -297,6 +258,7 @@
                 v = ""
             if v == "DIMENSION_SCALE":
                 continue
+
             attrs[n] = v
         return attrs
 
@@ -328,19 +290,21 @@ def _dataset_to_variable(
         codecs = codecs_from_dataset(dataset)
         cfcodec = cfcodec_from_dataset(dataset)
         attrs = HDFVirtualBackend._extract_attrs(dataset)
-        cf_fill_value = HDFVirtualBackend._extract_cf_fill_value(dataset)
-        attrs.pop("_FillValue", None)
-
         if cfcodec:
             codecs.insert(0, cfcodec["codec"])
             dtype = cfcodec["target_dtype"]
             attrs.pop("scale_factor", None)
             attrs.pop("add_offset", None)
+            fill_value = cfcodec["codec"].decode(dataset.fillvalue)
         else:
             dtype = dataset.dtype
-
-        fill_value = dataset.fillvalue.item()
-
+            fill_value = dataset.fillvalue
+        if isinstance(fill_value, np.ndarray):
+            fill_value = fill_value[0]
+        if np.isnan(fill_value):
+            fill_value = float("nan")
+        if isinstance(fill_value, np.generic):
+            fill_value = fill_value.item()
         filters = [codec.get_config() for codec in codecs]
         zarray = ZArray(
             chunks=chunks,  # type: ignore
@@ -359,8 +323,6 @@
             variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
         else:
             variable = xr.Variable(data=np.empty(dataset.shape), dims=dims, attrs=attrs)
-        if cf_fill_value is not None:
-            variable.encoding["_FillValue"] = cf_fill_value
         return variable
 
     @staticmethod
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 4884db4a..8d0c1997 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -342,64 +342,3 @@ def non_coord_dim(tmpdir):
     ds = ds.drop_dims("dim3")
     ds.to_netcdf(filepath, engine="netcdf4")
     return filepath
-
-
-@pytest.fixture
-def scalar_fill_value_hdf5_file(tmpdir):
-    filepath = f"{tmpdir}/scalar_fill_value.nc"
-    f = h5py.File(filepath, "w")
-    data = np.random.randint(0, 10, size=(5))
-    fill_value = 42
-    f.create_dataset(name="data", data=data, chunks=True, fillvalue=fill_value)
-    return filepath
-
-
-compound_dtype = np.dtype(
-    [
-        ("id", "i4"),  # 4-byte integer
-        ("temperature", "f4"),  # 4-byte float
-    ]
-)
-
-compound_data = np.array(
-    [
-        (1, 98.6),
-        (2, 101.3),
-    ],
-    dtype=compound_dtype,
-)
-
-compound_fill = (-9999, -9999.0)
-
-fill_values = [
-    {"fill_value": -9999, "data": np.random.randint(0, 10, size=(5))},
-    {"fill_value": -9999.0, "data": np.random.random(5)},
-    {"fill_value": np.nan, "data": np.random.random(5)},
-    {"fill_value": False, "data": np.array([True, False, False, True, True])},
-    {"fill_value": "NaN", "data": np.array(["three"], dtype="S10")},
-    {"fill_value": compound_fill, "data": compound_data},
-]
-
-
-@pytest.fixture(params=fill_values)
-def cf_fill_value_hdf5_file(tmpdir, request):
-    filepath = f"{tmpdir}/cf_fill_value.nc"
-    f = h5py.File(filepath, "w")
-    dset = f.create_dataset(name="data", data=request.param["data"], chunks=True)
-    dim_scale = f.create_dataset(
-        name="dim_scale", data=request.param["data"], chunks=True
-    )
-    dim_scale.make_scale()
-    dset.dims[0].attach_scale(dim_scale)
-    dset.attrs["_FillValue"] = request.param["fill_value"]
-    return filepath
-
-
-@pytest.fixture
-def cf_array_fill_value_hdf5_file(tmpdir):
-    filepath = f"{tmpdir}/cf_array_fill_value.nc"
-    f = h5py.File(filepath, "w")
-    data = np.random.random(5)
-    dset = f.create_dataset(name="data", data=data, chunks=True)
-    dset.attrs["_FillValue"] = np.array([np.nan])
-    return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf/test_hdf.py
index d5146e88..598848a5 100644
--- a/virtualizarr/tests/test_readers/test_hdf/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf/test_hdf.py
@@ -1,7 +1,6 @@
 from unittest.mock import patch
 
 import h5py  # type: ignore
-import numpy as np
 import pytest
 
 from virtualizarr import open_virtual_dataset
@@ -112,36 +111,6 @@ def test_dataset_attributes(self, string_attributes_hdf5_file):
         )
         assert var.attrs["attribute_name"] == "attribute_name"
 
-    def test_scalar_fill_value(self, scalar_fill_value_hdf5_file):
-        f = h5py.File(scalar_fill_value_hdf5_file)
-        ds = f["data"]
-        var = HDFVirtualBackend._dataset_to_variable(
-            scalar_fill_value_hdf5_file, ds, group=""
-        )
-        assert var.data.zarray.fill_value == 42
-
-    def test_cf_fill_value(self, cf_fill_value_hdf5_file):
-        f = h5py.File(cf_fill_value_hdf5_file)
-        ds = f["data"]
-        if ds.dtype.kind in "S":
-            pytest.xfail("Investigate fixed-length binary encoding in Zarr v3")
-        if ds.dtype.names:
-            pytest.xfail(
-                "To fix, structured dtype fill value encoding for Zarr backend"
-            )
-        var = HDFVirtualBackend._dataset_to_variable(
-            cf_fill_value_hdf5_file, ds, group=""
-        )
-        assert "_FillValue" in var.encoding
-
-    def test_cf_array_fill_value(self, cf_array_fill_value_hdf5_file):
-        f = h5py.File(cf_array_fill_value_hdf5_file)
-        ds = f["data"]
-        var = HDFVirtualBackend._dataset_to_variable(
-            cf_array_fill_value_hdf5_file, ds, group=""
-        )
-        assert not isinstance(var.encoding["_FillValue"], np.ndarray)
-
 
 @requires_hdf5plugin
 @requires_imagecodecs
diff --git a/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py
index bf1aceb1..8583f210 100644
--- a/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py
@@ -6,11 +6,9 @@
 from virtualizarr.readers.hdf import HDFVirtualBackend
 from virtualizarr.tests import (
     requires_hdf5plugin,
-    requires_icechunk,
     requires_imagecodecs,
     requires_kerchunk,
 )
-from virtualizarr.tests.test_integration import roundtrip_as_in_memory_icechunk
 
 
 @requires_kerchunk
@@ -55,12 +53,8 @@ def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file
         vds.virtualize.to_kerchunk(kerchunk_file, format="json")
         roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
         xrt.assert_allclose(ds, roundtrip)
-        assert (
-            ds["temperature"].encoding["_FillValue"]
-            == roundtrip["temperature"].encoding["_FillValue"]
-        )
 
-    def test_non_coord_dim_roundtrip(self, tmpdir, non_coord_dim):
+    def test_non_coord_dim(self, tmpdir, non_coord_dim):
         ds = xr.open_dataset(non_coord_dim)
         vds = virtualizarr.open_virtual_dataset(
             non_coord_dim, backend=HDFVirtualBackend
@@ -69,18 +63,3 @@ def test_non_coord_dim_roundtrip(self, tmpdir, non_coord_dim):
         vds.virtualize.to_kerchunk(kerchunk_file, format="json")
         roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
         xrt.assert_equal(ds, roundtrip)
-
-    @requires_icechunk
-    def test_cf_fill_value_roundtrip(self, tmpdir, cf_fill_value_hdf5_file):
-        ds = xr.open_dataset(cf_fill_value_hdf5_file, engine="h5netcdf")
-        if ds["data"].dtype in [float, object]:
-            pytest.xfail(
-                "To fix handle fixed-length and structured type fill value \
-                encoding in xarray zarr backend."
-            )
-        vds = virtualizarr.open_virtual_dataset(
-            cf_fill_value_hdf5_file,
-            backend=HDFVirtualBackend,
-        )
-        roundtrip = roundtrip_as_in_memory_icechunk(vds, tmpdir, decode_times=False)
-        xrt.assert_equal(ds, roundtrip)
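
For reference, the `+` lines restored in `_dataset_to_variable` above reduce `dataset.fillvalue` to a plain Python scalar before it is handed to `ZArray`. Below is a minimal standalone sketch of that normalization chain; the `normalize_fill_value` helper name is hypothetical (the reader inlines these steps), and like the restored code it assumes a numeric fill value, since `np.isnan` raises TypeError for string and structured dtypes.

import numpy as np


def normalize_fill_value(fill_value):
    # Unwrap a single-element array, e.g. an attribute stored as np.array([np.nan]).
    if isinstance(fill_value, np.ndarray):
        fill_value = fill_value[0]
    # Canonicalize NaN to the native Python float NaN.
    if np.isnan(fill_value):
        fill_value = float("nan")
    # Convert any remaining NumPy scalar (np.int32, np.float64, ...) to a Python scalar.
    if isinstance(fill_value, np.generic):
        fill_value = fill_value.item()
    return fill_value


print(normalize_fill_value(np.array([np.nan])))  # nan (a Python float)
print(normalize_fill_value(np.int32(42)))        # 42 (a Python int)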