Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 49 additions & 11 deletions virtualizarr/readers/hdf/hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@
List,
Mapping,
Optional,
Tuple,
Union,
)

import numpy as np
import xarray as xr
from xarray.backends.zarr import FillValueCoder

from virtualizarr.manifests import (
ChunkEntry,
Expand Down Expand Up @@ -40,6 +42,20 @@
H5Dataset: Any = None
H5Group: Any = None

# Union of value types a dataset fill value may take once it has been
# extracted from HDF5 attributes and normalized for encoding.
FillValueType = Union[
    int,
    float,
    bool,
    complex,
    str,
    bytes,  # fixed-length string storage
    np.integer,
    np.floating,
    np.bool_,
    np.complexfloating,
    Tuple[bytes, int],  # structured (compound) type
]


class HDFVirtualBackend(VirtualBackend):
@staticmethod
Expand Down Expand Up @@ -216,6 +232,29 @@ def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:

return [dim.removeprefix(group) for dim in dims]

@staticmethod
def _extract_cf_fill_value(
h5obj: Union[H5Dataset, H5Group],
) -> Optional[FillValueType]:
"""
Convert the _FillValue attribute from an HDF5 group or dataset into
encoding.

Parameters
----------
h5obj : h5py.Group or h5py.Dataset
An h5py group or dataset.
"""
fillvalue = None
for n, v in h5obj.attrs.items():
if n == "_FillValue":
if isinstance(v, np.ndarray) and v.size == 1:
fillvalue = v.item()
else:
fillvalue = v
fillvalue = FillValueCoder.encode(fillvalue, h5obj.dtype) # type: ignore[arg-type]
return fillvalue

@staticmethod
def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
"""
Expand All @@ -240,14 +279,14 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
for n, v in h5obj.attrs.items():
if n in _HIDDEN_ATTRS:
continue
if n == "_FillValue":
continue
# Fix some attribute values to avoid JSON encoding exceptions...
if isinstance(v, bytes):
v = v.decode("utf-8") or " "
elif isinstance(v, (np.ndarray, np.number, np.bool_)):
if v.dtype.kind == "S":
v = v.astype(str)
if n == "_FillValue":
continue
elif v.size == 1:
v = v.flatten()[0]
if isinstance(v, (np.ndarray, np.number, np.bool_)):
Expand All @@ -258,7 +297,6 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
v = ""
if v == "DIMENSION_SCALE":
continue

attrs[n] = v
return attrs

Expand Down Expand Up @@ -290,21 +328,19 @@ def _dataset_to_variable(
codecs = codecs_from_dataset(dataset)
cfcodec = cfcodec_from_dataset(dataset)
attrs = HDFVirtualBackend._extract_attrs(dataset)
cf_fill_value = HDFVirtualBackend._extract_cf_fill_value(dataset)
attrs.pop("_FillValue", None)

if cfcodec:
codecs.insert(0, cfcodec["codec"])
dtype = cfcodec["target_dtype"]
attrs.pop("scale_factor", None)
attrs.pop("add_offset", None)
fill_value = cfcodec["codec"].decode(dataset.fillvalue)
else:
dtype = dataset.dtype
fill_value = dataset.fillvalue
if isinstance(fill_value, np.ndarray):
fill_value = fill_value[0]
if np.isnan(fill_value):
fill_value = float("nan")
if isinstance(fill_value, np.generic):
fill_value = fill_value.item()

fill_value = dataset.fillvalue.item()

filters = [codec.get_config() for codec in codecs]
zarray = ZArray(
chunks=chunks, # type: ignore
Expand All @@ -323,6 +359,8 @@ def _dataset_to_variable(
variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
else:
variable = xr.Variable(data=np.empty(dataset.shape), dims=dims, attrs=attrs)
if cf_fill_value is not None:
variable.encoding["_FillValue"] = cf_fill_value
return variable

@staticmethod
Expand Down
61 changes: 61 additions & 0 deletions virtualizarr/tests/test_readers/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,3 +342,64 @@ def non_coord_dim(tmpdir):
ds = ds.drop_dims("dim3")
ds.to_netcdf(filepath, engine="netcdf4")
return filepath


@pytest.fixture
def scalar_fill_value_hdf5_file(tmpdir):
    """Create an HDF5 file with one chunked dataset carrying a scalar fillvalue.

    Returns the path to the file.
    """
    filepath = f"{tmpdir}/scalar_fill_value.nc"
    data = np.random.randint(0, 10, size=(5))
    fill_value = 42
    # Use a context manager so the file handle is closed and the data is
    # flushed to disk before the test re-opens the file (original leaked it).
    with h5py.File(filepath, "w") as f:
        f.create_dataset(name="data", data=data, chunks=True, fillvalue=fill_value)
    return filepath


# Structured (compound) dtype used to exercise fill-value handling for
# compound datasets: a 4-byte integer id paired with a 4-byte float reading.
compound_dtype = np.dtype([("id", "i4"), ("temperature", "f4")])

compound_data = np.array([(1, 98.6), (2, 101.3)], dtype=compound_dtype)

compound_fill = (-9999, -9999.0)

# Parameter cases pairing a _FillValue with matching sample data: integer,
# float, NaN, boolean, fixed-length bytes, and structured dtypes.
fill_values = [
    {"fill_value": -9999, "data": np.random.randint(0, 10, size=(5))},
    {"fill_value": -9999.0, "data": np.random.random(5)},
    {"fill_value": np.nan, "data": np.random.random(5)},
    {"fill_value": False, "data": np.array([True, False, False, True, True])},
    {"fill_value": "NaN", "data": np.array(["three"], dtype="S10")},
    {"fill_value": compound_fill, "data": compound_data},
]


@pytest.fixture(params=fill_values)
def cf_fill_value_hdf5_file(tmpdir, request):
    """Create an HDF5 file whose dataset carries a CF-style _FillValue attribute.

    Parametrized over ``fill_values`` (int, float, NaN, bool, bytes,
    structured). Returns the path to the file.
    """
    filepath = f"{tmpdir}/cf_fill_value.nc"
    # Context manager closes the handle so the attributes and dimension
    # scale are flushed before the test re-opens the file (original leaked it).
    with h5py.File(filepath, "w") as f:
        dset = f.create_dataset(name="data", data=request.param["data"], chunks=True)
        dim_scale = f.create_dataset(
            name="dim_scale", data=request.param["data"], chunks=True
        )
        dim_scale.make_scale()
        dset.dims[0].attach_scale(dim_scale)
        dset.attrs["_FillValue"] = request.param["fill_value"]
    return filepath


@pytest.fixture
def cf_array_fill_value_hdf5_file(tmpdir):
    """Create an HDF5 file whose _FillValue attribute is a one-element ndarray.

    Returns the path to the file.
    """
    filepath = f"{tmpdir}/cf_array_fill_value.nc"
    data = np.random.random(5)
    # Close the file so the attribute is flushed before the test reads it
    # back (original left the handle open).
    with h5py.File(filepath, "w") as f:
        dset = f.create_dataset(name="data", data=data, chunks=True)
        dset.attrs["_FillValue"] = np.array([np.nan])
    return filepath
31 changes: 31 additions & 0 deletions virtualizarr/tests/test_readers/test_hdf/test_hdf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from unittest.mock import patch

import h5py # type: ignore
import numpy as np
import pytest

from virtualizarr import open_virtual_dataset
Expand Down Expand Up @@ -111,6 +112,36 @@ def test_dataset_attributes(self, string_attributes_hdf5_file):
)
assert var.attrs["attribute_name"] == "attribute_name"

def test_scalar_fill_value(self, scalar_fill_value_hdf5_file):
f = h5py.File(scalar_fill_value_hdf5_file)
ds = f["data"]
var = HDFVirtualBackend._dataset_to_variable(
scalar_fill_value_hdf5_file, ds, group=""
)
assert var.data.zarray.fill_value == 42

def test_cf_fill_value(self, cf_fill_value_hdf5_file):
f = h5py.File(cf_fill_value_hdf5_file)
ds = f["data"]
if ds.dtype.kind in "S":
pytest.xfail("Investigate fixed-length binary encoding in Zarr v3")
if ds.dtype.names:
pytest.xfail(
"To fix, structured dtype fill value encoding for Zarr backend"
)
var = HDFVirtualBackend._dataset_to_variable(
cf_fill_value_hdf5_file, ds, group=""
)
assert "_FillValue" in var.encoding

def test_cf_array_fill_value(self, cf_array_fill_value_hdf5_file):
f = h5py.File(cf_array_fill_value_hdf5_file)
ds = f["data"]
var = HDFVirtualBackend._dataset_to_variable(
cf_array_fill_value_hdf5_file, ds, group=""
)
assert not isinstance(var.encoding["_FillValue"], np.ndarray)


@requires_hdf5plugin
@requires_imagecodecs
Expand Down
110 changes: 73 additions & 37 deletions virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@
from virtualizarr.readers.hdf import HDFVirtualBackend
from virtualizarr.tests import (
requires_hdf5plugin,
requires_icechunk,
requires_imagecodecs,
requires_kerchunk,
)
from virtualizarr.tests.test_integration import roundtrip_as_in_memory_icechunk


@requires_kerchunk
Expand All @@ -19,47 +21,81 @@ class TestIntegration:
reason="0 time start is being interpreted as fillvalue see issues/280"
)
def test_filters_h5netcdf_roundtrip(
self, tmpdir, filter_encoded_roundtrip_hdf5_file
self, tmp_path, filter_encoded_roundtrip_hdf5_file
):
ds = xr.open_dataset(filter_encoded_roundtrip_hdf5_file, decode_times=True)
vds = virtualizarr.open_virtual_dataset(
filter_encoded_roundtrip_hdf5_file,
loadable_variables=["time"],
cftime_variables=["time"],
backend=HDFVirtualBackend,
)
kerchunk_file = f"{tmpdir}/kerchunk.json"
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True)
xrt.assert_allclose(ds, roundtrip)
with (
xr.open_dataset(
filter_encoded_roundtrip_hdf5_file, decode_times=True
) as ds,
virtualizarr.open_virtual_dataset(
filter_encoded_roundtrip_hdf5_file,
loadable_variables=["time"],
cftime_variables=["time"],
backend=HDFVirtualBackend,
) as vds,
):
kerchunk_file = str(tmp_path / "kerchunk.json")
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(
kerchunk_file, engine="kerchunk", decode_times=True
)
xrt.assert_allclose(ds, roundtrip)

def test_filters_netcdf4_roundtrip(
self, tmpdir, filter_encoded_roundtrip_netcdf4_file
self, tmp_path, filter_encoded_roundtrip_netcdf4_file
):
filepath = filter_encoded_roundtrip_netcdf4_file["filepath"]
ds = xr.open_dataset(filepath)
vds = virtualizarr.open_virtual_dataset(filepath, backend=HDFVirtualBackend)
kerchunk_file = f"{tmpdir}/kerchunk.json"
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_equal(ds, roundtrip)
with (
xr.open_dataset(filepath) as ds,
virtualizarr.open_virtual_dataset(
filepath, backend=HDFVirtualBackend
) as vds,
):
kerchunk_file = str(tmp_path / "kerchunk.json")
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_equal(ds, roundtrip)

def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file):
ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file)
vds = virtualizarr.open_virtual_dataset(
filter_and_cf_roundtrip_hdf5_file, backend=HDFVirtualBackend
)
kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json"
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_allclose(ds, roundtrip)
def test_filter_and_cf_roundtrip(self, tmp_path, filter_and_cf_roundtrip_hdf5_file):
with (
xr.open_dataset(filter_and_cf_roundtrip_hdf5_file) as ds,
virtualizarr.open_virtual_dataset(
filter_and_cf_roundtrip_hdf5_file, backend=HDFVirtualBackend
) as vds,
):
kerchunk_file = str(tmp_path / "filter_cf_kerchunk.json")
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_allclose(ds, roundtrip)
assert (
ds["temperature"].encoding["_FillValue"]
== roundtrip["temperature"].encoding["_FillValue"]
)

def test_non_coord_dim(self, tmpdir, non_coord_dim):
ds = xr.open_dataset(non_coord_dim)
vds = virtualizarr.open_virtual_dataset(
non_coord_dim, backend=HDFVirtualBackend
)
kerchunk_file = f"{tmpdir}/kerchunk.json"
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_equal(ds, roundtrip)
def test_non_coord_dim_roundtrip(self, tmp_path, non_coord_dim):
with (
xr.open_dataset(non_coord_dim) as ds,
virtualizarr.open_virtual_dataset(
non_coord_dim, backend=HDFVirtualBackend
) as vds,
):
kerchunk_file = str(tmp_path / "kerchunk.json")
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_equal(ds, roundtrip)

@requires_icechunk
def test_cf_fill_value_roundtrip(self, tmp_path, cf_fill_value_hdf5_file):
with xr.open_dataset(cf_fill_value_hdf5_file, engine="h5netcdf") as ds:
if ds["data"].dtype in [float, object]:
pytest.xfail(
"TODO: fix handling fixed-length and structured type fill value"
" encoding in xarray zarr backend."
)
with virtualizarr.open_virtual_dataset(
cf_fill_value_hdf5_file, backend=HDFVirtualBackend
) as vds:
roundtrip = roundtrip_as_in_memory_icechunk(
vds, tmp_path, decode_times=False
)
xrt.assert_equal(ds, roundtrip)