Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 11 additions & 49 deletions virtualizarr/readers/hdf/hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,11 @@
List,
Mapping,
Optional,
Tuple,
Union,
)

import numpy as np
import xarray as xr
from xarray.backends.zarr import FillValueCoder

from virtualizarr.manifests import (
ChunkEntry,
Expand Down Expand Up @@ -42,20 +40,6 @@
H5Dataset: Any = None
H5Group: Any = None

# Every Python-level type a fill value may take. Nested unions are flattened
# by ``typing``, so the grouping below is purely for readability.
FillValueType = Union[
    # Built-in Python scalars.
    Union[int, float, bool, complex, str],
    # NumPy scalar families.
    Union[np.integer, np.floating, np.bool_, np.complexfloating],
    # Fixed-length string storage.
    bytes,
    # Structured type.
    Tuple[bytes, int],
]


class HDFVirtualBackend(VirtualBackend):
@staticmethod
Expand Down Expand Up @@ -232,29 +216,6 @@ def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:

return [dim.removeprefix(group) for dim in dims]

@staticmethod
def _extract_cf_fill_value(
    h5obj: Union[H5Dataset, H5Group],
) -> Optional[FillValueType]:
    """
    Convert the _FillValue attribute from an HDF5 group or dataset into
    encoding.

    Parameters
    ----------
    h5obj : h5py.Group or h5py.Dataset
        An h5py group or dataset. When a ``_FillValue`` attribute is present
        the object's ``dtype`` is read, so in practice this is expected to be
        a dataset.

    Returns
    -------
    The ``_FillValue`` attribute encoded with ``FillValueCoder.encode`` for
    the object's dtype, or ``None`` when the attribute is absent.
    """
    # Look the attribute up directly rather than scanning (and reading) every
    # attribute attached to the object just to find a single key.
    if "_FillValue" not in h5obj.attrs:
        return None

    raw = h5obj.attrs["_FillValue"]
    # A one-element array is unwrapped to a plain Python scalar; anything
    # else is handed to the coder unchanged.
    if isinstance(raw, np.ndarray) and raw.size == 1:
        raw = raw.item()
    return FillValueCoder.encode(raw, h5obj.dtype)  # type: ignore[arg-type]

@staticmethod
def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
"""
Expand All @@ -279,14 +240,14 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
for n, v in h5obj.attrs.items():
if n in _HIDDEN_ATTRS:
continue
if n == "_FillValue":
continue
# Fix some attribute values to avoid JSON encoding exceptions...
if isinstance(v, bytes):
v = v.decode("utf-8") or " "
elif isinstance(v, (np.ndarray, np.number, np.bool_)):
if v.dtype.kind == "S":
v = v.astype(str)
if n == "_FillValue":
continue
elif v.size == 1:
v = v.flatten()[0]
if isinstance(v, (np.ndarray, np.number, np.bool_)):
Expand All @@ -297,6 +258,7 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
v = ""
if v == "DIMENSION_SCALE":
continue

attrs[n] = v
return attrs

Expand Down Expand Up @@ -328,19 +290,21 @@ def _dataset_to_variable(
codecs = codecs_from_dataset(dataset)
cfcodec = cfcodec_from_dataset(dataset)
attrs = HDFVirtualBackend._extract_attrs(dataset)
cf_fill_value = HDFVirtualBackend._extract_cf_fill_value(dataset)
attrs.pop("_FillValue", None)

if cfcodec:
codecs.insert(0, cfcodec["codec"])
dtype = cfcodec["target_dtype"]
attrs.pop("scale_factor", None)
attrs.pop("add_offset", None)
fill_value = cfcodec["codec"].decode(dataset.fillvalue)
else:
dtype = dataset.dtype

fill_value = dataset.fillvalue.item()

fill_value = dataset.fillvalue
if isinstance(fill_value, np.ndarray):
fill_value = fill_value[0]
if np.isnan(fill_value):
fill_value = float("nan")
if isinstance(fill_value, np.generic):
fill_value = fill_value.item()
filters = [codec.get_config() for codec in codecs]
zarray = ZArray(
chunks=chunks, # type: ignore
Expand All @@ -359,8 +323,6 @@ def _dataset_to_variable(
variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
else:
variable = xr.Variable(data=np.empty(dataset.shape), dims=dims, attrs=attrs)
if cf_fill_value is not None:
variable.encoding["_FillValue"] = cf_fill_value
return variable

@staticmethod
Expand Down
61 changes: 0 additions & 61 deletions virtualizarr/tests/test_readers/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,64 +342,3 @@ def non_coord_dim(tmpdir):
ds = ds.drop_dims("dim3")
ds.to_netcdf(filepath, engine="netcdf4")
return filepath


@pytest.fixture
def scalar_fill_value_hdf5_file(tmpdir):
    """
    Write an HDF5 file holding one chunked dataset with a scalar fill value.

    The dataset is named ``data``, contains five random integers in
    ``[0, 10)`` and is created with ``fillvalue=42``.

    Returns
    -------
    str
        Path of the file that was written.
    """
    filepath = f"{tmpdir}/scalar_fill_value.nc"
    data = np.random.randint(0, 10, size=(5))
    fill_value = 42
    # Close the handle before returning so the contents are flushed to disk
    # and the test can safely reopen the same path.
    with h5py.File(filepath, "w") as f:
        f.create_dataset(name="data", data=data, chunks=True, fillvalue=fill_value)
    return filepath


# Structured (compound) dtype: a 4-byte integer followed by a 4-byte float.
compound_dtype = np.dtype([("id", "i4"), ("temperature", "f4")])

# Two sample records stored with the compound dtype.
compound_data = np.array([(1, 98.6), (2, 101.3)], dtype=compound_dtype)

# One fill entry per field of the compound dtype.
compound_fill = (-9999, -9999.0)

# Parametrization cases: each entry pairs a ``_FillValue`` attribute value
# with the array written to the dataset that carries it.
fill_values = [
    dict(fill_value=-9999, data=np.random.randint(0, 10, size=(5))),
    dict(fill_value=-9999.0, data=np.random.random(5)),
    dict(fill_value=np.nan, data=np.random.random(5)),
    dict(fill_value=False, data=np.array([True, False, False, True, True])),
    dict(fill_value="NaN", data=np.array(["three"], dtype="S10")),
    dict(fill_value=compound_fill, data=compound_data),
]


@pytest.fixture(params=fill_values)
def cf_fill_value_hdf5_file(tmpdir, request):
    """
    Write an HDF5 file whose ``data`` dataset carries a ``_FillValue`` attribute.

    Parametrized over ``fill_values``: each case supplies the array to store
    and the attribute value to attach. A second dataset, ``dim_scale``, is
    written from the same array, turned into a dimension scale and attached
    to the first dimension of ``data``.

    Returns
    -------
    str
        Path of the file that was written.
    """
    filepath = f"{tmpdir}/cf_fill_value.nc"
    case = request.param
    # Close the handle before returning so the contents are flushed to disk
    # and the test can safely reopen the same path.
    with h5py.File(filepath, "w") as f:
        dset = f.create_dataset(name="data", data=case["data"], chunks=True)
        dim_scale = f.create_dataset(
            name="dim_scale", data=case["data"], chunks=True
        )
        dim_scale.make_scale()
        dset.dims[0].attach_scale(dim_scale)
        dset.attrs["_FillValue"] = case["fill_value"]
    return filepath


@pytest.fixture
def cf_array_fill_value_hdf5_file(tmpdir):
    """
    Write an HDF5 file whose ``_FillValue`` attribute is a one-element array.

    The ``data`` dataset holds five random floats and its ``_FillValue``
    attribute is set to ``np.array([np.nan])`` rather than a scalar.

    Returns
    -------
    str
        Path of the file that was written.
    """
    filepath = f"{tmpdir}/cf_array_fill_value.nc"
    data = np.random.random(5)
    # Close the handle before returning so the contents are flushed to disk
    # and the test can safely reopen the same path.
    with h5py.File(filepath, "w") as f:
        dset = f.create_dataset(name="data", data=data, chunks=True)
        dset.attrs["_FillValue"] = np.array([np.nan])
    return filepath
31 changes: 0 additions & 31 deletions virtualizarr/tests/test_readers/test_hdf/test_hdf.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from unittest.mock import patch

import h5py # type: ignore
import numpy as np
import pytest

from virtualizarr import open_virtual_dataset
Expand Down Expand Up @@ -112,36 +111,6 @@ def test_dataset_attributes(self, string_attributes_hdf5_file):
)
assert var.attrs["attribute_name"] == "attribute_name"

def test_scalar_fill_value(self, scalar_fill_value_hdf5_file):
    """The dataset's scalar fill value (42) is carried into the zarray."""
    # Open inside a context manager so the HDF5 handle is released even if
    # the conversion raises.
    with h5py.File(scalar_fill_value_hdf5_file) as f:
        ds = f["data"]
        var = HDFVirtualBackend._dataset_to_variable(
            scalar_fill_value_hdf5_file, ds, group=""
        )
    assert var.data.zarray.fill_value == 42

def test_cf_fill_value(self, cf_fill_value_hdf5_file):
    """A ``_FillValue`` attribute ends up in the variable's encoding.

    Byte-string and structured dtypes are marked as expected failures.
    """
    # Open inside a context manager so the HDF5 handle is released on every
    # exit path, including the xfail ones.
    with h5py.File(cf_fill_value_hdf5_file) as f:
        ds = f["data"]
        if ds.dtype.kind in "S":
            pytest.xfail("Investigate fixed-length binary encoding in Zarr v3")
        if ds.dtype.names:
            pytest.xfail(
                "To fix, structured dtype fill value encoding for Zarr backend"
            )
        var = HDFVirtualBackend._dataset_to_variable(
            cf_fill_value_hdf5_file, ds, group=""
        )
    assert "_FillValue" in var.encoding

def test_cf_array_fill_value(self, cf_array_fill_value_hdf5_file):
    """A one-element array ``_FillValue`` is not left as an ndarray in the encoding."""
    # Open inside a context manager so the HDF5 handle is released even if
    # the conversion raises.
    with h5py.File(cf_array_fill_value_hdf5_file) as f:
        ds = f["data"]
        var = HDFVirtualBackend._dataset_to_variable(
            cf_array_fill_value_hdf5_file, ds, group=""
        )
    assert not isinstance(var.encoding["_FillValue"], np.ndarray)


@requires_hdf5plugin
@requires_imagecodecs
Expand Down
23 changes: 1 addition & 22 deletions virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,9 @@
from virtualizarr.readers.hdf import HDFVirtualBackend
from virtualizarr.tests import (
requires_hdf5plugin,
requires_icechunk,
requires_imagecodecs,
requires_kerchunk,
)
from virtualizarr.tests.test_integration import roundtrip_as_in_memory_icechunk


@requires_kerchunk
Expand Down Expand Up @@ -55,12 +53,8 @@ def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_allclose(ds, roundtrip)
assert (
ds["temperature"].encoding["_FillValue"]
== roundtrip["temperature"].encoding["_FillValue"]
)

def test_non_coord_dim_roundtrip(self, tmpdir, non_coord_dim):
def test_non_coord_dim(self, tmpdir, non_coord_dim):
ds = xr.open_dataset(non_coord_dim)
vds = virtualizarr.open_virtual_dataset(
non_coord_dim, backend=HDFVirtualBackend
Expand All @@ -69,18 +63,3 @@ def test_non_coord_dim_roundtrip(self, tmpdir, non_coord_dim):
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_equal(ds, roundtrip)

@requires_icechunk
def test_cf_fill_value_roundtrip(self, tmpdir, cf_fill_value_hdf5_file):
    """Compare a virtual dataset round-tripped through icechunk with the file
    opened directly via the ``h5netcdf`` engine.

    Cases whose ``data`` variable has a float or object dtype are marked as
    expected failures.
    """
    # Keep the reference dataset in a context manager so its file handle is
    # closed on every exit path, including the xfail one.
    with xr.open_dataset(cf_fill_value_hdf5_file, engine="h5netcdf") as ds:
        if ds["data"].dtype in [float, object]:
            # Adjacent literals instead of a backslash continuation inside
            # the string, which would embed source indentation in the message.
            pytest.xfail(
                "To fix handle fixed-length and structured type fill value "
                "encoding in xarray zarr backend."
            )
        vds = virtualizarr.open_virtual_dataset(
            cf_fill_value_hdf5_file,
            backend=HDFVirtualBackend,
        )
        roundtrip = roundtrip_as_in_memory_icechunk(vds, tmpdir, decode_times=False)
        xrt.assert_equal(ds, roundtrip)