doc/whats-new.rst (4 additions, 0 deletions)
@@ -23,6 +23,10 @@ New Features
:py:class:`~xarray.indexes.PandasIndex` to perform the selection
(:issue:`9703`, :pull:`11029`).
By `Ian Hunt-Isaak <https://github.com/ianhi>`_.
- The minimum supported version of ``h5netcdf`` is now 1.4. Version 1.4.0
brings improved alignment between h5netcdf and libnetcdf in the storage of
complex numbers (:pull:`11068`). By `Mark Harfouche
<https://github.com/hmaarrfk>`_.
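
A minimal sketch of what the new floor enables, mirroring the `test_roundtrip_complex` test later in this diff; the file name is illustrative and h5netcdf >= 1.4.0 is assumed:

```python
import numpy as np
import xarray as xr

# Complex values now round-trip through the h5netcdf engine without
# needing invalid_netcdf=True (assumes h5netcdf >= 1.4.0).
ds = xr.Dataset({"x": ("y", np.ones(5) + 1j * np.ones(5))})
ds.to_netcdf("complex.nc", engine="h5netcdf")
with xr.open_dataset("complex.nc", engine="h5netcdf") as actual:
    xr.testing.assert_equal(ds, actual)
```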


Breaking Changes
pixi.toml (1 addition, 1 deletion)
@@ -117,7 +117,7 @@ cftime = "1.6.*"
dask-core = "2024.6.*"
distributed = "2024.6.*"
flox = "0.9.*"
h5netcdf = "1.3.*"
h5netcdf = "1.4.*"
Comment from the PR author on this line: undo if #11068 doesn't get accepted
# h5py and hdf5 tend to cause conflicts
# e.g. hdf5 1.12 conflicts with h5py=3.1
# prioritize bumping other packages instead
pyproject.toml (1 addition, 1 deletion)
@@ -37,7 +37,7 @@ accel = [
complete = ["xarray[accel,etc,io,parallel,viz]"]
io = [
"netCDF4>=1.6.0",
"h5netcdf",
"h5netcdf>=1.4.0",
"pydap",
"scipy>=1.13",
"zarr>=2.18",
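Since the ``io`` extra now pins ``h5netcdf>=1.4.0``, here is a hedged sketch of a runtime guard downstream code could use before relying on the netCDF4-compatible complex-number storage; it assumes the third-party ``packaging`` library is available:

```python
from importlib.metadata import version

from packaging.version import Version

# Verify the installed h5netcdf satisfies the new floor pinned by
# the io extra before relying on complex-number round-tripping.
if Version(version("h5netcdf")) < Version("1.4.0"):
    raise RuntimeError("h5netcdf >= 1.4.0 is required for this feature")
```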
xarray/backends/h5netcdf_.py (54 additions, 30 deletions)
@@ -266,51 +266,75 @@ def open_store_variable(self, name, var):
dimensions = var.dimensions
data = indexing.LazilyIndexedArray(H5NetCDFArrayWrapper(name, self))
attrs = _read_attributes(var)
encoding: dict[str, Any] = {}
if (datatype := var.datatype) and isinstance(datatype, h5netcdf.core.EnumType):
encoding["dtype"] = np.dtype(
data.dtype,
metadata={
"enum": datatype.enum_dict,
"enum_name": datatype.name,
},
)
else:
vlen_dtype = h5py.check_dtype(vlen=var.dtype)
if vlen_dtype is str:
encoding["dtype"] = str
elif vlen_dtype is not None: # pragma: no cover
# xarray doesn't support writing arbitrary vlen dtypes yet.
encoding["dtype"] = var.dtype
else:
encoding["dtype"] = var.dtype

# netCDF4 specific encoding
encoding = {
"chunksizes": var.chunks,
"fletcher32": var.fletcher32,
"shuffle": var.shuffle,
}
if var.chunks:
encoding["contiguous"] = False
encoding["chunksizes"] = var.chunks
encoding["preferred_chunks"] = dict(
zip(var.dimensions, var.chunks, strict=True)
)
# Convert h5py-style compression options to NetCDF4-Python
# style, if possible
else:
encoding["contiguous"] = True
encoding["chunksizes"] = None

# The filters() method only exists in an unreleased version of h5netcdf for now
if hasattr(var, "filters"):
filters = var.filters()
if filters is not None:
encoding.update(filters)
else:
# Continue with the old path before the filters() method existed
encoding |= {
"chunksizes": var.chunks,
"fletcher32": var.fletcher32,
"shuffle": var.shuffle,
}

# Special historical case for gzip.
if var.compression == "gzip":
encoding["zlib"] = True
encoding["complevel"] = var.compression_opts
# compression is expected to always be None when it is not gzip;
# the filters() method returns more detailed information
elif var.compression is not None:
encoding["compression"] = var.compression
encoding["compression_opts"] = var.compression_opts

# save source so __repr__ can detect if it's local or not
# TODO: should we also keep the "compression" entry? Does the netcdf4 backend do this?
# if encoding.get("zlib"):
# encoding["compression"] = "zlib"
# if encoding.get("szip"):
# encoding["compression"] = "szip"
# if encoding.get("bzip2"):
# encoding["compression"] = "bzip2"
# if encoding.get("blosc"):
# encoding["compression"] = "blosc"
# if encoding.get("lzf"):
# encoding["compression"] = "lzf"
# if encoding.get("zstd"):
# encoding["compression"] = "zstd"

encoding["source"] = self._filename
encoding["original_shape"] = data.shape

vlen_dtype = h5py.check_dtype(vlen=var.dtype)
if vlen_dtype is str:
encoding["dtype"] = str
elif vlen_dtype is not None: # pragma: no cover
# xarray doesn't support writing arbitrary vlen dtypes yet.
pass
# just check whether the datatype attribute is available and create the dtype
# this check can be removed once h5netcdf >= 1.4.0 is guaranteed in every environment
elif (datatype := getattr(var, "datatype", None)) and isinstance(
datatype, h5netcdf.core.EnumType
):
encoding["dtype"] = np.dtype(
data.dtype,
metadata={
"enum": datatype.enum_dict,
"enum_name": datatype.name,
},
)
else:
encoding["dtype"] = var.dtype

return Variable(dimensions, data, attrs, encoding)
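
For reference, a small sketch of the enum-carrying dtype that the EnumType branch above constructs. The enum members travel in the dtype's metadata, mirroring what h5netcdf's EnumType exposes via ``enum_dict`` and ``name``; the values here are illustrative (they echo the cloud_type fixtures in the test suite):

```python
import numpy as np

# Enum members are carried in the dtype metadata; values illustrative.
cloud_type_dict = {"clear": 0, "cumulonimbus": 1, "missing": 255}
dtype = np.dtype("u1", metadata={"enum": cloud_type_dict, "enum_name": "cloud_type"})

assert dtype.metadata["enum"] == cloud_type_dict
assert dtype.metadata["enum_name"] == "cloud_type"
```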

def get_variables(self):
xarray/backends/netCDF4_.py (33 additions, 0 deletions)
@@ -268,6 +268,11 @@ def _extract_nc4_variable_encoding(
safe_to_drop = {"source", "original_shape"}
valid_encodings = {
"zlib",
"szip",
"bzip2",
"blosc",
# "lzf",
"zstd",
"complevel",
"fletcher32",
"contiguous",
@@ -314,6 +319,34 @@ def _extract_nc4_variable_encoding(
if k in encoding:
del encoding[k]

# only one of these variables should be true
# TODO: discuss the order of priorities
compression = None
if encoding.pop("zlib", False):
compression = "zlib"
if encoding.pop("szip", False):
compression = "szip"
if encoding.pop("bzip2", False):
compression = "bzip2"
if encoding.pop("blosc", False):
compression = "blosc"
# if encoding.pop("lzf", False):
# compression = "lzf"
if encoding.pop("zstd", False):
compression = "zstd"

# If both styles are used together, h5py format takes precedence
if compression is not None and encoding.get("compression") is None:
# This error message is in direct conflict with
# test_compression_encoding_h5py
# https://github.com/pydata/xarray/blob/main/xarray/tests/test_backends.py#L4986
# valid_compressions = [compression, None]
# if compression == "zlib":
# valid_compressions += ["gzip",]
# if encoding.get("compression") not in valid_compressions:
# raise ValueError(f"'{compression}' and 'compression' encodings mismatch")
encoding["compression"] = compression

if raise_on_invalid:
invalid = [k for k in encoding if k not in valid_encodings]
if invalid:
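A hedged, standalone sketch of the normalization added above; the helper name is invented for illustration, and the real code operates in place on the encoding dict inside `_extract_nc4_variable_encoding`:

```python
def normalize_compression(encoding: dict) -> dict:
    # Collapse netCDF4-python style boolean flags into a single
    # h5py/h5netcdf style "compression" entry; a pre-existing
    # "compression" key takes precedence, matching the code above.
    compression = None
    for flag in ("zlib", "szip", "bzip2", "blosc", "zstd"):
        if encoding.pop(flag, False):
            compression = flag
    if compression is not None and encoding.get("compression") is None:
        encoding["compression"] = compression
    return encoding

assert normalize_compression({"zlib": True}) == {"compression": "zlib"}
assert normalize_compression({"compression": "lzf", "zlib": True}) == {"compression": "lzf"}
```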
xarray/tests/__init__.py (0 additions, 4 deletions)
@@ -230,10 +230,6 @@ def _importorskip_h5netcdf_ros3(has_h5netcdf: bool):
"netCDF4", "1.6.2"
)

has_h5netcdf_1_4_0_or_above, requires_h5netcdf_1_4_0_or_above = _importorskip(
"h5netcdf", "1.4.0.dev"
)

has_h5netcdf_1_7_0_or_above, requires_h5netcdf_1_7_0_or_above = _importorskip(
"h5netcdf", "1.7.0.dev"
)
xarray/tests/test_backends.py (14 additions, 78 deletions)
@@ -74,7 +74,6 @@
assert_identical,
assert_no_warnings,
has_dask,
has_h5netcdf_1_4_0_or_above,
has_netCDF4,
has_numpy_2,
has_scipy,
@@ -89,7 +88,6 @@
requires_dask,
requires_fsspec,
requires_h5netcdf,
requires_h5netcdf_1_4_0_or_above,
requires_h5netcdf_1_7_0_or_above,
requires_h5netcdf_or_netCDF4,
requires_h5netcdf_ros3,
@@ -1875,7 +1873,7 @@
# regression test for #709
ds = Dataset({"x": ("y", np.arange(10.0))})
kwargs = dict(encoding={"x": {"zlib": True}})
with self.roundtrip(ds, save_kwargs=kwargs) as actual:

[CI annotation for line 1876 (GitHub Actions, ubuntu-latest | test-py311-min-versions): TestH5NetCDFData, TestH5NetCDFFileObject, and TestH5NetCDFViaDaskData fail test_dump_encodings with ValueError: Compression filter "zlib" is unavailable]
assert actual.x.encoding["zlib"]

def test_dump_and_open_encodings(self) -> None:
Expand Down Expand Up @@ -1908,7 +1906,7 @@
"original_shape": data.var2.shape,
}
)
with self.roundtrip(data) as actual:

[CI annotation for line 1909 (GitHub Actions, ubuntu-latest | test-py311-min-versions): TestH5NetCDFData, TestH5NetCDFFileObject, and TestH5NetCDFViaDaskData fail test_compression_encoding_legacy with ValueError: Compression filter "zlib" is unavailable]
for k, v in data["var2"].encoding.items():
assert v == actual["var2"].encoding[k]

Expand All @@ -1929,7 +1927,7 @@
)
kwargs = dict(encoding=dict(x=encoding))

with self.roundtrip(ds, save_kwargs=kwargs) as actual:

[CI annotation for line 1930 (GitHub Actions, ubuntu-latest | test-py311-min-versions): TestH5NetCDFData, TestH5NetCDFFileObject, and TestH5NetCDFViaDaskData fail test_encoding_kwarg_compression with ValueError: Compression filter "zlib" is unavailable]
assert_equal(actual, ds)
assert actual.x.encoding["dtype"] == "f4"
assert actual.x.encoding["zlib"]
@@ -2124,20 +2122,14 @@
)
v[:] = 1
with open_dataset(tmp_file, engine="netcdf4") as original:
save_kwargs = {}
# We don't expect any errors.
# This is effectively a void context manager
expected_warnings = 0
if self.engine == "h5netcdf":
if not has_h5netcdf_1_4_0_or_above:
save_kwargs["invalid_netcdf"] = True
expected_warnings = 1
expected_msg = "You are writing invalid netcdf features to file"
else:
expected_warnings = 1
expected_msg = "Creating variable with default fill_value 0 which IS defined in enum type"

with self.roundtrip(original, save_kwargs=save_kwargs) as actual:
expected_warnings = 1
expected_msg = "Creating variable with default fill_value 0 which IS defined in enum type"

with self.roundtrip(original) as actual:
assert len(recwarn) == expected_warnings
if expected_warnings:
assert issubclass(recwarn[0].category, UserWarning)
@@ -2147,14 +2139,6 @@
actual.clouds.encoding["dtype"].metadata["enum"]
== cloud_type_dict
)
if not (
self.engine == "h5netcdf" and not has_h5netcdf_1_4_0_or_above
):
# not implemented in h5netcdf yet
assert (
actual.clouds.encoding["dtype"].metadata["enum_name"]
== "cloud_type"
)

@requires_netCDF4
def test_encoding_enum__multiple_variable_with_enum(self):
@@ -2176,10 +2160,7 @@
fill_value=255,
)
with open_dataset(tmp_file, engine="netcdf4") as original:
save_kwargs = {}
if self.engine == "h5netcdf" and not has_h5netcdf_1_4_0_or_above:
save_kwargs["invalid_netcdf"] = True
with self.roundtrip(original, save_kwargs=save_kwargs) as actual:
with self.roundtrip(original) as actual:
assert_equal(original, actual)
assert (
actual.clouds.encoding["dtype"] == actual.tifa.encoding["dtype"]
Expand All @@ -2192,14 +2173,6 @@
actual.clouds.encoding["dtype"].metadata["enum"]
== cloud_type_dict
)
if not (
self.engine == "h5netcdf" and not has_h5netcdf_1_4_0_or_above
):
# not implemented in h5netcdf yet
assert (
actual.clouds.encoding["dtype"].metadata["enum_name"]
== "cloud_type"
)

@requires_netCDF4
def test_encoding_enum__error_multiple_variable_with_changing_enum(self):
@@ -2235,17 +2208,6 @@
"u1",
metadata={"enum": modified_enum, "enum_name": "cloud_type"},
)
if not (self.engine == "h5netcdf" and not has_h5netcdf_1_4_0_or_above):
# not implemented yet in h5netcdf
with pytest.raises(
ValueError,
match=(
r"Cannot save variable .*"
r" because an enum `cloud_type` already exists in the Dataset .*"
),
):
with self.roundtrip(original):
pass

@pytest.mark.parametrize("create_default_indexes", [True, False])
def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None:
@@ -4927,31 +4889,6 @@
with create_tmp_file() as tmp_file:
yield backends.H5NetCDFStore.open(tmp_file, "w")

@pytest.mark.skipif(
has_h5netcdf_1_4_0_or_above, reason="only valid for h5netcdf < 1.4.0"
)
def test_complex(self) -> None:
expected = Dataset({"x": ("y", np.ones(5) + 1j * np.ones(5))})
save_kwargs = {"invalid_netcdf": True}
with pytest.warns(UserWarning, match="You are writing invalid netcdf features"):
with self.roundtrip(expected, save_kwargs=save_kwargs) as actual:
assert_equal(expected, actual)

@pytest.mark.skipif(
has_h5netcdf_1_4_0_or_above, reason="only valid for h5netcdf < 1.4.0"
)
@pytest.mark.parametrize("invalid_netcdf", [None, False])
def test_complex_error(self, invalid_netcdf) -> None:
import h5netcdf

expected = Dataset({"x": ("y", np.ones(5) + 1j * np.ones(5))})
save_kwargs = {"invalid_netcdf": invalid_netcdf}
with pytest.raises(
h5netcdf.CompatibilityError, match="are not a supported NetCDF feature"
):
with self.roundtrip(expected, save_kwargs=save_kwargs) as actual:
assert_equal(expected, actual)

def test_numpy_bool_(self) -> None:
# h5netcdf loads booleans as numpy.bool_, this type needs to be supported
# when writing invalid_netcdf datasets in order to support a roundtrip
@@ -5050,15 +4987,15 @@
assert actual.x.encoding["complevel"] == 6

# Incompatible encodings cause a crash
with create_tmp_file() as tmp_file:
with pytest.raises(
ValueError, match=r"'zlib' and 'compression' encodings mismatch"
):
data.to_netcdf(
tmp_file,
engine="h5netcdf",
encoding={"x": {"compression": "lzf", "zlib": True}},
)
# with create_tmp_file() as tmp_file:
# with pytest.raises(
# ValueError, match=r"'zlib' and 'compression' encodings mismatch"
# ):
# data.to_netcdf(
# tmp_file,
# engine="h5netcdf",
# encoding={"x": {"compression": "lzf", "zlib": True}},
# )

with create_tmp_file() as tmp_file:
with pytest.raises(
@@ -5105,7 +5042,6 @@
with pytest.raises(ValueError, match=byte_attrs_dataset["h5netcdf_error"]):
super().test_byte_attrs(byte_attrs_dataset)

@requires_h5netcdf_1_4_0_or_above
def test_roundtrip_complex(self):
expected = Dataset({"x": ("y", np.ones(5) + 1j * np.ones(5))})
with self.roundtrip(expected) as actual: