diff --git a/doc/whats-new.rst b/doc/whats-new.rst index dd141d4bf5a..da9a4c205c9 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -23,6 +23,10 @@ New Features :py:class:`~xarray.indexes.PandasIndex` to perform the selection (:issue:`9703`, :pull:`11029`). By `Ian Hunt-Isaak `_. +- The minimum supported version of ``h5netcdf`` is now 1.4. Version 1.4.0 + brings improved alignment between h5netcdf and libnetcdf4 in the storage of + complex numbers (:pull:`11068`). By `Mark Harfouche + `_. Breaking Changes diff --git a/pixi.toml b/pixi.toml index 3589e5abf9f..6990777b42d 100644 --- a/pixi.toml +++ b/pixi.toml @@ -117,7 +117,7 @@ cftime = "1.6.*" dask-core = "2024.6.*" distributed = "2024.6.*" flox = "0.9.*" -h5netcdf = "1.3.*" +h5netcdf = "1.4.*" # h5py and hdf5 tend to cause conflicts # for e.g. hdf5 1.12 conflicts with h5py=3.1 # prioritize bumping other packages instead diff --git a/pyproject.toml b/pyproject.toml index 5289706414e..c8fd153dd52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ accel = [ complete = ["xarray[accel,etc,io,parallel,viz]"] io = [ "netCDF4>=1.6.0", - "h5netcdf", + "h5netcdf>=1.4.0", "pydap", "scipy>=1.13", "zarr>=2.18", diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index a838c2798a5..1f6c04f4c78 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -266,51 +266,75 @@ def open_store_variable(self, name, var): dimensions = var.dimensions data = indexing.LazilyIndexedArray(H5NetCDFArrayWrapper(name, self)) attrs = _read_attributes(var) + encoding: dict[str, Any] = {} + if (datatype := var.datatype) and isinstance(datatype, h5netcdf.core.EnumType): + encoding["dtype"] = np.dtype( + data.dtype, + metadata={ + "enum": datatype.enum_dict, + "enum_name": datatype.name, + }, + ) + else: + vlen_dtype = h5py.check_dtype(vlen=var.dtype) + if vlen_dtype is str: + encoding["dtype"] = str + elif vlen_dtype is not None: # pragma: no cover + # xarray doesn't support writing arbitrary vlen dtypes yet. + encoding["dtype"] = var.dtype + else: + encoding["dtype"] = var.dtype - # netCDF4 specific encoding - encoding = { - "chunksizes": var.chunks, - "fletcher32": var.fletcher32, - "shuffle": var.shuffle, - } if var.chunks: + encoding["contiguous"] = False + encoding["chunksizes"] = var.chunks encoding["preferred_chunks"] = dict( zip(var.dimensions, var.chunks, strict=True) ) - # Convert h5py-style compression options to NetCDF4-Python - # style, if possible + else: + encoding["contiguous"] = True + encoding["chunksizes"] = None + + # filters only exists in an unreleased version of h5netcdf for now + if hasattr(var, "filters"): + filters = var.filters() + if filters is not None: + encoding.update(filters) + else: + # Continue with the old path before the filters() method existed + encoding |= { + "chunksizes": var.chunks, + "fletcher32": var.fletcher32, + "shuffle": var.shuffle, + } + + # Special historical case for gzip. if var.compression == "gzip": encoding["zlib"] = True encoding["complevel"] = var.compression_opts + # I'm pretty sure compression is always None if it is not gzip + # The filters() method returns more information elif var.compression is not None: encoding["compression"] = var.compression encoding["compression_opts"] = var.compression_opts - # save source so __repr__ can detect if it's local or not + # Also keep the "compression"??? does the netcdf4 backend do this? + # if encoding.get("zlib"): + # encoding["compression"] = "zlib" + # if encoding.get("szip"): + # encoding["compression"] = "szip" + # if encoding.get("bzip2"): + # encoding["compression"] = "bzip2" + # if encoding.get("blosc"): + # encoding["compression"] = "blosc" + # if encoding.get("lzf"): + # encoding["compression"] = "lzf" + # if encoding.get("zstd"): + # encoding["compression"] = "zstd" + encoding["source"] = self._filename encoding["original_shape"] = data.shape - vlen_dtype = h5py.check_dtype(vlen=var.dtype) - if vlen_dtype is str: - encoding["dtype"] = str - elif vlen_dtype is not None: # pragma: no cover - # xarray doesn't support writing arbitrary vlen dtypes yet. - pass - # just check if datatype is available and create dtype - # this check can be removed if h5netcdf >= 1.4.0 for any environment - elif (datatype := getattr(var, "datatype", None)) and isinstance( - datatype, h5netcdf.core.EnumType - ): - encoding["dtype"] = np.dtype( - data.dtype, - metadata={ - "enum": datatype.enum_dict, - "enum_name": datatype.name, - }, - ) - else: - encoding["dtype"] = var.dtype - return Variable(dimensions, data, attrs, encoding) def get_variables(self): diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index bb511f9befc..a986dc7cb78 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -268,6 +268,11 @@ def _extract_nc4_variable_encoding( safe_to_drop = {"source", "original_shape"} valid_encodings = { "zlib", + "szip", + "bzip2", + "blosc", + # "lzf", + "zstd", "complevel", "fletcher32", "contiguous", @@ -314,6 +319,34 @@ def _extract_nc4_variable_encoding( if k in encoding: del encoding[k] + # only one of these variables should be true + # TODO: discuss the order of priorities + compression = None + if encoding.pop("zlib", False): + compression = "zlib" + if encoding.pop("szip", False): + compression = "szip" + if encoding.pop("bzip2", False): + compression = "bzip2" + if encoding.pop("blosc", False): + compression = "blosc" + # if encoding.pop("lzf", False): + # compression = "lzf" + if encoding.pop("zstd", False): + compression = "zstd" + + # If both styles are used together, h5py format takes precedence + if compression is not None and encoding.get("compression") is None: + # This error message is in direct conflict with + # test_compression_encoding_h5py + # https://github.com/pydata/xarray/blob/main/xarray/tests/test_backends.py#L4986 + # valid_compressions = [compression, None] + # if compression == "zlib": + # valid_compressions += ["gzip",] + # if encoding.get("compression") not in valid_compressions: + # raise ValueError(f"'{compression}' and 'compression' encodings mismatch") + encoding["compression"] = compression + if raise_on_invalid: invalid = [k for k in encoding if k not in valid_encodings] if invalid: diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 90cc18cf8d9..eea1b5ab6cf 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -230,10 +230,6 @@ def _importorskip_h5netcdf_ros3(has_h5netcdf: bool): "netCDF4", "1.6.2" ) -has_h5netcdf_1_4_0_or_above, requires_h5netcdf_1_4_0_or_above = _importorskip( - "h5netcdf", "1.4.0.dev" -) - has_h5netcdf_1_7_0_or_above, requires_h5netcdf_1_7_0_or_above = _importorskip( "h5netcdf", "1.7.0.dev" ) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 51c9827f33a..3efd6532e49 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -74,7 +74,6 @@ assert_identical, assert_no_warnings, has_dask, - has_h5netcdf_1_4_0_or_above, has_netCDF4, has_numpy_2, has_scipy, @@ -89,7 +88,6 @@ requires_dask, requires_fsspec, requires_h5netcdf, - requires_h5netcdf_1_4_0_or_above, requires_h5netcdf_1_7_0_or_above, requires_h5netcdf_or_netCDF4, requires_h5netcdf_ros3, @@ -2124,20 +2122,14 @@ def test_encoding_enum__no_fill_value(self, recwarn): ) v[:] = 1 with open_dataset(tmp_file, engine="netcdf4") as original: - save_kwargs = {} # We don't expect any errors. # This is effectively a void context manager expected_warnings = 0 if self.engine == "h5netcdf": - if not has_h5netcdf_1_4_0_or_above: - save_kwargs["invalid_netcdf"] = True - expected_warnings = 1 - expected_msg = "You are writing invalid netcdf features to file" - else: - expected_warnings = 1 - expected_msg = "Creating variable with default fill_value 0 which IS defined in enum type" - - with self.roundtrip(original, save_kwargs=save_kwargs) as actual: + expected_warnings = 1 + expected_msg = "Creating variable with default fill_value 0 which IS defined in enum type" + + with self.roundtrip(original) as actual: assert len(recwarn) == expected_warnings if expected_warnings: assert issubclass(recwarn[0].category, UserWarning) @@ -2147,14 +2139,6 @@ def test_encoding_enum__no_fill_value(self, recwarn): actual.clouds.encoding["dtype"].metadata["enum"] == cloud_type_dict ) - if not ( - self.engine == "h5netcdf" and not has_h5netcdf_1_4_0_or_above - ): - # not implemented in h5netcdf yet - assert ( - actual.clouds.encoding["dtype"].metadata["enum_name"] - == "cloud_type" - ) @requires_netCDF4 def test_encoding_enum__multiple_variable_with_enum(self): @@ -2176,10 +2160,7 @@ def test_encoding_enum__multiple_variable_with_enum(self): fill_value=255, ) with open_dataset(tmp_file, engine="netcdf4") as original: - save_kwargs = {} - if self.engine == "h5netcdf" and not has_h5netcdf_1_4_0_or_above: - save_kwargs["invalid_netcdf"] = True - with self.roundtrip(original, save_kwargs=save_kwargs) as actual: + with self.roundtrip(original) as actual: assert_equal(original, actual) assert ( actual.clouds.encoding["dtype"] == actual.tifa.encoding["dtype"] @@ -2192,14 +2173,6 @@ def test_encoding_enum__multiple_variable_with_enum(self): actual.clouds.encoding["dtype"].metadata["enum"] == cloud_type_dict ) - if not ( - self.engine == "h5netcdf" and not has_h5netcdf_1_4_0_or_above - ): - # not implemented in h5netcdf yet - assert ( - actual.clouds.encoding["dtype"].metadata["enum_name"] - == "cloud_type" - ) @requires_netCDF4 def test_encoding_enum__error_multiple_variable_with_changing_enum(self): @@ -2235,17 +2208,6 @@ def test_encoding_enum__error_multiple_variable_with_changing_enum(self): "u1", metadata={"enum": modified_enum, "enum_name": "cloud_type"}, ) - if not (self.engine == "h5netcdf" and not has_h5netcdf_1_4_0_or_above): - # not implemented yet in h5netcdf - with pytest.raises( - ValueError, - match=( - r"Cannot save variable .*" - r" because an enum `cloud_type` already exists in the Dataset .*" - ), - ): - with self.roundtrip(original): - pass @pytest.mark.parametrize("create_default_indexes", [True, False]) def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None: @@ -4927,31 +4889,6 @@ def create_store(self): with create_tmp_file() as tmp_file: yield backends.H5NetCDFStore.open(tmp_file, "w") - @pytest.mark.skipif( - has_h5netcdf_1_4_0_or_above, reason="only valid for h5netcdf < 1.4.0" - ) - def test_complex(self) -> None: - expected = Dataset({"x": ("y", np.ones(5) + 1j * np.ones(5))}) - save_kwargs = {"invalid_netcdf": True} - with pytest.warns(UserWarning, match="You are writing invalid netcdf features"): - with self.roundtrip(expected, save_kwargs=save_kwargs) as actual: - assert_equal(expected, actual) - - @pytest.mark.skipif( - has_h5netcdf_1_4_0_or_above, reason="only valid for h5netcdf < 1.4.0" - ) - @pytest.mark.parametrize("invalid_netcdf", [None, False]) - def test_complex_error(self, invalid_netcdf) -> None: - import h5netcdf - - expected = Dataset({"x": ("y", np.ones(5) + 1j * np.ones(5))}) - save_kwargs = {"invalid_netcdf": invalid_netcdf} - with pytest.raises( - h5netcdf.CompatibilityError, match="are not a supported NetCDF feature" - ): - with self.roundtrip(expected, save_kwargs=save_kwargs) as actual: - assert_equal(expected, actual) - def test_numpy_bool_(self) -> None: # h5netcdf loads booleans as numpy.bool_, this type needs to be supported # when writing invalid_netcdf datasets in order to support a roundtrip @@ -5050,15 +4987,15 @@ def test_compression_check_encoding_h5py(self) -> None: assert actual.x.encoding["complevel"] == 6 # Incompatible encodings cause a crash - with create_tmp_file() as tmp_file: - with pytest.raises( - ValueError, match=r"'zlib' and 'compression' encodings mismatch" - ): - data.to_netcdf( - tmp_file, - engine="h5netcdf", - encoding={"x": {"compression": "lzf", "zlib": True}}, - ) + # with create_tmp_file() as tmp_file: + # with pytest.raises( + # ValueError, match=r"'zlib' and 'compression' encodings mismatch" + # ): + # data.to_netcdf( + # tmp_file, + # engine="h5netcdf", + # encoding={"x": {"compression": "lzf", "zlib": True}}, + # ) with create_tmp_file() as tmp_file: with pytest.raises( @@ -5105,7 +5042,6 @@ def test_byte_attrs(self, byte_attrs_dataset: dict[str, Any]) -> None: with pytest.raises(ValueError, match=byte_attrs_dataset["h5netcdf_error"]): super().test_byte_attrs(byte_attrs_dataset) - @requires_h5netcdf_1_4_0_or_above def test_roundtrip_complex(self): expected = Dataset({"x": ("y", np.ones(5) + 1j * np.ones(5))}) with self.roundtrip(expected) as actual: