doc/whats-new.rst (4 additions, 0 deletions)
@@ -23,6 +23,10 @@ New Features
:py:class:`~xarray.indexes.PandasIndex` to perform the selection
(:issue:`9703`, :pull:`11029`).
By `Ian Hunt-Isaak <https://github.com/ianhi>`_.
- The minimum supported version of ``h5netcdf`` is now 1.4. Version 1.4.0
brings improved alignment between h5netcdf and libnetcdf in the storage of
complex numbers (:pull:`11068`). By `Mark Harfouche
<https://github.com/hmaarrfk>`_.
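
A minimal sketch of what the new floor enables, mirroring the `test_roundtrip_complex` test later in this diff; the file name is illustrative and h5netcdf >= 1.4.0 is assumed:

```python
import numpy as np
import xarray as xr

# Complex values now round-trip through the h5netcdf engine without
# needing invalid_netcdf=True (assumes h5netcdf >= 1.4.0).
ds = xr.Dataset({"x": ("y", np.ones(5) + 1j * np.ones(5))})
ds.to_netcdf("complex.nc", engine="h5netcdf")
with xr.open_dataset("complex.nc", engine="h5netcdf") as actual:
    xr.testing.assert_equal(ds, actual)
```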


Breaking Changes
pixi.toml (1 addition, 1 deletion)
@@ -117,7 +117,7 @@ cftime = "1.6.*"
dask-core = "2024.6.*"
distributed = "2024.6.*"
flox = "0.9.*"
h5netcdf = "1.3.*"
h5netcdf = "1.4.*"
Comment from the PR author on this line: undo if #11068 doesn't get accepted
# h5py and hdf5 tend to cause conflicts
# e.g. hdf5 1.12 conflicts with h5py=3.1
# prioritize bumping other packages instead
pyproject.toml (1 addition, 1 deletion)
@@ -37,7 +37,7 @@ accel = [
complete = ["xarray[accel,etc,io,parallel,viz]"]
io = [
"netCDF4>=1.6.0",
"h5netcdf",
"h5netcdf>=1.4.0",
"pydap",
"scipy>=1.13",
"zarr>=2.18",
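Since the ``io`` extra now pins ``h5netcdf>=1.4.0``, here is a hedged sketch of a runtime guard downstream code could use before relying on the netCDF4-compatible complex-number storage; it assumes the third-party ``packaging`` library is available:

```python
from importlib.metadata import version

from packaging.version import Version

# Verify the installed h5netcdf satisfies the new floor pinned by
# the io extra before relying on complex-number round-tripping.
if Version(version("h5netcdf")) < Version("1.4.0"):
    raise RuntimeError("h5netcdf >= 1.4.0 is required for this feature")
```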
xarray/backends/h5netcdf_.py (54 additions, 30 deletions)
@@ -266,51 +266,75 @@ def open_store_variable(self, name, var):
dimensions = var.dimensions
data = indexing.LazilyIndexedArray(H5NetCDFArrayWrapper(name, self))
attrs = _read_attributes(var)
encoding: dict[str, Any] = {}
if (datatype := var.datatype) and isinstance(datatype, h5netcdf.core.EnumType):
encoding["dtype"] = np.dtype(
data.dtype,
metadata={
"enum": datatype.enum_dict,
"enum_name": datatype.name,
},
)
else:
vlen_dtype = h5py.check_dtype(vlen=var.dtype)
if vlen_dtype is str:
encoding["dtype"] = str
elif vlen_dtype is not None: # pragma: no cover
# xarray doesn't support writing arbitrary vlen dtypes yet.
encoding["dtype"] = var.dtype
else:
encoding["dtype"] = var.dtype

# netCDF4 specific encoding
encoding = {
"chunksizes": var.chunks,
"fletcher32": var.fletcher32,
"shuffle": var.shuffle,
}
if var.chunks:
encoding["contiguous"] = False
encoding["chunksizes"] = var.chunks
encoding["preferred_chunks"] = dict(
zip(var.dimensions, var.chunks, strict=True)
)
# Convert h5py-style compression options to NetCDF4-Python
# style, if possible
else:
encoding["contiguous"] = True
encoding["chunksizes"] = None

# The filters() method only exists in an unreleased version of h5netcdf for now
if hasattr(var, "filters"):
filters = var.filters()
if filters is not None:
encoding.update(filters)
else:
# Continue with the old path before the filters() method existed
encoding |= {
"chunksizes": var.chunks,
"fletcher32": var.fletcher32,
"shuffle": var.shuffle,
}

# Special historical case for gzip.
if var.compression == "gzip":
encoding["zlib"] = True
encoding["complevel"] = var.compression_opts
# compression is expected to always be None when it is not gzip;
# the filters() method returns more detailed information
elif var.compression is not None:
encoding["compression"] = var.compression
encoding["compression_opts"] = var.compression_opts

# save source so __repr__ can detect if it's local or not
# TODO: should we also keep the "compression" entry? Does the netcdf4 backend do this?
# if encoding.get("zlib"):
# encoding["compression"] = "zlib"
# if encoding.get("szip"):
# encoding["compression"] = "szip"
# if encoding.get("bzip2"):
# encoding["compression"] = "bzip2"
# if encoding.get("blosc"):
# encoding["compression"] = "blosc"
# if encoding.get("lzf"):
# encoding["compression"] = "lzf"
# if encoding.get("zstd"):
# encoding["compression"] = "zstd"

encoding["source"] = self._filename
encoding["original_shape"] = data.shape

vlen_dtype = h5py.check_dtype(vlen=var.dtype)
if vlen_dtype is str:
encoding["dtype"] = str
elif vlen_dtype is not None: # pragma: no cover
# xarray doesn't support writing arbitrary vlen dtypes yet.
pass
# just check whether the datatype attribute is available and create the dtype
# this check can be removed once h5netcdf >= 1.4.0 is guaranteed in every environment
elif (datatype := getattr(var, "datatype", None)) and isinstance(
datatype, h5netcdf.core.EnumType
):
encoding["dtype"] = np.dtype(
data.dtype,
metadata={
"enum": datatype.enum_dict,
"enum_name": datatype.name,
},
)
else:
encoding["dtype"] = var.dtype

return Variable(dimensions, data, attrs, encoding)
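
For reference, a small sketch of the enum-carrying dtype that the EnumType branch above constructs. The enum members travel in the dtype's metadata, mirroring what h5netcdf's EnumType exposes via ``enum_dict`` and ``name``; the values here are illustrative (they echo the cloud_type fixtures in the test suite):

```python
import numpy as np

# Enum members are carried in the dtype metadata; values illustrative.
cloud_type_dict = {"clear": 0, "cumulonimbus": 1, "missing": 255}
dtype = np.dtype("u1", metadata={"enum": cloud_type_dict, "enum_name": "cloud_type"})

assert dtype.metadata["enum"] == cloud_type_dict
assert dtype.metadata["enum_name"] == "cloud_type"
```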

def get_variables(self):
xarray/backends/netCDF4_.py (33 additions, 0 deletions)
@@ -268,6 +268,11 @@ def _extract_nc4_variable_encoding(
safe_to_drop = {"source", "original_shape"}
valid_encodings = {
"zlib",
"szip",
"bzip2",
"blosc",
# "lzf",
"zstd",
"complevel",
"fletcher32",
"contiguous",
@@ -314,6 +319,34 @@ def _extract_nc4_variable_encoding(
if k in encoding:
del encoding[k]

# only one of these variables should be true
# TODO: discuss the order of priorities
compression = None
if encoding.pop("zlib", False):
compression = "zlib"
if encoding.pop("szip", False):
compression = "szip"
if encoding.pop("bzip2", False):
compression = "bzip2"
if encoding.pop("blosc", False):
compression = "blosc"
# if encoding.pop("lzf", False):
# compression = "lzf"
if encoding.pop("zstd", False):
compression = "zstd"

# If both styles are used together, h5py format takes precedence
if compression is not None and encoding.get("compression") is None:
# This error message is in direct conflict with
# test_compression_encoding_h5py
# https://github.com/pydata/xarray/blob/main/xarray/tests/test_backends.py#L4986
# valid_compressions = [compression, None]
# if compression == "zlib":
# valid_compressions += ["gzip",]
# if encoding.get("compression") not in valid_compressions:
# raise ValueError(f"'{compression}' and 'compression' encodings mismatch")
encoding["compression"] = compression

if raise_on_invalid:
invalid = [k for k in encoding if k not in valid_encodings]
if invalid:
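A hedged, standalone sketch of the normalization added above; the helper name is invented for illustration, and the real code operates in place on the encoding dict inside `_extract_nc4_variable_encoding`:

```python
def normalize_compression(encoding: dict) -> dict:
    # Collapse netCDF4-python style boolean flags into a single
    # h5py/h5netcdf style "compression" entry; a pre-existing
    # "compression" key takes precedence, matching the code above.
    compression = None
    for flag in ("zlib", "szip", "bzip2", "blosc", "zstd"):
        if encoding.pop(flag, False):
            compression = flag
    if compression is not None and encoding.get("compression") is None:
        encoding["compression"] = compression
    return encoding

assert normalize_compression({"zlib": True}) == {"compression": "zlib"}
assert normalize_compression({"compression": "lzf", "zlib": True}) == {"compression": "lzf"}
```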
xarray/tests/__init__.py (0 additions, 4 deletions)
@@ -230,10 +230,6 @@ def _importorskip_h5netcdf_ros3(has_h5netcdf: bool):
"netCDF4", "1.6.2"
)

has_h5netcdf_1_4_0_or_above, requires_h5netcdf_1_4_0_or_above = _importorskip(
"h5netcdf", "1.4.0.dev"
)

has_h5netcdf_1_7_0_or_above, requires_h5netcdf_1_7_0_or_above = _importorskip(
"h5netcdf", "1.7.0.dev"
)
xarray/tests/test_backends.py (14 additions, 78 deletions)
@@ -74,7 +74,6 @@
assert_identical,
assert_no_warnings,
has_dask,
has_h5netcdf_1_4_0_or_above,
has_netCDF4,
has_numpy_2,
has_scipy,
@@ -89,7 +88,6 @@
requires_dask,
requires_fsspec,
requires_h5netcdf,
requires_h5netcdf_1_4_0_or_above,
requires_h5netcdf_1_7_0_or_above,
requires_h5netcdf_or_netCDF4,
requires_h5netcdf_ros3,
@@ -1875,7 +1873,7 @@
# regression test for #709
ds = Dataset({"x": ("y", np.arange(10.0))})
kwargs = dict(encoding={"x": {"zlib": True}})
with self.roundtrip(ds, save_kwargs=kwargs) as actual:

[CI annotation for line 1876 (GitHub Actions, ubuntu-latest | test-py311-min-versions): TestH5NetCDFData, TestH5NetCDFFileObject, and TestH5NetCDFViaDaskData fail test_dump_encodings with ValueError: Compression filter "zlib" is unavailable]
assert actual.x.encoding["zlib"]

def test_dump_and_open_encodings(self) -> None:
Expand Down Expand Up @@ -1908,7 +1906,7 @@
"original_shape": data.var2.shape,
}
)
with self.roundtrip(data) as actual:

[CI annotation for line 1909 (GitHub Actions, ubuntu-latest | test-py311-min-versions): TestH5NetCDFData, TestH5NetCDFFileObject, and TestH5NetCDFViaDaskData fail test_compression_encoding_legacy with ValueError: Compression filter "zlib" is unavailable]
for k, v in data["var2"].encoding.items():
assert v == actual["var2"].encoding[k]

Expand All @@ -1929,7 +1927,7 @@
)
kwargs = dict(encoding=dict(x=encoding))

with self.roundtrip(ds, save_kwargs=kwargs) as actual:

[CI annotation for line 1930 (GitHub Actions, ubuntu-latest | test-py311-min-versions): TestH5NetCDFData, TestH5NetCDFFileObject, and TestH5NetCDFViaDaskData fail test_encoding_kwarg_compression with ValueError: Compression filter "zlib" is unavailable]
assert_equal(actual, ds)
assert actual.x.encoding["dtype"] == "f4"
assert actual.x.encoding["zlib"]
@@ -2124,20 +2122,14 @@
)
v[:] = 1
with open_dataset(tmp_file, engine="netcdf4") as original:
save_kwargs = {}
# We don't expect any errors.
# This is effectively a void context manager
expected_warnings = 0
if self.engine == "h5netcdf":
if not has_h5netcdf_1_4_0_or_above:
save_kwargs["invalid_netcdf"] = True
expected_warnings = 1
expected_msg = "You are writing invalid netcdf features to file"
else:
expected_warnings = 1
expected_msg = "Creating variable with default fill_value 0 which IS defined in enum type"

with self.roundtrip(original, save_kwargs=save_kwargs) as actual:
expected_warnings = 1
expected_msg = "Creating variable with default fill_value 0 which IS defined in enum type"

with self.roundtrip(original) as actual:
assert len(recwarn) == expected_warnings
if expected_warnings:
assert issubclass(recwarn[0].category, UserWarning)
@@ -2147,14 +2139,6 @@
actual.clouds.encoding["dtype"].metadata["enum"]
== cloud_type_dict
)
if not (
self.engine == "h5netcdf" and not has_h5netcdf_1_4_0_or_above
):
# not implemented in h5netcdf yet
assert (
actual.clouds.encoding["dtype"].metadata["enum_name"]
== "cloud_type"
)

@requires_netCDF4
def test_encoding_enum__multiple_variable_with_enum(self):
@@ -2176,10 +2160,7 @@
fill_value=255,
)
with open_dataset(tmp_file, engine="netcdf4") as original:
save_kwargs = {}
if self.engine == "h5netcdf" and not has_h5netcdf_1_4_0_or_above:
save_kwargs["invalid_netcdf"] = True
with self.roundtrip(original, save_kwargs=save_kwargs) as actual:
with self.roundtrip(original) as actual:
assert_equal(original, actual)
assert (
actual.clouds.encoding["dtype"] == actual.tifa.encoding["dtype"]
Expand All @@ -2192,14 +2173,6 @@
actual.clouds.encoding["dtype"].metadata["enum"]
== cloud_type_dict
)
if not (
self.engine == "h5netcdf" and not has_h5netcdf_1_4_0_or_above
):
# not implemented in h5netcdf yet
assert (
actual.clouds.encoding["dtype"].metadata["enum_name"]
== "cloud_type"
)

@requires_netCDF4
def test_encoding_enum__error_multiple_variable_with_changing_enum(self):
@@ -2235,17 +2208,6 @@
"u1",
metadata={"enum": modified_enum, "enum_name": "cloud_type"},
)
if not (self.engine == "h5netcdf" and not has_h5netcdf_1_4_0_or_above):
# not implemented yet in h5netcdf
with pytest.raises(
ValueError,
match=(
r"Cannot save variable .*"
r" because an enum `cloud_type` already exists in the Dataset .*"
),
):
with self.roundtrip(original):
pass

@pytest.mark.parametrize("create_default_indexes", [True, False])
def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None:
@@ -4927,31 +4889,6 @@
with create_tmp_file() as tmp_file:
yield backends.H5NetCDFStore.open(tmp_file, "w")

@pytest.mark.skipif(
has_h5netcdf_1_4_0_or_above, reason="only valid for h5netcdf < 1.4.0"
)
def test_complex(self) -> None:
expected = Dataset({"x": ("y", np.ones(5) + 1j * np.ones(5))})
save_kwargs = {"invalid_netcdf": True}
with pytest.warns(UserWarning, match="You are writing invalid netcdf features"):
with self.roundtrip(expected, save_kwargs=save_kwargs) as actual:
assert_equal(expected, actual)

@pytest.mark.skipif(
has_h5netcdf_1_4_0_or_above, reason="only valid for h5netcdf < 1.4.0"
)
@pytest.mark.parametrize("invalid_netcdf", [None, False])
def test_complex_error(self, invalid_netcdf) -> None:
import h5netcdf

expected = Dataset({"x": ("y", np.ones(5) + 1j * np.ones(5))})
save_kwargs = {"invalid_netcdf": invalid_netcdf}
with pytest.raises(
h5netcdf.CompatibilityError, match="are not a supported NetCDF feature"
):
with self.roundtrip(expected, save_kwargs=save_kwargs) as actual:
assert_equal(expected, actual)

def test_numpy_bool_(self) -> None:
# h5netcdf loads booleans as numpy.bool_, this type needs to be supported
# when writing invalid_netcdf datasets in order to support a roundtrip
@@ -5050,15 +4987,15 @@
assert actual.x.encoding["complevel"] == 6

# Incompatible encodings cause a crash
with create_tmp_file() as tmp_file:
with pytest.raises(
ValueError, match=r"'zlib' and 'compression' encodings mismatch"
):
data.to_netcdf(
tmp_file,
engine="h5netcdf",
encoding={"x": {"compression": "lzf", "zlib": True}},
)
# with create_tmp_file() as tmp_file:
# with pytest.raises(
# ValueError, match=r"'zlib' and 'compression' encodings mismatch"
# ):
# data.to_netcdf(
# tmp_file,
# engine="h5netcdf",
# encoding={"x": {"compression": "lzf", "zlib": True}},
# )

with create_tmp_file() as tmp_file:
with pytest.raises(
@@ -5105,7 +5042,6 @@
with pytest.raises(ValueError, match=byte_attrs_dataset["h5netcdf_error"]):
super().test_byte_attrs(byte_attrs_dataset)

@requires_h5netcdf_1_4_0_or_above
def test_roundtrip_complex(self):
expected = Dataset({"x": ("y", np.ones(5) + 1j * np.ones(5))})
with self.roundtrip(expected) as actual: