diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index b8ffab2889f..cf9f33890c7 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -163,6 +163,8 @@ New Features
 - ``compute=False`` is now supported by :py:meth:`DataTree.to_netcdf` and
   :py:meth:`DataTree.to_zarr`.
   By `Stephan Hoyer `_.
+- The ``h5netcdf`` engine has support for pseudo ``NETCDF4_CLASSIC`` files, meaning variables and attributes are cast to supported types. Note that the saved files won't be recognized as genuine ``NETCDF4_CLASSIC`` files until ``h5netcdf`` adds support. (:issue:`10676`, :pull:`10686`).
+  By `David Huard `_.
 - ``open_dataset`` will now correctly infer a path ending in ``.zarr/`` as zarr
   By `Ian Hunt-Isaak `_.
diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py
index 28565f92de9..3c4f77c6035 100644
--- a/xarray/backends/h5netcdf_.py
+++ b/xarray/backends/h5netcdf_.py
@@ -7,6 +7,7 @@
 from typing import TYPE_CHECKING, Any, Self
 
 import numpy as np
+from packaging.version import Version
 
 from xarray.backends.common import (
     BACKEND_ENTRYPOINTS,
@@ -27,6 +28,7 @@
     PickleableFileManager,
 )
 from xarray.backends.locks import HDF5_LOCK, combine_locks, ensure_lock, get_write_lock
+from xarray.backends.netcdf3 import encode_nc3_attr_value, encode_nc3_variable
 from xarray.backends.netCDF4_ import (
     BaseNetCDF4Array,
     _build_and_get_enum,
@@ -124,6 +126,7 @@ def __init__(
         manager: FileManager | h5netcdf.File | h5netcdf.Group,
         group=None,
         mode=None,
+        format="NETCDF4",
         lock=HDF5_LOCK,
         autoclose=False,
     ):
@@ -143,7 +146,7 @@ def __init__(
         self._manager = manager
         self._group = group
         self._mode = mode
-        self.format = None
+        self.format = format or "NETCDF4"
         # todo: utilizing find_root_and_group seems a bit clunky
         # making filename available on h5netcdf.Group seems better
         self._filename = find_root_and_group(self.ds)[0].filename
@@ -152,6 +155,9 @@ def __init__(
         self.autoclose = autoclose
 
     def get_child_store(self, group: str) -> Self:
+        if self.format == "NETCDF4_CLASSIC":
+            raise ValueError("Cannot create sub-groups in `NETCDF4_CLASSIC` format.")
+
         if self._group is not None:
             group = os.path.join(self._group, group)
         return type(self)(
@@ -167,7 +173,7 @@ def open(
         cls,
         filename,
         mode="r",
-        format=None,
+        format="NETCDF4",
         group=None,
         lock=None,
         autoclose=False,
@@ -198,8 +204,8 @@ def open(
                         f"{magic_number!r} is not the signature of a valid netCDF4 file"
                     )
 
-        if format not in [None, "NETCDF4"]:
-            raise ValueError("invalid format for h5netcdf backend")
+        if format not in [None, "NETCDF4", "NETCDF4_CLASSIC"]:
+            raise ValueError(f"invalid format for h5netcdf backend: {format}")
 
         kwargs = {
             "invalid_netcdf": invalid_netcdf,
@@ -210,6 +216,8 @@ def open(
             kwargs.update(driver_kwds)
         if phony_dims is not None:
             kwargs["phony_dims"] = phony_dims
+        if format is not None and Version(h5netcdf.__version__) > Version("1.6.4"):
+            kwargs["format"] = format
 
         if lock is None:
             if mode == "r":
@@ -223,7 +231,15 @@ def open(
             else PickleableFileManager
         )
         manager = manager_cls(h5netcdf.File, filename, mode=mode, kwargs=kwargs)
-        return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose)
+
+        return cls(
+            manager,
+            group=group,
+            format=format,
+            mode=mode,
+            lock=lock,
+            autoclose=autoclose,
+        )
 
     def _acquire(self, needs_lock=True):
         with self._manager.acquire_context(needs_lock) as root:
@@ -319,11 +335,27 @@ def set_dimension(self, name, length, is_unlimited=False):
         else:
             self.ds.dimensions[name] = length
 
+    def convert_string(self, value):
+        """If format is NETCDF4_CLASSIC, convert strings to fixed width char
+        arrays to ensure they can be read by legacy software.
+
+        CLASSIC attributes are read by third party software as fixed width char arrays
+        """
+        if self.format == "NETCDF4_CLASSIC":
+            value = encode_nc3_attr_value(value)
+            if isinstance(value, bytes):
+                value = np.bytes_(value)
+        return value
+
     def set_attribute(self, key, value):
+        value = self.convert_string(value)
         self.ds.attrs[key] = value
 
     def encode_variable(self, variable, name=None):
-        return _encode_nc4_variable(variable, name=name)
+        if self.format == "NETCDF4_CLASSIC":
+            return encode_nc3_variable(variable, name=name)
+        else:
+            return _encode_nc4_variable(variable, name=name)
 
     def prepare_variable(
         self, name, variable, check_encoding=False, unlimited_dims=None
@@ -332,7 +364,9 @@ def prepare_variable(
         _ensure_no_forward_slash_in_name(name)
         attrs = variable.attrs.copy()
 
-        dtype = _get_datatype(variable, raise_on_invalid_encoding=check_encoding)
+        dtype = _get_datatype(
+            variable, nc_format=self.format, raise_on_invalid_encoding=check_encoding
+        )
 
         fillvalue = attrs.pop("_FillValue", None)
 
@@ -394,7 +428,7 @@ def prepare_variable(
             nc4_var = self.ds[name]
 
         for k, v in attrs.items():
-            nc4_var.attrs[k] = v
+            nc4_var.attrs[k] = self.convert_string(v)
 
         target = H5NetCDFArrayWrapper(name, self)
 
@@ -484,7 +518,7 @@ def open_dataset(
         drop_variables: str | Iterable[str] | None = None,
         use_cftime=None,
         decode_timedelta=None,
-        format=None,
+        format="NETCDF4",
         group=None,
         lock=None,
         invalid_netcdf=None,
@@ -544,7 +578,7 @@ def open_datatree(
         drop_variables: str | Iterable[str] | None = None,
         use_cftime=None,
         decode_timedelta=None,
-        format=None,
+        format="NETCDF4",
         group: str | None = None,
         lock=None,
         invalid_netcdf=None,
@@ -587,7 +621,7 @@ def open_groups_as_dict(
         drop_variables: str | Iterable[str] | None = None,
         use_cftime=None,
         decode_timedelta=None,
-        format=None,
+        format="NETCDF4",
         group: str | None = None,
         lock=None,
         invalid_netcdf=None,
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 7df9596b1ae..be66ed185ff 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -460,6 +460,7 @@ def roundtrip(
             save_kwargs = {}
         if open_kwargs is None:
             open_kwargs = {}
+
         with create_tmp_file(allow_cleanup_failure=allow_cleanup_failure) as path:
             self.save(data, path, **save_kwargs)
             with self.open(path, **open_kwargs) as ds:
@@ -4727,6 +4728,54 @@ def create_store(self):
             ) as store:
                 yield store
 
+    @requires_h5netcdf
+    def test_string_attributes_stored_as_char(self, tmp_path):
+        import h5netcdf
+
+        original = Dataset(attrs={"foo": "bar"})
+        store_path = tmp_path / "tmp.nc"
+        original.to_netcdf(store_path, engine=self.engine, format=self.file_format)
+        with h5netcdf.File(store_path, "r") as ds:
+            # Check that the attribute is stored as a char array
+            assert ds._h5file.attrs["foo"].dtype == np.dtype("S3")
+
+
+@requires_h5netcdf
+class TestNetCDF4ClassicViaH5NetCDFData(TestNetCDF4ClassicViaNetCDF4Data):
+    engine: T_NetcdfEngine = "h5netcdf"
+    file_format: T_NetcdfTypes = "NETCDF4_CLASSIC"
+
+    @contextlib.contextmanager
+    def create_store(self):
+        with create_tmp_file() as tmp_file:
+            with backends.H5NetCDFStore.open(
+                tmp_file, mode="w", format="NETCDF4_CLASSIC"
+            ) as store:
+                yield store
+
+    @requires_netCDF4
+    def test_cross_engine_read_write_netcdf4(self) -> None:
+        # Drop dim3, because its labels include strings. These appear to be
+        # not properly read with python-netCDF4, which converts them into
+        # unicode instead of leaving them as bytes.
+        data = create_test_data().drop_vars("dim3")
+        data.attrs["foo"] = "bar"
+        valid_engines: list[T_NetcdfEngine] = ["netcdf4", "h5netcdf"]
+        for write_engine in valid_engines:
+            with create_tmp_file() as tmp_file:
+                data.to_netcdf(tmp_file, engine=write_engine, format=self.file_format)
+                for read_engine in valid_engines:
+                    with open_dataset(tmp_file, engine=read_engine) as actual:
+                        assert_identical(data, actual)
+
+    def test_group_fails(self):
+        # Check writing group data fails with CLASSIC format
+        original = create_test_data()
+        with pytest.raises(
+            ValueError, match=r"Cannot create sub-groups in `NETCDF4_CLASSIC` format."
+        ):
+            original.to_netcdf(group="sub", format=self.file_format, engine=self.engine)
+
 
 @requires_scipy_or_netCDF4
 class TestGenericNetCDFData(NetCDF3Only, CFEncodedBase):
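
A quick way to exercise the new option with this branch checked out (a minimal sketch, not part of the diff above; the dataset contents and the classic.nc file name are made up for illustration):

import numpy as np
import xarray as xr

# Small dataset with a string attribute; under NETCDF4_CLASSIC the attribute
# is cast to a fixed width char array (see convert_string in the diff above).
ds = xr.Dataset(
    {"t": ("x", np.arange(3, dtype="int32"))},
    attrs={"title": "classic-compatible file"},
)

# Write through h5netcdf, casting variables and attributes to types allowed
# by the classic data model. Per the whats-new note, the file is still written
# as HDF5/netCDF4 and won't be flagged as genuine NETCDF4_CLASSIC until
# h5netcdf itself supports setting that format.
ds.to_netcdf("classic.nc", engine="h5netcdf", format="NETCDF4_CLASSIC")

# Writing sub-groups is rejected in this mode (see test_group_fails):
# ds.to_netcdf("classic.nc", group="sub", engine="h5netcdf", format="NETCDF4_CLASSIC")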