Skip to content

Commit 4722bf1

Browse files
authored
Change default netCDF engine to use h5netcdf and add netcdf_engine_order (#10755)
* Add option for netcdf_engine_order

  The default `engine` when reading/writing netCDF files is now h5netcdf or scipy, which are typically faster than the prior default of netCDF4-python. You can control this default behavior explicitly via the new `netcdf_engine_order` parameter in `set_options()`, e.g., `xr.set_options(netcdf_engine_order=['netcdf4', 'scipy', 'h5netcdf'])` to restore the prior defaults.

  I've also updated the documentation page which misled @lesserwhirls about Xarray supporting invalid netCDF files without `invalid_netcdf=True`.

  Fixes #10657

* Fix test failures
* Automatically support NCZarr
* Revert "Automatically support NCZarr"

  This reverts commit 18fe84f.

* Reapply "Automatically support NCZarr"

  This reverts commit 4131449.

* Fix mypy
* spelling
* Improve typing for _normalize_path()
* hard code engine="netcdf4" for test_encoding_enum__no_fill_value
* Fix reading netcdf3 files with open_datatree
* Set engine in test_encoding_enum__multiple_variable_with_enum
* set yet another test to only use netcdf4
1 parent a3bd20d commit 4722bf1

18 files changed

+219
-84
lines changed

doc/user-guide/io.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -591,8 +591,8 @@ The library ``h5netcdf`` allows writing some dtypes that aren't
591591
allowed in netCDF4 (see
592592
`h5netcdf documentation <https://github.com/h5netcdf/h5netcdf#invalid-netcdf-files>`_).
593593
This feature is available through :py:meth:`DataArray.to_netcdf` and
594-
:py:meth:`Dataset.to_netcdf` when used with ``engine="h5netcdf"``
595-
and currently raises a warning unless ``invalid_netcdf=True`` is set.
594+
:py:meth:`Dataset.to_netcdf` when used with ``engine="h5netcdf"``, but only if
595+
``invalid_netcdf=True`` is explicitly set.
596596

597597
.. warning::
598598

doc/whats-new.rst

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,12 @@ Breaking changes
2929
dataset in-place. (:issue:`10167`)
3030
By `Maximilian Roos <https://github.com/max-sixty>`_.
3131

32-
- The default ``engine`` when reading/writing netCDF files in-memory is now
33-
netCDF4, consistent with Xarray's default ``engine`` when read/writing netCDF
34-
files to disk (:pull:`10624`).
32+
- The default ``engine`` when reading/writing netCDF files is now h5netcdf
33+
or scipy, which are typically faster than the prior default of netCDF4-python.
34+
You can control this default behavior explicitly via the new
35+
``netcdf_engine_order`` parameter in :py:func:`~xarray.set_options`, e.g.,
36+
``xr.set_options(netcdf_engine_order=['netcdf4', 'scipy', 'h5netcdf'])`` to
37+
restore the prior defaults (:issue:`10657`).
3538
By `Stephan Hoyer <https://github.com/shoyer>`_.
3639

3740
Deprecations

xarray/backends/api.py

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -415,9 +415,10 @@ def open_dataset(
415415
, installed backend \
416416
or subclass of xarray.backends.BackendEntrypoint, optional
417417
Engine to use when reading files. If not provided, the default engine
418-
is chosen based on available dependencies, with a preference for
419-
"netcdf4". A custom backend class (a subclass of ``BackendEntrypoint``)
420-
can also be used.
418+
is chosen based on available dependencies, by default preferring
419+
"h5netcdf" over "scipy" over "netcdf4" (customizable via
420+
``netcdf_engine_order`` in ``xarray.set_options()``). A custom backend
421+
class (a subclass of ``BackendEntrypoint``) can also be used.
421422
chunks : int, dict, 'auto' or None, default: None
422423
If provided, used to load the data into dask arrays.
423424
@@ -658,8 +659,10 @@ def open_dataarray(
658659
, installed backend \
659660
or subclass of xarray.backends.BackendEntrypoint, optional
660661
Engine to use when reading files. If not provided, the default engine
661-
is chosen based on available dependencies, with a preference for
662-
"netcdf4".
662+
is chosen based on available dependencies, by default preferring
663+
"h5netcdf" over "scipy" over "netcdf4" (customizable via
664+
``netcdf_engine_order`` in ``xarray.set_options()``). A custom backend
665+
class (a subclass of ``BackendEntrypoint``) can also be used.
663666
chunks : int, dict, 'auto' or None, default: None
664667
If provided, used to load the data into dask arrays.
665668
@@ -882,9 +885,10 @@ def open_datatree(
882885
engine : {"netcdf4", "h5netcdf", "zarr", None}, \
883886
installed backend or xarray.backends.BackendEntrypoint, optional
884887
Engine to use when reading files. If not provided, the default engine
885-
is chosen based on available dependencies, with a preference for
886-
"netcdf4". A custom backend class (a subclass of ``BackendEntrypoint``)
887-
can also be used.
888+
is chosen based on available dependencies, by default preferring
889+
"h5netcdf" over "netcdf4" (customizable via ``netcdf_engine_order`` in
890+
``xarray.set_options()``). A custom backend class (a subclass of
891+
``BackendEntrypoint``) can also be used.
888892
chunks : int, dict, 'auto' or None, default: None
889893
If provided, used to load the data into dask arrays.
890894
@@ -1040,7 +1044,7 @@ def open_datatree(
10401044
kwargs.update(backend_kwargs)
10411045

10421046
if engine is None:
1043-
engine = plugins.guess_engine(filename_or_obj)
1047+
engine = plugins.guess_engine(filename_or_obj, must_support_groups=True)
10441048

10451049
if from_array_kwargs is None:
10461050
from_array_kwargs = {}
@@ -1126,8 +1130,10 @@ def open_groups(
11261130
engine : {"netcdf4", "h5netcdf", "zarr", None}, \
11271131
installed backend or xarray.backends.BackendEntrypoint, optional
11281132
Engine to use when reading files. If not provided, the default engine
1129-
is chosen based on available dependencies, with a preference for
1130-
"netcdf4". A custom backend class (a subclass of ``BackendEntrypoint``)
1133+
is chosen based on available dependencies, by default preferring
1134+
"h5netcdf" over "netcdf4" (customizable via ``netcdf_engine_order`` in
1135+
``xarray.set_options()``). A custom backend class (a subclass of
1136+
``BackendEntrypoint``)
11311137
can also be used.
11321138
chunks : int, dict, 'auto' or None, default: None
11331139
If provided, used to load the data into dask arrays.
@@ -1283,7 +1289,7 @@ def open_groups(
12831289
kwargs.update(backend_kwargs)
12841290

12851291
if engine is None:
1286-
engine = plugins.guess_engine(filename_or_obj)
1292+
engine = plugins.guess_engine(filename_or_obj, must_support_groups=True)
12871293

12881294
if from_array_kwargs is None:
12891295
from_array_kwargs = {}
@@ -1443,8 +1449,10 @@ def open_mfdataset(
14431449
, installed backend \
14441450
or subclass of xarray.backends.BackendEntrypoint, optional
14451451
Engine to use when reading files. If not provided, the default engine
1446-
is chosen based on available dependencies, with a preference for
1447-
"netcdf4".
1452+
is chosen based on available dependencies, by default preferring
1453+
"h5netcdf" over "scipy" over "netcdf4" (customizable via
1454+
``netcdf_engine_order`` in ``xarray.set_options()``). A custom backend
1455+
class (a subclass of ``BackendEntrypoint``) can also be used.
14481456
data_vars : {"minimal", "different", "all"} or list of str, default: "all"
14491457
These data variables will be concatenated together:
14501458
* "minimal": Only data variables in which the dimension already

xarray/backends/common.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,14 +53,18 @@
5353

5454

5555
@overload
56-
def _normalize_path(path: str | os.PathLike) -> str: ...
56+
def _normalize_path(path: os.PathLike) -> str: ...
57+
58+
59+
@overload
60+
def _normalize_path(path: str) -> str: ...
5761

5862

5963
@overload
6064
def _normalize_path(path: T) -> T: ...
6165

6266

63-
def _normalize_path(path: str | os.PathLike | T) -> str | T:
67+
def _normalize_path(path: os.PathLike | str | T) -> str | T:
6468
"""
6569
Normalize pathlikes to string.
6670
@@ -85,7 +89,7 @@ def _normalize_path(path: str | os.PathLike | T) -> str | T:
8589
if isinstance(path, str) and not is_remote_uri(path):
8690
path = os.path.abspath(os.path.expanduser(path))
8791

88-
return path # type:ignore [return-value]
92+
return path # type: ignore[return-value]
8993

9094

9195
@overload
@@ -749,11 +753,15 @@ class BackendEntrypoint:
749753
url : str, default: ""
750754
A string with the URL to the backend's documentation.
751755
The setting of this attribute is not mandatory.
756+
supports_groups : bool, default: False
757+
Whether the backend supports opening groups (via open_datatree and
758+
open_groups_as_dict) or not.
752759
"""
753760

754761
open_dataset_parameters: ClassVar[tuple | None] = None
755762
description: ClassVar[str] = ""
756763
url: ClassVar[str] = ""
764+
supports_groups: ClassVar[bool] = False
757765

758766
def __repr__(self) -> str:
759767
txt = f"<{type(self).__name__}>"
@@ -808,6 +816,8 @@ def open_datatree(
808816
) -> DataTree:
809817
"""
810818
Backend open_datatree method used by Xarray in :py:func:`~xarray.open_datatree`.
819+
820+
If implemented, set the class variable supports_groups to True.
811821
"""
812822

813823
raise NotImplementedError()
@@ -830,6 +840,8 @@ def open_groups_as_dict(
830840
This function exists to provide a universal way to open all groups in a file,
831841
before applying any additional consistency checks or requirements necessary
832842
to create a `DataTree` object (typically done using :py:meth:`~xarray.DataTree.from_dict`).
843+
844+
If implemented, set the class variable supports_groups to True.
833845
"""
834846

835847
raise NotImplementedError()

xarray/backends/h5netcdf_.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -459,6 +459,7 @@ class H5netcdfBackendEntrypoint(BackendEntrypoint):
459459
"Open netCDF (.nc, .nc4 and .cdf) and most HDF5 files using h5netcdf in Xarray"
460460
)
461461
url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.H5netcdfBackendEntrypoint.html"
462+
supports_groups = True
462463

463464
def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool:
464465
filename_or_obj = _normalize_filename_or_obj(filename_or_obj)

xarray/backends/netCDF4_.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -698,6 +698,7 @@ class NetCDF4BackendEntrypoint(BackendEntrypoint):
698698
"Open netCDF (.nc, .nc4 and .cdf) and most HDF5 files using netCDF4 in Xarray"
699699
)
700700
url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.NetCDF4BackendEntrypoint.html"
701+
supports_groups = True
701702

702703
def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool:
703704
if isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj):

xarray/backends/plugins.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from typing import TYPE_CHECKING, Any
1010

1111
from xarray.backends.common import BACKEND_ENTRYPOINTS, BackendEntrypoint
12+
from xarray.core.options import OPTIONS
1213
from xarray.core.utils import module_available
1314

1415
if TYPE_CHECKING:
@@ -18,8 +19,6 @@
1819
from xarray.backends.common import AbstractDataStore
1920
from xarray.core.types import ReadBuffer
2021

21-
NETCDF_BACKENDS_ORDER = ["netcdf4", "h5netcdf", "scipy"]
22-
2322

2423
def remove_duplicates(entrypoints: EntryPoints) -> list[EntryPoint]:
2524
# sort and group entrypoints by name
@@ -91,8 +90,8 @@ def set_missing_parameters(
9190
def sort_backends(
9291
backend_entrypoints: dict[str, type[BackendEntrypoint]],
9392
) -> dict[str, type[BackendEntrypoint]]:
94-
ordered_backends_entrypoints = {}
95-
for be_name in NETCDF_BACKENDS_ORDER:
93+
ordered_backends_entrypoints: dict[str, type[BackendEntrypoint]] = {}
94+
for be_name in OPTIONS["netcdf_engine_order"]:
9695
if backend_entrypoints.get(be_name) is not None:
9796
ordered_backends_entrypoints[be_name] = backend_entrypoints.pop(be_name)
9897
ordered_backends_entrypoints.update(
@@ -144,10 +143,13 @@ def guess_engine(
144143
| bytes
145144
| memoryview
146145
| AbstractDataStore,
146+
must_support_groups: bool = False,
147147
) -> str | type[BackendEntrypoint]:
148148
engines = list_engines()
149149

150150
for engine, backend in engines.items():
151+
if must_support_groups and not backend.supports_groups:
152+
continue
151153
try:
152154
if backend.guess_can_open(store_spec):
153155
return engine
@@ -162,6 +164,8 @@ def guess_engine(
162164
for engine, (_, backend_cls) in BACKEND_ENTRYPOINTS.items():
163165
try:
164166
backend = backend_cls()
167+
if must_support_groups and not backend.supports_groups:
168+
continue
165169
if backend.guess_can_open(store_spec):
166170
compatible_engines.append(engine)
167171
except Exception:
@@ -180,6 +184,15 @@ def guess_engine(
180184
"https://docs.xarray.dev/en/stable/getting-started-guide/installing.html\n"
181185
"https://docs.xarray.dev/en/stable/user-guide/io.html"
182186
)
187+
elif must_support_groups:
188+
error_msg = (
189+
"xarray is unable to open this file because it has no currently "
190+
"installed IO backends that support reading groups (e.g., h5netcdf "
191+
"or netCDF4-python). Xarray's read/write support requires "
192+
"installing optional IO dependencies, see:\n"
193+
"https://docs.xarray.dev/en/stable/getting-started-guide/installing.html\n"
194+
"https://docs.xarray.dev/en/stable/user-guide/io"
195+
)
183196
else:
184197
error_msg = (
185198
"xarray is unable to open this file because it has no currently "

xarray/backends/scipy_.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ def _normalize_filename_or_obj(
323323
if isinstance(filename_or_obj, bytes | memoryview):
324324
return io.BytesIO(filename_or_obj)
325325
else:
326-
return _normalize_path(filename_or_obj) # type: ignore[return-value]
326+
return _normalize_path(filename_or_obj)
327327

328328

329329
class ScipyBackendEntrypoint(BackendEntrypoint):

0 commit comments

Comments
 (0)