diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 61302ed23c7..c6eadfb91d9 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -2,6 +2,16 @@ .. _whats-new: +Bug fixes +~~~~~~~~~ + +- ``netcdf4`` and ``pydap`` backends now use stricter URL detection to avoid incorrectly claiming + remote URLs. The ``pydap`` backend now only claims URLs with explicit DAP protocol indicators + (``dap2://`` or ``dap4://`` schemes, or ``/dap2/`` or ``/dap4/`` in the URL path). This prevents + both backends from claiming remote Zarr stores and other non-DAP URLs without an explicit + ``engine=`` argument. (:pull:`10804`). By `Ian Hunt-Isaak `_. + + What's New ========== @@ -27,12 +37,12 @@ Breaking changes Bug fixes ~~~~~~~~~ - - Fix error raised when writing scalar variables to Zarr with ``region={}`` (:pull:`10796`). By `Stephan Hoyer `_. + .. _whats-new.2025.09.1: v2025.09.1 (September 29, 2025) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 28565f92de9..d685a47b977 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -462,13 +462,19 @@ class H5netcdfBackendEntrypoint(BackendEntrypoint): supports_groups = True def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: + from xarray.core.utils import is_remote_uri + filename_or_obj = _normalize_filename_or_obj(filename_or_obj) - magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) - if magic_number is not None: - return magic_number.startswith(b"\211HDF\r\n\032\n") + + # Try to read magic number for local files only + is_remote = isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj) + if not is_remote: + magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) + if magic_number is not None: + return magic_number.startswith(b"\211HDF\r\n\032\n") if isinstance(filename_or_obj, str | os.PathLike): - _, ext = os.path.splitext(filename_or_obj) + _, ext = os.path.splitext(str(filename_or_obj)) return ext in {".nc", ".nc4", ".cdf"} return False diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 8d4ca6441c9..e6782723f74 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -701,21 +701,36 @@ class NetCDF4BackendEntrypoint(BackendEntrypoint): supports_groups = True def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: - if isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj): - return True + # Helper to check if magic number is netCDF or HDF5 + def _is_netcdf_magic(magic: bytes) -> bool: + return magic.startswith((b"CDF", b"\211HDF\r\n\032\n")) + + # Helper to check if extension is netCDF + def _has_netcdf_ext(path: str | os.PathLike, is_remote: bool = False) -> bool: + from xarray.core.utils import strip_uri_params + + path = str(path).rstrip("/") + # For remote URIs, strip query parameters and fragments + if is_remote: + path = strip_uri_params(path) + _, ext = os.path.splitext(path) + return ext in {".nc", ".nc4", ".cdf"} - magic_number = ( - bytes(filename_or_obj[:8]) - if isinstance(filename_or_obj, bytes | memoryview) - else try_read_magic_number_from_path(filename_or_obj) - ) - if magic_number is not None: - # netcdf 3 or HDF5 - return magic_number.startswith((b"CDF", b"\211HDF\r\n\032\n")) + if isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj): + # For remote URIs, check extension (accounting for query params/fragments) + # Remote netcdf-c can handle both regular URLs and DAP URLs + return _has_netcdf_ext(filename_or_obj, is_remote=True) if isinstance(filename_or_obj, str | os.PathLike): - _, ext = os.path.splitext(filename_or_obj) - return ext in {".nc", ".nc4", ".cdf"} + # For local paths, check magic number first, then extension + magic_number = try_read_magic_number_from_path(filename_or_obj) + if magic_number is not None: + return _is_netcdf_magic(magic_number) + # No magic number available, fallback to extension + return _has_netcdf_ext(filename_or_obj) + + if isinstance(filename_or_obj, bytes | memoryview): + return _is_netcdf_magic(bytes(filename_or_obj[:8])) return False diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 4fbfe8ee210..4883efe187f 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os from collections.abc import Iterable from typing import TYPE_CHECKING, Any @@ -209,7 +210,23 @@ class PydapBackendEntrypoint(BackendEntrypoint): url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.PydapBackendEntrypoint.html" def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: - return isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj) + if not isinstance(filename_or_obj, str): + return False + + # Check for explicit DAP protocol indicators: + # 1. DAP scheme: dap2:// or dap4:// (case-insensitive, may not be recognized by is_remote_uri) + # 2. Remote URI with /dap2/ or /dap4/ in URL path (case-insensitive) + # Note: We intentionally do NOT check for .dap suffix as that would match + # file extensions like .dap which trigger downloads of binary data + url_lower = filename_or_obj.lower() + if url_lower.startswith(("dap2://", "dap4://")): + return True + + # For standard remote URIs, check for DAP indicators in path + if is_remote_uri(filename_or_obj): + return "/dap2/" in url_lower or "/dap4/" in url_lower + + return False def open_dataset( self, diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 5ac5008098b..c7e4956820c 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -330,12 +330,12 @@ class ScipyBackendEntrypoint(BackendEntrypoint): """ Backend for netCDF files based on the scipy package. - It can open ".nc", ".nc4", ".cdf" and ".gz" files but will only be + It can open ".nc", ".cdf", and "nc..gz" files but will only be selected as the default if the "netcdf4" and "h5netcdf" engines are not available. It has the advantage that is is a lightweight engine that has no system requirements (unlike netcdf4 and h5netcdf). - Additionally it can open gizp compressed (".gz") files. + Additionally it can open gzip compressed (".gz") files. For more information about the underlying library, visit: https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.netcdf_file.html @@ -347,13 +347,15 @@ class ScipyBackendEntrypoint(BackendEntrypoint): backends.H5netcdfBackendEntrypoint """ - description = "Open netCDF files (.nc, .nc4, .cdf and .gz) using scipy in Xarray" + description = "Open netCDF files (.nc, .cdf and .nc.gz) using scipy in Xarray" url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.ScipyBackendEntrypoint.html" def guess_can_open( self, filename_or_obj: T_PathFileOrDataStore, ) -> bool: + from xarray.core.utils import is_remote_uri + filename_or_obj = _normalize_filename_or_obj(filename_or_obj) magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) if magic_number is not None and magic_number.startswith(b"\x1f\x8b"): @@ -363,8 +365,14 @@ def guess_can_open( return magic_number.startswith(b"CDF") if isinstance(filename_or_obj, str | os.PathLike): - _, ext = os.path.splitext(filename_or_obj) - return ext in {".nc", ".nc4", ".cdf", ".gz"} + from pathlib import Path + + # scipy can only handle local files + if isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj): + return False + + suffix = "".join(Path(filename_or_obj).suffixes) + return suffix in {".nc", ".cdf", ".nc.gz"} return False diff --git a/xarray/core/utils.py b/xarray/core/utils.py index ec4edf255f6..db5827148ab 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -728,6 +728,40 @@ def is_remote_uri(path: str) -> bool: return bool(re.search(r"^[a-z][a-z0-9]*(\://|\:\:)", path)) +def strip_uri_params(uri: str) -> str: + """Strip query parameters and fragments from a URI. + + This is useful for extracting the file extension from URLs that + contain query parameters (e.g., OPeNDAP constraint expressions). + + Parameters + ---------- + uri : str + The URI to strip + + Returns + ------- + str + The URI without query parameters (?) or fragments (#) + + Examples + -------- + >>> strip_uri_params("http://example.com/file.nc?var=temp&time=0") + 'http://example.com/file.nc' + >>> strip_uri_params("http://example.com/file.nc#section") + 'http://example.com/file.nc' + >>> strip_uri_params("/local/path/file.nc") + '/local/path/file.nc' + """ + from urllib.parse import urlsplit, urlunsplit + + # Use urlsplit to properly parse the URI + # This handles both absolute URLs and relative paths + parsed = urlsplit(uri) + # Reconstruct without query and fragment using urlunsplit + return urlunsplit((parsed.scheme, parsed.netloc, parsed.path, "", "")) + + def read_magic_number_from_file(filename_or_obj, count=8) -> bytes: # check byte header to determine file type if not isinstance(filename_or_obj, io.IOBase): diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 996644e5c16..5d049aca17f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -7167,7 +7167,10 @@ def test_netcdf4_entrypoint(tmp_path: Path) -> None: _check_guess_can_open_and_open(entrypoint, path, engine="netcdf4", expected=ds) _check_guess_can_open_and_open(entrypoint, str(path), engine="netcdf4", expected=ds) - assert entrypoint.guess_can_open("http://something/remote") + # Remote URLs without extensions are no longer claimed (stricter detection) + assert not entrypoint.guess_can_open("http://something/remote") + # Remote URLs with netCDF extensions are claimed + assert entrypoint.guess_can_open("http://something/remote.nc") assert entrypoint.guess_can_open("something-local.nc") assert entrypoint.guess_can_open("something-local.nc4") assert entrypoint.guess_can_open("something-local.cdf") @@ -7210,6 +7213,10 @@ def test_scipy_entrypoint(tmp_path: Path) -> None: assert entrypoint.guess_can_open("something-local.nc.gz") assert not entrypoint.guess_can_open("not-found-and-no-extension") assert not entrypoint.guess_can_open(b"not-a-netcdf-file") + # Should not claim .gz files that aren't netCDF + assert not entrypoint.guess_can_open("something.zarr.gz") + assert not entrypoint.guess_can_open("something.tar.gz") + assert not entrypoint.guess_can_open("something.txt.gz") @requires_h5netcdf @@ -7260,6 +7267,77 @@ def test_zarr_entrypoint(tmp_path: Path) -> None: assert not entrypoint.guess_can_open("something.zarr.txt") +@requires_h5netcdf +@requires_netCDF4 +@requires_pydap +@requires_zarr +def test_remote_url_backend_auto_detection() -> None: + """ + Test that remote URLs are correctly selected by the backend resolution system. + + This tests the fix for issue where netCDF4, h5netcdf, and pydap backends were + claiming ALL remote URLs, preventing remote Zarr stores from being + auto-detected. + + See: https://github.com/pydata/xarray/issues/10801 + """ + from xarray.backends.plugins import guess_engine + + # Test cases: (url, expected_backend) + test_cases = [ + # Remote Zarr URLs + ("https://example.com/store.zarr", "zarr"), + ("http://example.com/data.zarr/", "zarr"), + ("s3://bucket/path/to/data.zarr", "zarr"), + # Remote netCDF URLs (non-DAP) - h5netcdf wins (first in order, no query params) + ("https://example.com/file.nc", "h5netcdf"), + ("http://example.com/data.nc4", "h5netcdf"), + ("https://example.com/test.cdf", "h5netcdf"), + ("s3://bucket/path/to/data.nc", "h5netcdf"), + # Remote netCDF URLs with query params - netcdf4 wins + # Note: Query params are typically indicative of DAP URLs (e.g., OPeNDAP constraint expressions), + # so we prefer netcdf4 (which has DAP support) over h5netcdf (which doesn't) + ("https://example.com/data.nc?var=temperature&time=0", "netcdf4"), + ( + "http://test.opendap.org/opendap/dap4/StaggeredGrid.nc4?dap4.ce=/time[0:1:0]", + "netcdf4", + ), + # DAP URLs without extensions - pydap wins + ("dap2://opendap.earthdata.nasa.gov/collections/dataset", "pydap"), + ("dap4://opendap.earthdata.nasa.gov/collections/dataset", "pydap"), + ("DAP2://example.com/dataset", "pydap"), # uppercase scheme + ("DAP4://example.com/dataset", "pydap"), # uppercase scheme + ("https://example.com/services/DAP2/dataset", "pydap"), # uppercase in path + # DAP URLs with .nc extensions (no query params) - h5netcdf wins (first in order) + ("http://test.opendap.org/opendap/dap4/StaggeredGrid.nc4", "h5netcdf"), + ("https://example.com/DAP4/data.nc", "h5netcdf"), + ("http://example.com/data/Dap4/file.nc", "h5netcdf"), + ] + + for url, expected_backend in test_cases: + engine = guess_engine(url) + assert engine == expected_backend, ( + f"URL {url!r} should select {expected_backend!r} but got {engine!r}" + ) + + # URLs that should raise ValueError (no backend can open them) + invalid_urls = [ + "http://test.opendap.org/opendap/data/nc/coads_climatology.nc.dap", # .dap suffix + "https://example.com/data.dap", # .dap suffix + "http://opendap.example.com/data", # no extension, no DAP indicators + "https://test.opendap.org/dataset", # no extension, no DAP indicators + ] + + for url in invalid_urls: + try: + engine = guess_engine(url) + raise AssertionError( + f"URL {url!r} should not be claimed by any backend, but {engine!r} claimed it" + ) + except ValueError: + pass # Expected + + @requires_netCDF4 @pytest.mark.parametrize("str_type", (str, np.str_)) def test_write_file_from_np_str(str_type: type[str | np.str_], tmpdir: str) -> None: