From 2af21abbb9874799f3a48ce6865e815b7a5c0ee6 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Tue, 30 Sep 2025 12:58:58 -0400 Subject: [PATCH 1/9] fix: be more more caution when claiming a backend can open a URL --- xarray/backends/netCDF4_.py | 7 +++- xarray/backends/pydap_.py | 10 ++++- xarray/tests/test_backends.py | 71 +++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 2 deletions(-) diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 8d4ca6441c9..cf398c11085 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -702,7 +702,12 @@ class NetCDF4BackendEntrypoint(BackendEntrypoint): def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: if isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj): - return True + # For remote URIs, check file extension to avoid claiming non-netCDF URLs + # (e.g., remote Zarr stores) + _, ext = os.path.splitext(filename_or_obj.rstrip("/")) + # Accept remote URIs with netCDF extensions or no extension + # (OPeNDAP endpoints often have no extension) + return ext in {".nc", ".nc4", ".cdf", ""} magic_number = ( bytes(filename_or_obj[:8]) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 4fbfe8ee210..88325b2c9d2 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os from collections.abc import Iterable from typing import TYPE_CHECKING, Any @@ -209,7 +210,14 @@ class PydapBackendEntrypoint(BackendEntrypoint): url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.PydapBackendEntrypoint.html" def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: - return isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj) + if not (isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj)): + return False + + # Check file extension to avoid claiming non-OPeNDAP URLs (e.g., remote Zarr stores) + _, ext = os.path.splitext(filename_or_obj.rstrip("/")) + # Pydap handles OPeNDAP endpoints, which typically have no extension or .nc/.nc4 + # Reject URLs with non-OPeNDAP extensions like .zarr + return ext not in {".zarr", ".zip", ".tar", ".gz"} def open_dataset( self, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 7df9596b1ae..0190c0e4431 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -7252,6 +7252,77 @@ def test_zarr_entrypoint(tmp_path: Path) -> None: assert not entrypoint.guess_can_open("something.zarr.txt") +@requires_netCDF4 +@requires_pydap +@requires_zarr +def test_remote_url_backend_auto_detection() -> None: + """ + Test that remote URLs are correctly claimed by appropriate backends. + + This tests the fix for issue where netCDF4 and pydap backends were + claiming ALL remote URLs, preventing remote Zarr stores from being + auto-detected. + + See: https://github.com/pydata/xarray/issues/XXXXX + """ + from xarray.backends.netCDF4_ import NetCDF4BackendEntrypoint + from xarray.backends.pydap_ import PydapBackendEntrypoint + from xarray.backends.zarr import ZarrBackendEntrypoint + + netcdf4_entrypoint = NetCDF4BackendEntrypoint() + pydap_entrypoint = PydapBackendEntrypoint() + zarr_entrypoint = ZarrBackendEntrypoint() + + # Remote Zarr URLs should be claimed by Zarr backend, not netCDF4/pydap + remote_zarr_urls = [ + "https://example.com/store.zarr", + "http://example.com/data.zarr/", + "s3://bucket/path/to/data.zarr", + ] + + for url in remote_zarr_urls: + assert zarr_entrypoint.guess_can_open(url), f"Zarr should claim {url}" + assert not netcdf4_entrypoint.guess_can_open(url), ( + f"NetCDF4 should not claim {url}" + ) + assert not pydap_entrypoint.guess_can_open(url), f"Pydap should not claim {url}" + + # Remote netCDF URLs with extensions should be claimed by netCDF4, not Zarr + remote_netcdf_urls_with_ext = [ + "https://example.com/file.nc", + "http://example.com/data.nc4", + "https://example.com/test.cdf", + ] + + for url in remote_netcdf_urls_with_ext: + assert not zarr_entrypoint.guess_can_open(url), f"Zarr should not claim {url}" + assert netcdf4_entrypoint.guess_can_open(url), f"NetCDF4 should claim {url}" + + # OPeNDAP endpoints (no extension) should be claimed by both netCDF4 and pydap + opendap_urls = [ + "http://opendap.example.com/data", + "https://test.opendap.org/dataset", + ] + + for url in opendap_urls: + assert not zarr_entrypoint.guess_can_open(url), f"Zarr should not claim {url}" + assert netcdf4_entrypoint.guess_can_open(url), f"NetCDF4 should claim {url}" + assert pydap_entrypoint.guess_can_open(url), f"Pydap should claim {url}" + + # Other file types should not be claimed + other_urls = [ + "https://example.com/data.zip", + "https://example.com/data.tar.gz", + ] + + for url in other_urls: + assert not zarr_entrypoint.guess_can_open(url), f"Zarr should not claim {url}" + assert not netcdf4_entrypoint.guess_can_open(url), ( + f"NetCDF4 should not claim {url}" + ) + assert not pydap_entrypoint.guess_can_open(url), f"Pydap should not claim {url}" + + @requires_netCDF4 @pytest.mark.parametrize("str_type", (str, np.str_)) def test_write_file_from_np_str(str_type: type[str | np.str_], tmpdir: str) -> None: From 1a3e7dfb3ad5f079df0c98e7a4182957ce845d26 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Tue, 30 Sep 2025 14:13:00 -0400 Subject: [PATCH 2/9] add whats new entry --- doc/whats-new.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b8ffab2889f..b7f6bce85ea 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,7 +24,9 @@ Deprecations Bug fixes ~~~~~~~~~ - +- ``netcdf`` and ``pydap`` engines no longer incorrectly claim to read all remote URLs preventing + the ``zarr`` backend from reading remote zarr stores without an explicit ``engine=`` argument. + (:pull:`10804`). By `Ian Hunt-Isaak Date: Tue, 30 Sep 2025 15:37:05 -0400 Subject: [PATCH 3/9] fixes from review --- doc/whats-new.rst | 2 +- xarray/tests/test_backends.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b7f6bce85ea..41a5f93a67d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -26,7 +26,7 @@ Bug fixes ~~~~~~~~~ - ``netcdf`` and ``pydap`` engines no longer incorrectly claim to read all remote URLs preventing the ``zarr`` backend from reading remote zarr stores without an explicit ``engine=`` argument. - (:pull:`10804`). By `Ian Hunt-Isaak `_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 0190c0e4431..d7d56acbb2e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -7263,7 +7263,7 @@ def test_remote_url_backend_auto_detection() -> None: claiming ALL remote URLs, preventing remote Zarr stores from being auto-detected. - See: https://github.com/pydata/xarray/issues/XXXXX + See: https://github.com/pydata/xarray/issues/10801 """ from xarray.backends.netCDF4_ import NetCDF4BackendEntrypoint from xarray.backends.pydap_ import PydapBackendEntrypoint From 7ed1f0ad231c9a666fdc65cb921595ae67f88262 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Wed, 1 Oct 2025 11:16:14 -0400 Subject: [PATCH 4/9] more caution in scipy netcdf backend --- xarray/backends/scipy_.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 5ac5008098b..08c161a1926 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -330,7 +330,7 @@ class ScipyBackendEntrypoint(BackendEntrypoint): """ Backend for netCDF files based on the scipy package. - It can open ".nc", ".nc4", ".cdf" and ".gz" files but will only be + It can open ".nc", ".cdf", and "nc..gz" files but will only be selected as the default if the "netcdf4" and "h5netcdf" engines are not available. It has the advantage that is is a lightweight engine that has no system requirements (unlike netcdf4 and h5netcdf). @@ -347,7 +347,7 @@ class ScipyBackendEntrypoint(BackendEntrypoint): backends.H5netcdfBackendEntrypoint """ - description = "Open netCDF files (.nc, .nc4, .cdf and .gz) using scipy in Xarray" + description = "Open netCDF files (.nc, .cdf and .nc.gz) using scipy in Xarray" url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.ScipyBackendEntrypoint.html" def guess_can_open( @@ -364,7 +364,7 @@ def guess_can_open( if isinstance(filename_or_obj, str | os.PathLike): _, ext = os.path.splitext(filename_or_obj) - return ext in {".nc", ".nc4", ".cdf", ".gz"} + return ext in {".nc", ".cdf", ".nc.gz"} return False From 60c11585e07ec768833ffc6dcb155a6115b829a6 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Wed, 1 Oct 2025 11:27:19 -0400 Subject: [PATCH 5/9] correct suffix detection for scipy backend --- xarray/backends/scipy_.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 08c161a1926..59ed3743bbc 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -363,8 +363,10 @@ def guess_can_open( return magic_number.startswith(b"CDF") if isinstance(filename_or_obj, str | os.PathLike): - _, ext = os.path.splitext(filename_or_obj) - return ext in {".nc", ".cdf", ".nc.gz"} + from pathlib import Path + + suffix = "".join(Path(filename_or_obj).suffixes) + return suffix in {".nc", ".cdf", ".nc.gz"} return False From d2334e42781a729dfca505f6f159dd0e7d49bef1 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Fri, 3 Oct 2025 14:17:36 -0400 Subject: [PATCH 6/9] stricter URL detection for netcdf/dap --- doc/whats-new.rst | 8 ++- xarray/backends/h5netcdf_.py | 8 ++- xarray/backends/netCDF4_.py | 44 ++++++++------ xarray/backends/pydap_.py | 21 +++++-- xarray/core/utils.py | 34 +++++++++++ xarray/tests/test_backends.py | 111 ++++++++++++++++++---------------- 6 files changed, 146 insertions(+), 80 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 41a5f93a67d..e60b0417213 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,9 +24,11 @@ Deprecations Bug fixes ~~~~~~~~~ -- ``netcdf`` and ``pydap`` engines no longer incorrectly claim to read all remote URLs preventing - the ``zarr`` backend from reading remote zarr stores without an explicit ``engine=`` argument. - (:pull:`10804`). By `Ian Hunt-Isaak `_. +- ``netcdf4`` and ``pydap`` backends now use stricter URL detection to avoid incorrectly claiming + remote URLs. The ``pydap`` backend now only claims URLs with explicit DAP protocol indicators + (``dap2://`` or ``dap4://`` schemes, or ``/dap2/`` or ``/dap4/`` in the URL path). This prevents + both backends from claiming remote Zarr stores and other non-DAP URLs without an explicit + ``engine=`` argument. (:pull:`10804`). By `Ian Hunt-Isaak `_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 28565f92de9..1801ab9b6f4 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -462,13 +462,19 @@ class H5netcdfBackendEntrypoint(BackendEntrypoint): supports_groups = True def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: + from xarray.core.utils import is_remote_uri, strip_uri_params + filename_or_obj = _normalize_filename_or_obj(filename_or_obj) magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) if magic_number is not None: return magic_number.startswith(b"\211HDF\r\n\032\n") if isinstance(filename_or_obj, str | os.PathLike): - _, ext = os.path.splitext(filename_or_obj) + path = str(filename_or_obj) + # For remote URIs, strip query parameters and fragments before checking extension + if isinstance(filename_or_obj, str) and is_remote_uri(path): + path = strip_uri_params(path) + _, ext = os.path.splitext(path) return ext in {".nc", ".nc4", ".cdf"} return False diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index cf398c11085..e6782723f74 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -701,26 +701,36 @@ class NetCDF4BackendEntrypoint(BackendEntrypoint): supports_groups = True def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: + # Helper to check if magic number is netCDF or HDF5 + def _is_netcdf_magic(magic: bytes) -> bool: + return magic.startswith((b"CDF", b"\211HDF\r\n\032\n")) + + # Helper to check if extension is netCDF + def _has_netcdf_ext(path: str | os.PathLike, is_remote: bool = False) -> bool: + from xarray.core.utils import strip_uri_params + + path = str(path).rstrip("/") + # For remote URIs, strip query parameters and fragments + if is_remote: + path = strip_uri_params(path) + _, ext = os.path.splitext(path) + return ext in {".nc", ".nc4", ".cdf"} + if isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj): - # For remote URIs, check file extension to avoid claiming non-netCDF URLs - # (e.g., remote Zarr stores) - _, ext = os.path.splitext(filename_or_obj.rstrip("/")) - # Accept remote URIs with netCDF extensions or no extension - # (OPeNDAP endpoints often have no extension) - return ext in {".nc", ".nc4", ".cdf", ""} - - magic_number = ( - bytes(filename_or_obj[:8]) - if isinstance(filename_or_obj, bytes | memoryview) - else try_read_magic_number_from_path(filename_or_obj) - ) - if magic_number is not None: - # netcdf 3 or HDF5 - return magic_number.startswith((b"CDF", b"\211HDF\r\n\032\n")) + # For remote URIs, check extension (accounting for query params/fragments) + # Remote netcdf-c can handle both regular URLs and DAP URLs + return _has_netcdf_ext(filename_or_obj, is_remote=True) if isinstance(filename_or_obj, str | os.PathLike): - _, ext = os.path.splitext(filename_or_obj) - return ext in {".nc", ".nc4", ".cdf"} + # For local paths, check magic number first, then extension + magic_number = try_read_magic_number_from_path(filename_or_obj) + if magic_number is not None: + return _is_netcdf_magic(magic_number) + # No magic number available, fallback to extension + return _has_netcdf_ext(filename_or_obj) + + if isinstance(filename_or_obj, bytes | memoryview): + return _is_netcdf_magic(bytes(filename_or_obj[:8])) return False diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 88325b2c9d2..4883efe187f 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -210,14 +210,23 @@ class PydapBackendEntrypoint(BackendEntrypoint): url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.PydapBackendEntrypoint.html" def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: - if not (isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj)): + if not isinstance(filename_or_obj, str): return False - # Check file extension to avoid claiming non-OPeNDAP URLs (e.g., remote Zarr stores) - _, ext = os.path.splitext(filename_or_obj.rstrip("/")) - # Pydap handles OPeNDAP endpoints, which typically have no extension or .nc/.nc4 - # Reject URLs with non-OPeNDAP extensions like .zarr - return ext not in {".zarr", ".zip", ".tar", ".gz"} + # Check for explicit DAP protocol indicators: + # 1. DAP scheme: dap2:// or dap4:// (case-insensitive, may not be recognized by is_remote_uri) + # 2. Remote URI with /dap2/ or /dap4/ in URL path (case-insensitive) + # Note: We intentionally do NOT check for .dap suffix as that would match + # file extensions like .dap which trigger downloads of binary data + url_lower = filename_or_obj.lower() + if url_lower.startswith(("dap2://", "dap4://")): + return True + + # For standard remote URIs, check for DAP indicators in path + if is_remote_uri(filename_or_obj): + return "/dap2/" in url_lower or "/dap4/" in url_lower + + return False def open_dataset( self, diff --git a/xarray/core/utils.py b/xarray/core/utils.py index ec4edf255f6..db5827148ab 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -728,6 +728,40 @@ def is_remote_uri(path: str) -> bool: return bool(re.search(r"^[a-z][a-z0-9]*(\://|\:\:)", path)) +def strip_uri_params(uri: str) -> str: + """Strip query parameters and fragments from a URI. + + This is useful for extracting the file extension from URLs that + contain query parameters (e.g., OPeNDAP constraint expressions). + + Parameters + ---------- + uri : str + The URI to strip + + Returns + ------- + str + The URI without query parameters (?) or fragments (#) + + Examples + -------- + >>> strip_uri_params("http://example.com/file.nc?var=temp&time=0") + 'http://example.com/file.nc' + >>> strip_uri_params("http://example.com/file.nc#section") + 'http://example.com/file.nc' + >>> strip_uri_params("/local/path/file.nc") + '/local/path/file.nc' + """ + from urllib.parse import urlsplit, urlunsplit + + # Use urlsplit to properly parse the URI + # This handles both absolute URLs and relative paths + parsed = urlsplit(uri) + # Reconstruct without query and fragment using urlunsplit + return urlunsplit((parsed.scheme, parsed.netloc, parsed.path, "", "")) + + def read_magic_number_from_file(filename_or_obj, count=8) -> bytes: # check byte header to determine file type if not isinstance(filename_or_obj, io.IOBase): diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index d7d56acbb2e..2feb31504b2 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -7159,7 +7159,10 @@ def test_netcdf4_entrypoint(tmp_path: Path) -> None: _check_guess_can_open_and_open(entrypoint, path, engine="netcdf4", expected=ds) _check_guess_can_open_and_open(entrypoint, str(path), engine="netcdf4", expected=ds) - assert entrypoint.guess_can_open("http://something/remote") + # Remote URLs without extensions are no longer claimed (stricter detection) + assert not entrypoint.guess_can_open("http://something/remote") + # Remote URLs with netCDF extensions are claimed + assert entrypoint.guess_can_open("http://something/remote.nc") assert entrypoint.guess_can_open("something-local.nc") assert entrypoint.guess_can_open("something-local.nc4") assert entrypoint.guess_can_open("something-local.cdf") @@ -7202,6 +7205,10 @@ def test_scipy_entrypoint(tmp_path: Path) -> None: assert entrypoint.guess_can_open("something-local.nc.gz") assert not entrypoint.guess_can_open("not-found-and-no-extension") assert not entrypoint.guess_can_open(b"not-a-netcdf-file") + # Should not claim .gz files that aren't netCDF + assert not entrypoint.guess_can_open("something.zarr.gz") + assert not entrypoint.guess_can_open("something.tar.gz") + assert not entrypoint.guess_can_open("something.txt.gz") @requires_h5netcdf @@ -7252,75 +7259,73 @@ def test_zarr_entrypoint(tmp_path: Path) -> None: assert not entrypoint.guess_can_open("something.zarr.txt") +@requires_h5netcdf @requires_netCDF4 @requires_pydap @requires_zarr def test_remote_url_backend_auto_detection() -> None: """ - Test that remote URLs are correctly claimed by appropriate backends. + Test that remote URLs are correctly selected by the backend resolution system. - This tests the fix for issue where netCDF4 and pydap backends were + This tests the fix for issue where netCDF4, h5netcdf, and pydap backends were claiming ALL remote URLs, preventing remote Zarr stores from being auto-detected. See: https://github.com/pydata/xarray/issues/10801 """ - from xarray.backends.netCDF4_ import NetCDF4BackendEntrypoint - from xarray.backends.pydap_ import PydapBackendEntrypoint - from xarray.backends.zarr import ZarrBackendEntrypoint - - netcdf4_entrypoint = NetCDF4BackendEntrypoint() - pydap_entrypoint = PydapBackendEntrypoint() - zarr_entrypoint = ZarrBackendEntrypoint() - - # Remote Zarr URLs should be claimed by Zarr backend, not netCDF4/pydap - remote_zarr_urls = [ - "https://example.com/store.zarr", - "http://example.com/data.zarr/", - "s3://bucket/path/to/data.zarr", + from xarray.backends.plugins import guess_engine + + # Test cases: (url, expected_backend) + test_cases = [ + # Remote Zarr URLs + ("https://example.com/store.zarr", "zarr"), + ("http://example.com/data.zarr/", "zarr"), + ("s3://bucket/path/to/data.zarr", "zarr"), + # Remote netCDF URLs (non-DAP) - h5netcdf wins (first in order) + ("https://example.com/file.nc", "h5netcdf"), + ("http://example.com/data.nc4", "h5netcdf"), + ("https://example.com/test.cdf", "h5netcdf"), + ("https://example.com/data.nc?var=temperature&time=0", "h5netcdf"), + # DAP URLs with query parameters - h5netcdf wins (has .nc4 ext, first in order) + ( + "http://test.opendap.org/opendap/dap4/StaggeredGrid.nc4?dap4.ce=/time[0:1:0]", + "h5netcdf", + ), + # DAP URLs without extensions - pydap wins + ("dap2://opendap.earthdata.nasa.gov/collections/dataset", "pydap"), + ("dap4://opendap.earthdata.nasa.gov/collections/dataset", "pydap"), + ("DAP2://example.com/dataset", "pydap"), # uppercase scheme + ("DAP4://example.com/dataset", "pydap"), # uppercase scheme + ("https://example.com/services/DAP2/dataset", "pydap"), # uppercase in path + # DAP URLs with .nc extensions - h5netcdf wins (first in order) + ("http://test.opendap.org/opendap/dap4/StaggeredGrid.nc4", "h5netcdf"), + ("https://example.com/DAP4/data.nc", "h5netcdf"), + ("http://example.com/data/Dap4/file.nc", "h5netcdf"), + ("s3://bucket/path/to/data.nc", "h5netcdf"), ] - for url in remote_zarr_urls: - assert zarr_entrypoint.guess_can_open(url), f"Zarr should claim {url}" - assert not netcdf4_entrypoint.guess_can_open(url), ( - f"NetCDF4 should not claim {url}" + for url, expected_backend in test_cases: + engine = guess_engine(url) + assert engine == expected_backend, ( + f"URL {url!r} should select {expected_backend!r} but got {engine!r}" ) - assert not pydap_entrypoint.guess_can_open(url), f"Pydap should not claim {url}" - # Remote netCDF URLs with extensions should be claimed by netCDF4, not Zarr - remote_netcdf_urls_with_ext = [ - "https://example.com/file.nc", - "http://example.com/data.nc4", - "https://example.com/test.cdf", + # URLs that should raise ValueError (no backend can open them) + invalid_urls = [ + "http://test.opendap.org/opendap/data/nc/coads_climatology.nc.dap", # .dap suffix + "https://example.com/data.dap", # .dap suffix + "http://opendap.example.com/data", # no extension, no DAP indicators + "https://test.opendap.org/dataset", # no extension, no DAP indicators ] - for url in remote_netcdf_urls_with_ext: - assert not zarr_entrypoint.guess_can_open(url), f"Zarr should not claim {url}" - assert netcdf4_entrypoint.guess_can_open(url), f"NetCDF4 should claim {url}" - - # OPeNDAP endpoints (no extension) should be claimed by both netCDF4 and pydap - opendap_urls = [ - "http://opendap.example.com/data", - "https://test.opendap.org/dataset", - ] - - for url in opendap_urls: - assert not zarr_entrypoint.guess_can_open(url), f"Zarr should not claim {url}" - assert netcdf4_entrypoint.guess_can_open(url), f"NetCDF4 should claim {url}" - assert pydap_entrypoint.guess_can_open(url), f"Pydap should claim {url}" - - # Other file types should not be claimed - other_urls = [ - "https://example.com/data.zip", - "https://example.com/data.tar.gz", - ] - - for url in other_urls: - assert not zarr_entrypoint.guess_can_open(url), f"Zarr should not claim {url}" - assert not netcdf4_entrypoint.guess_can_open(url), ( - f"NetCDF4 should not claim {url}" - ) - assert not pydap_entrypoint.guess_can_open(url), f"Pydap should not claim {url}" + for url in invalid_urls: + try: + engine = guess_engine(url) + raise AssertionError( + f"URL {url!r} should not be claimed by any backend, but {engine!r} claimed it" + ) + except ValueError: + pass # Expected @requires_netCDF4 From ef3e07c0b17adbd11174f145b66a1b77e154a8c4 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Fri, 3 Oct 2025 14:24:28 -0400 Subject: [PATCH 7/9] no query params for h5netcdf --- xarray/backends/h5netcdf_.py | 8 +------- xarray/tests/test_backends.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 1801ab9b6f4..4ea29014a68 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -462,19 +462,13 @@ class H5netcdfBackendEntrypoint(BackendEntrypoint): supports_groups = True def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: - from xarray.core.utils import is_remote_uri, strip_uri_params - filename_or_obj = _normalize_filename_or_obj(filename_or_obj) magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) if magic_number is not None: return magic_number.startswith(b"\211HDF\r\n\032\n") if isinstance(filename_or_obj, str | os.PathLike): - path = str(filename_or_obj) - # For remote URIs, strip query parameters and fragments before checking extension - if isinstance(filename_or_obj, str) and is_remote_uri(path): - path = strip_uri_params(path) - _, ext = os.path.splitext(path) + _, ext = os.path.splitext(str(filename_or_obj)) return ext in {".nc", ".nc4", ".cdf"} return False diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 2feb31504b2..5a420aed523 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -7281,15 +7281,18 @@ def test_remote_url_backend_auto_detection() -> None: ("https://example.com/store.zarr", "zarr"), ("http://example.com/data.zarr/", "zarr"), ("s3://bucket/path/to/data.zarr", "zarr"), - # Remote netCDF URLs (non-DAP) - h5netcdf wins (first in order) + # Remote netCDF URLs (non-DAP) - h5netcdf wins (first in order, no query params) ("https://example.com/file.nc", "h5netcdf"), ("http://example.com/data.nc4", "h5netcdf"), ("https://example.com/test.cdf", "h5netcdf"), - ("https://example.com/data.nc?var=temperature&time=0", "h5netcdf"), - # DAP URLs with query parameters - h5netcdf wins (has .nc4 ext, first in order) + ("s3://bucket/path/to/data.nc", "h5netcdf"), + # Remote netCDF URLs with query params - netcdf4 wins + # Note: Query params are typically indicative of DAP URLs (e.g., OPeNDAP constraint expressions), + # so we prefer netcdf4 (which has DAP support) over h5netcdf (which doesn't) + ("https://example.com/data.nc?var=temperature&time=0", "netcdf4"), ( "http://test.opendap.org/opendap/dap4/StaggeredGrid.nc4?dap4.ce=/time[0:1:0]", - "h5netcdf", + "netcdf4", ), # DAP URLs without extensions - pydap wins ("dap2://opendap.earthdata.nasa.gov/collections/dataset", "pydap"), @@ -7297,11 +7300,10 @@ def test_remote_url_backend_auto_detection() -> None: ("DAP2://example.com/dataset", "pydap"), # uppercase scheme ("DAP4://example.com/dataset", "pydap"), # uppercase scheme ("https://example.com/services/DAP2/dataset", "pydap"), # uppercase in path - # DAP URLs with .nc extensions - h5netcdf wins (first in order) + # DAP URLs with .nc extensions (no query params) - h5netcdf wins (first in order) ("http://test.opendap.org/opendap/dap4/StaggeredGrid.nc4", "h5netcdf"), ("https://example.com/DAP4/data.nc", "h5netcdf"), ("http://example.com/data/Dap4/file.nc", "h5netcdf"), - ("s3://bucket/path/to/data.nc", "h5netcdf"), ] for url, expected_backend in test_cases: From c07e7ea03d97a4eef67ebb61c6508111a8838698 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Fri, 3 Oct 2025 14:41:00 -0400 Subject: [PATCH 8/9] scipy no urls --- xarray/backends/scipy_.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 59ed3743bbc..c7e4956820c 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -335,7 +335,7 @@ class ScipyBackendEntrypoint(BackendEntrypoint): not available. It has the advantage that is is a lightweight engine that has no system requirements (unlike netcdf4 and h5netcdf). - Additionally it can open gizp compressed (".gz") files. + Additionally it can open gzip compressed (".gz") files. For more information about the underlying library, visit: https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.netcdf_file.html @@ -354,6 +354,8 @@ def guess_can_open( self, filename_or_obj: T_PathFileOrDataStore, ) -> bool: + from xarray.core.utils import is_remote_uri + filename_or_obj = _normalize_filename_or_obj(filename_or_obj) magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) if magic_number is not None and magic_number.startswith(b"\x1f\x8b"): @@ -365,6 +367,10 @@ def guess_can_open( if isinstance(filename_or_obj, str | os.PathLike): from pathlib import Path + # scipy can only handle local files + if isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj): + return False + suffix = "".join(Path(filename_or_obj).suffixes) return suffix in {".nc", ".cdf", ".nc.gz"} From 9cf669be5e3a6b5cd866a8de22fba719747d6521 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Mon, 6 Oct 2025 13:01:15 -0400 Subject: [PATCH 9/9] don't try to read magic numbers for remote uris --- xarray/backends/h5netcdf_.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 4ea29014a68..d685a47b977 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -462,10 +462,16 @@ class H5netcdfBackendEntrypoint(BackendEntrypoint): supports_groups = True def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: + from xarray.core.utils import is_remote_uri + filename_or_obj = _normalize_filename_or_obj(filename_or_obj) - magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) - if magic_number is not None: - return magic_number.startswith(b"\211HDF\r\n\032\n") + + # Try to read magic number for local files only + is_remote = isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj) + if not is_remote: + magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) + if magic_number is not None: + return magic_number.startswith(b"\211HDF\r\n\032\n") if isinstance(filename_or_obj, str | os.PathLike): _, ext = os.path.splitext(str(filename_or_obj))