Skip to content

Commit d2334e4

Browse files
committed
stricter URL detection for netcdf/dap
1 parent 60c1158 commit d2334e4

File tree

6 files changed

+146
-80
lines changed

6 files changed

+146
-80
lines changed

doc/whats-new.rst

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,11 @@ Deprecations
2424

2525
Bug fixes
2626
~~~~~~~~~
27-
- ``netcdf`` and ``pydap`` engines no longer incorrectly claim to read all remote URLs preventing
28-
the ``zarr`` backend from reading remote zarr stores without an explicit ``engine=`` argument.
29-
(:pull:`10804`). By `Ian Hunt-Isaak <https://github.com/ianhi>`_.
27+
- ``netcdf4`` and ``pydap`` backends now use stricter URL detection to avoid incorrectly claiming
28+
remote URLs. The ``pydap`` backend now only claims URLs with explicit DAP protocol indicators
29+
(``dap2://`` or ``dap4://`` schemes, or ``/dap2/`` or ``/dap4/`` in the URL path). This prevents
30+
both backends from claiming remote Zarr stores and other non-DAP URLs without an explicit
31+
``engine=`` argument. (:pull:`10804`). By `Ian Hunt-Isaak <https://github.com/ianhi>`_.
3032

3133
Documentation
3234
~~~~~~~~~~~~~

xarray/backends/h5netcdf_.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -462,13 +462,19 @@ class H5netcdfBackendEntrypoint(BackendEntrypoint):
462462
supports_groups = True
463463

464464
def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool:
465+
from xarray.core.utils import is_remote_uri, strip_uri_params
466+
465467
filename_or_obj = _normalize_filename_or_obj(filename_or_obj)
466468
magic_number = try_read_magic_number_from_file_or_path(filename_or_obj)
467469
if magic_number is not None:
468470
return magic_number.startswith(b"\211HDF\r\n\032\n")
469471

470472
if isinstance(filename_or_obj, str | os.PathLike):
471-
_, ext = os.path.splitext(filename_or_obj)
473+
path = str(filename_or_obj)
474+
# For remote URIs, strip query parameters and fragments before checking extension
475+
if isinstance(filename_or_obj, str) and is_remote_uri(path):
476+
path = strip_uri_params(path)
477+
_, ext = os.path.splitext(path)
472478
return ext in {".nc", ".nc4", ".cdf"}
473479

474480
return False

xarray/backends/netCDF4_.py

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -701,26 +701,36 @@ class NetCDF4BackendEntrypoint(BackendEntrypoint):
701701
supports_groups = True
702702

703703
def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool:
704+
# Helper to check if magic number is netCDF or HDF5
705+
def _is_netcdf_magic(magic: bytes) -> bool:
706+
return magic.startswith((b"CDF", b"\211HDF\r\n\032\n"))
707+
708+
# Helper to check if extension is netCDF
709+
def _has_netcdf_ext(path: str | os.PathLike, is_remote: bool = False) -> bool:
710+
from xarray.core.utils import strip_uri_params
711+
712+
path = str(path).rstrip("/")
713+
# For remote URIs, strip query parameters and fragments
714+
if is_remote:
715+
path = strip_uri_params(path)
716+
_, ext = os.path.splitext(path)
717+
return ext in {".nc", ".nc4", ".cdf"}
718+
704719
if isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj):
705-
# For remote URIs, check file extension to avoid claiming non-netCDF URLs
706-
# (e.g., remote Zarr stores)
707-
_, ext = os.path.splitext(filename_or_obj.rstrip("/"))
708-
# Accept remote URIs with netCDF extensions or no extension
709-
# (OPeNDAP endpoints often have no extension)
710-
return ext in {".nc", ".nc4", ".cdf", ""}
711-
712-
magic_number = (
713-
bytes(filename_or_obj[:8])
714-
if isinstance(filename_or_obj, bytes | memoryview)
715-
else try_read_magic_number_from_path(filename_or_obj)
716-
)
717-
if magic_number is not None:
718-
# netcdf 3 or HDF5
719-
return magic_number.startswith((b"CDF", b"\211HDF\r\n\032\n"))
720+
# For remote URIs, check extension (accounting for query params/fragments)
721+
# netcdf-c can handle both regular remote URLs and DAP URLs
722+
return _has_netcdf_ext(filename_or_obj, is_remote=True)
720723

721724
if isinstance(filename_or_obj, str | os.PathLike):
722-
_, ext = os.path.splitext(filename_or_obj)
723-
return ext in {".nc", ".nc4", ".cdf"}
725+
# For local paths, check magic number first, then extension
726+
magic_number = try_read_magic_number_from_path(filename_or_obj)
727+
if magic_number is not None:
728+
return _is_netcdf_magic(magic_number)
729+
# No magic number available, fallback to extension
730+
return _has_netcdf_ext(filename_or_obj)
731+
732+
if isinstance(filename_or_obj, bytes | memoryview):
733+
return _is_netcdf_magic(bytes(filename_or_obj[:8]))
724734

725735
return False
726736

xarray/backends/pydap_.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -210,14 +210,23 @@ class PydapBackendEntrypoint(BackendEntrypoint):
210210
url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.PydapBackendEntrypoint.html"
211211

212212
def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool:
213-
if not (isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj)):
213+
if not isinstance(filename_or_obj, str):
214214
return False
215215

216-
# Check file extension to avoid claiming non-OPeNDAP URLs (e.g., remote Zarr stores)
217-
_, ext = os.path.splitext(filename_or_obj.rstrip("/"))
218-
# Pydap handles OPeNDAP endpoints, which typically have no extension or .nc/.nc4
219-
# Reject URLs with non-OPeNDAP extensions like .zarr
220-
return ext not in {".zarr", ".zip", ".tar", ".gz"}
216+
# Check for explicit DAP protocol indicators:
217+
# 1. DAP scheme: dap2:// or dap4:// (case-insensitive, may not be recognized by is_remote_uri)
218+
# 2. Remote URI with /dap2/ or /dap4/ in URL path (case-insensitive)
219+
# Note: We intentionally do NOT check for a .dap suffix, as that would match
220+
# URLs ending in .dap, which trigger downloads of binary data
221+
url_lower = filename_or_obj.lower()
222+
if url_lower.startswith(("dap2://", "dap4://")):
223+
return True
224+
225+
# For standard remote URIs, check for DAP indicators in path
226+
if is_remote_uri(filename_or_obj):
227+
return "/dap2/" in url_lower or "/dap4/" in url_lower
228+
229+
return False
221230

222231
def open_dataset(
223232
self,

xarray/core/utils.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -728,6 +728,40 @@ def is_remote_uri(path: str) -> bool:
728728
return bool(re.search(r"^[a-z][a-z0-9]*(\://|\:\:)", path))
729729

730730

731+
def strip_uri_params(uri: str) -> str:
732+
"""Strip query parameters and fragments from a URI.
733+
734+
This is useful for extracting the file extension from URLs that
735+
contain query parameters (e.g., OPeNDAP constraint expressions).
736+
737+
Parameters
738+
----------
739+
uri : str
740+
The URI to strip
741+
742+
Returns
743+
-------
744+
str
745+
The URI without query parameters (?) or fragments (#)
746+
747+
Examples
748+
--------
749+
>>> strip_uri_params("http://example.com/file.nc?var=temp&time=0")
750+
'http://example.com/file.nc'
751+
>>> strip_uri_params("http://example.com/file.nc#section")
752+
'http://example.com/file.nc'
753+
>>> strip_uri_params("/local/path/file.nc")
754+
'/local/path/file.nc'
755+
"""
756+
from urllib.parse import urlsplit, urlunsplit
757+
758+
# Use urlsplit to properly parse the URI
759+
# This handles both absolute URLs and relative paths
760+
parsed = urlsplit(uri)
761+
# Reconstruct without query and fragment using urlunsplit
762+
return urlunsplit((parsed.scheme, parsed.netloc, parsed.path, "", ""))
763+
764+
731765
def read_magic_number_from_file(filename_or_obj, count=8) -> bytes:
732766
# check byte header to determine file type
733767
if not isinstance(filename_or_obj, io.IOBase):

xarray/tests/test_backends.py

Lines changed: 58 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -7159,7 +7159,10 @@ def test_netcdf4_entrypoint(tmp_path: Path) -> None:
71597159
_check_guess_can_open_and_open(entrypoint, path, engine="netcdf4", expected=ds)
71607160
_check_guess_can_open_and_open(entrypoint, str(path), engine="netcdf4", expected=ds)
71617161

7162-
assert entrypoint.guess_can_open("http://something/remote")
7162+
# Remote URLs without extensions are no longer claimed (stricter detection)
7163+
assert not entrypoint.guess_can_open("http://something/remote")
7164+
# Remote URLs with netCDF extensions are claimed
7165+
assert entrypoint.guess_can_open("http://something/remote.nc")
71637166
assert entrypoint.guess_can_open("something-local.nc")
71647167
assert entrypoint.guess_can_open("something-local.nc4")
71657168
assert entrypoint.guess_can_open("something-local.cdf")
@@ -7202,6 +7205,10 @@ def test_scipy_entrypoint(tmp_path: Path) -> None:
72027205
assert entrypoint.guess_can_open("something-local.nc.gz")
72037206
assert not entrypoint.guess_can_open("not-found-and-no-extension")
72047207
assert not entrypoint.guess_can_open(b"not-a-netcdf-file")
7208+
# Should not claim .gz files that aren't netCDF
7209+
assert not entrypoint.guess_can_open("something.zarr.gz")
7210+
assert not entrypoint.guess_can_open("something.tar.gz")
7211+
assert not entrypoint.guess_can_open("something.txt.gz")
72057212

72067213

72077214
@requires_h5netcdf
@@ -7252,75 +7259,73 @@ def test_zarr_entrypoint(tmp_path: Path) -> None:
72527259
assert not entrypoint.guess_can_open("something.zarr.txt")
72537260

72547261

7262+
@requires_h5netcdf
72557263
@requires_netCDF4
72567264
@requires_pydap
72577265
@requires_zarr
72587266
def test_remote_url_backend_auto_detection() -> None:
72597267
"""
7260-
Test that remote URLs are correctly claimed by appropriate backends.
7268+
Test that remote URLs are correctly selected by the backend resolution system.
72617269
7262-
This tests the fix for issue where netCDF4 and pydap backends were
7270+
This tests the fix for the issue where netCDF4, h5netcdf, and pydap backends were
72637271
claiming ALL remote URLs, preventing remote Zarr stores from being
72647272
auto-detected.
72657273
72667274
See: https://github.com/pydata/xarray/issues/10801
72677275
"""
7268-
from xarray.backends.netCDF4_ import NetCDF4BackendEntrypoint
7269-
from xarray.backends.pydap_ import PydapBackendEntrypoint
7270-
from xarray.backends.zarr import ZarrBackendEntrypoint
7271-
7272-
netcdf4_entrypoint = NetCDF4BackendEntrypoint()
7273-
pydap_entrypoint = PydapBackendEntrypoint()
7274-
zarr_entrypoint = ZarrBackendEntrypoint()
7275-
7276-
# Remote Zarr URLs should be claimed by Zarr backend, not netCDF4/pydap
7277-
remote_zarr_urls = [
7278-
"https://example.com/store.zarr",
7279-
"http://example.com/data.zarr/",
7280-
"s3://bucket/path/to/data.zarr",
7276+
from xarray.backends.plugins import guess_engine
7277+
7278+
# Test cases: (url, expected_backend)
7279+
test_cases = [
7280+
# Remote Zarr URLs
7281+
("https://example.com/store.zarr", "zarr"),
7282+
("http://example.com/data.zarr/", "zarr"),
7283+
("s3://bucket/path/to/data.zarr", "zarr"),
7284+
# Remote netCDF URLs (non-DAP) - h5netcdf wins (first in order)
7285+
("https://example.com/file.nc", "h5netcdf"),
7286+
("http://example.com/data.nc4", "h5netcdf"),
7287+
("https://example.com/test.cdf", "h5netcdf"),
7288+
("https://example.com/data.nc?var=temperature&time=0", "h5netcdf"),
7289+
# DAP URLs with query parameters - h5netcdf wins (has .nc4 ext, first in order)
7290+
(
7291+
"http://test.opendap.org/opendap/dap4/StaggeredGrid.nc4?dap4.ce=/time[0:1:0]",
7292+
"h5netcdf",
7293+
),
7294+
# DAP URLs without extensions - pydap wins
7295+
("dap2://opendap.earthdata.nasa.gov/collections/dataset", "pydap"),
7296+
("dap4://opendap.earthdata.nasa.gov/collections/dataset", "pydap"),
7297+
("DAP2://example.com/dataset", "pydap"), # uppercase scheme
7298+
("DAP4://example.com/dataset", "pydap"), # uppercase scheme
7299+
("https://example.com/services/DAP2/dataset", "pydap"), # uppercase in path
7300+
# DAP URLs with .nc extensions - h5netcdf wins (first in order)
7301+
("http://test.opendap.org/opendap/dap4/StaggeredGrid.nc4", "h5netcdf"),
7302+
("https://example.com/DAP4/data.nc", "h5netcdf"),
7303+
("http://example.com/data/Dap4/file.nc", "h5netcdf"),
7304+
("s3://bucket/path/to/data.nc", "h5netcdf"),
72817305
]
72827306

7283-
for url in remote_zarr_urls:
7284-
assert zarr_entrypoint.guess_can_open(url), f"Zarr should claim {url}"
7285-
assert not netcdf4_entrypoint.guess_can_open(url), (
7286-
f"NetCDF4 should not claim {url}"
7307+
for url, expected_backend in test_cases:
7308+
engine = guess_engine(url)
7309+
assert engine == expected_backend, (
7310+
f"URL {url!r} should select {expected_backend!r} but got {engine!r}"
72877311
)
7288-
assert not pydap_entrypoint.guess_can_open(url), f"Pydap should not claim {url}"
72897312

7290-
# Remote netCDF URLs with extensions should be claimed by netCDF4, not Zarr
7291-
remote_netcdf_urls_with_ext = [
7292-
"https://example.com/file.nc",
7293-
"http://example.com/data.nc4",
7294-
"https://example.com/test.cdf",
7313+
# URLs that should raise ValueError (no backend can open them)
7314+
invalid_urls = [
7315+
"http://test.opendap.org/opendap/data/nc/coads_climatology.nc.dap", # .dap suffix
7316+
"https://example.com/data.dap", # .dap suffix
7317+
"http://opendap.example.com/data", # no extension, no DAP indicators
7318+
"https://test.opendap.org/dataset", # no extension, no DAP indicators
72957319
]
72967320

7297-
for url in remote_netcdf_urls_with_ext:
7298-
assert not zarr_entrypoint.guess_can_open(url), f"Zarr should not claim {url}"
7299-
assert netcdf4_entrypoint.guess_can_open(url), f"NetCDF4 should claim {url}"
7300-
7301-
# OPeNDAP endpoints (no extension) should be claimed by both netCDF4 and pydap
7302-
opendap_urls = [
7303-
"http://opendap.example.com/data",
7304-
"https://test.opendap.org/dataset",
7305-
]
7306-
7307-
for url in opendap_urls:
7308-
assert not zarr_entrypoint.guess_can_open(url), f"Zarr should not claim {url}"
7309-
assert netcdf4_entrypoint.guess_can_open(url), f"NetCDF4 should claim {url}"
7310-
assert pydap_entrypoint.guess_can_open(url), f"Pydap should claim {url}"
7311-
7312-
# Other file types should not be claimed
7313-
other_urls = [
7314-
"https://example.com/data.zip",
7315-
"https://example.com/data.tar.gz",
7316-
]
7317-
7318-
for url in other_urls:
7319-
assert not zarr_entrypoint.guess_can_open(url), f"Zarr should not claim {url}"
7320-
assert not netcdf4_entrypoint.guess_can_open(url), (
7321-
f"NetCDF4 should not claim {url}"
7322-
)
7323-
assert not pydap_entrypoint.guess_can_open(url), f"Pydap should not claim {url}"
7321+
for url in invalid_urls:
7322+
try:
7323+
engine = guess_engine(url)
7324+
raise AssertionError(
7325+
f"URL {url!r} should not be claimed by any backend, but {engine!r} claimed it"
7326+
)
7327+
except ValueError:
7328+
pass # Expected
73247329

73257330

73267331
@requires_netCDF4

0 commit comments

Comments
 (0)