-
-
Notifications
You must be signed in to change notification settings - Fork 1.2k
fix: be more cautious when guessing what a backend can open #10804
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
2af21ab
1a3e7df
d6a47b7
7ed1f0a
60c1158
d2334e4
ef3e07c
c07e7ea
017713b
9cf669b
e0e2da2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -462,13 +462,19 @@ class H5netcdfBackendEntrypoint(BackendEntrypoint): | |
supports_groups = True | ||
|
||
def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: | ||
from xarray.core.utils import is_remote_uri | ||
|
||
filename_or_obj = _normalize_filename_or_obj(filename_or_obj) | ||
magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) | ||
if magic_number is not None: | ||
return magic_number.startswith(b"\211HDF\r\n\032\n") | ||
|
||
# Try to read magic number for local files only | ||
is_remote = isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj) | ||
if not is_remote: | ||
magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) | ||
if magic_number is not None: | ||
return magic_number.startswith(b"\211HDF\r\n\032\n") | ||
|
||
if isinstance(filename_or_obj, str | os.PathLike): | ||
_, ext = os.path.splitext(filename_or_obj) | ||
_, ext = os.path.splitext(str(filename_or_obj)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this need casting with |
||
return ext in {".nc", ".nc4", ".cdf"} | ||
|
||
return False | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
from __future__ import annotations | ||
|
||
import os | ||
from collections.abc import Iterable | ||
from typing import TYPE_CHECKING, Any | ||
|
||
|
@@ -209,7 +210,23 @@ class PydapBackendEntrypoint(BackendEntrypoint): | |
url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.PydapBackendEntrypoint.html" | ||
|
||
def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: | ||
return isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj) | ||
if not isinstance(filename_or_obj, str): | ||
return False | ||
|
||
# Check for explicit DAP protocol indicators: | ||
# 1. DAP scheme: dap2:// or dap4:// (case-insensitive, may not be recognized by is_remote_uri) | ||
# 2. Remote URI with /dap2/ or /dap4/ in URL path (case-insensitive) | ||
# Note: We intentionally do NOT check for .dap suffix as that would match | ||
# file extensions like .dap which trigger downloads of binary data | ||
url_lower = filename_or_obj.lower() | ||
if url_lower.startswith(("dap2://", "dap4://")): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Mikejmnez is it ok that this will accept both There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I tried it and it works with pydap. same for DAP4 v dap4 |
||
return True | ||
|
||
# For standard remote URIs, check for DAP indicators in path | ||
if is_remote_uri(filename_or_obj): | ||
return "/dap2/" in url_lower or "/dap4/" in url_lower | ||
|
||
return False | ||
|
||
def open_dataset( | ||
self, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7167,7 +7167,10 @@ def test_netcdf4_entrypoint(tmp_path: Path) -> None: | |
_check_guess_can_open_and_open(entrypoint, path, engine="netcdf4", expected=ds) | ||
_check_guess_can_open_and_open(entrypoint, str(path), engine="netcdf4", expected=ds) | ||
|
||
assert entrypoint.guess_can_open("http://something/remote") | ||
# Remote URLs without extensions are no longer claimed (stricter detection) | ||
assert not entrypoint.guess_can_open("http://something/remote") | ||
# Remote URLs with netCDF extensions are claimed | ||
assert entrypoint.guess_can_open("http://something/remote.nc") | ||
assert entrypoint.guess_can_open("something-local.nc") | ||
assert entrypoint.guess_can_open("something-local.nc4") | ||
assert entrypoint.guess_can_open("something-local.cdf") | ||
|
@@ -7210,6 +7213,10 @@ def test_scipy_entrypoint(tmp_path: Path) -> None: | |
assert entrypoint.guess_can_open("something-local.nc.gz") | ||
assert not entrypoint.guess_can_open("not-found-and-no-extension") | ||
assert not entrypoint.guess_can_open(b"not-a-netcdf-file") | ||
# Should not claim .gz files that aren't netCDF | ||
assert not entrypoint.guess_can_open("something.zarr.gz") | ||
assert not entrypoint.guess_can_open("something.tar.gz") | ||
assert not entrypoint.guess_can_open("something.txt.gz") | ||
|
||
|
||
@requires_h5netcdf | ||
|
@@ -7260,6 +7267,77 @@ def test_zarr_entrypoint(tmp_path: Path) -> None: | |
assert not entrypoint.guess_can_open("something.zarr.txt") | ||
|
||
|
||
@requires_h5netcdf | ||
@requires_netCDF4 | ||
@requires_pydap | ||
@requires_zarr | ||
def test_remote_url_backend_auto_detection() -> None: | ||
""" | ||
Test that remote URLs are correctly selected by the backend resolution system. | ||
|
||
This tests the fix for issue where netCDF4, h5netcdf, and pydap backends were | ||
claiming ALL remote URLs, preventing remote Zarr stores from being | ||
auto-detected. | ||
|
||
See: https://github.com/pydata/xarray/issues/10801 | ||
""" | ||
from xarray.backends.plugins import guess_engine | ||
|
||
# Test cases: (url, expected_backend) | ||
test_cases = [ | ||
# Remote Zarr URLs | ||
("https://example.com/store.zarr", "zarr"), | ||
("http://example.com/data.zarr/", "zarr"), | ||
("s3://bucket/path/to/data.zarr", "zarr"), | ||
# Remote netCDF URLs (non-DAP) - h5netcdf wins (first in order, no query params) | ||
("https://example.com/file.nc", "h5netcdf"), | ||
("http://example.com/data.nc4", "h5netcdf"), | ||
("https://example.com/test.cdf", "h5netcdf"), | ||
("s3://bucket/path/to/data.nc", "h5netcdf"), | ||
Comment on lines
+7292
to
+7296
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this will need updating after #10817, sorry! |
||
# Remote netCDF URLs with query params - netcdf4 wins | ||
# Note: Query params are typically indicative of DAP URLs (e.g., OPeNDAP constraint expressions), | ||
# so we prefer netcdf4 (which has DAP support) over h5netcdf (which doesn't) | ||
("https://example.com/data.nc?var=temperature&time=0", "netcdf4"), | ||
( | ||
"http://test.opendap.org/opendap/dap4/StaggeredGrid.nc4?dap4.ce=/time[0:1:0]", | ||
"netcdf4", | ||
), | ||
# DAP URLs without extensions - pydap wins | ||
("dap2://opendap.earthdata.nasa.gov/collections/dataset", "pydap"), | ||
("dap4://opendap.earthdata.nasa.gov/collections/dataset", "pydap"), | ||
("DAP2://example.com/dataset", "pydap"), # uppercase scheme | ||
("DAP4://example.com/dataset", "pydap"), # uppercase scheme | ||
("https://example.com/services/DAP2/dataset", "pydap"), # uppercase in path | ||
# DAP URLs with .nc extensions (no query params) - h5netcdf wins (first in order) | ||
("http://test.opendap.org/opendap/dap4/StaggeredGrid.nc4", "h5netcdf"), | ||
("https://example.com/DAP4/data.nc", "h5netcdf"), | ||
("http://example.com/data/Dap4/file.nc", "h5netcdf"), | ||
] | ||
|
||
for url, expected_backend in test_cases: | ||
engine = guess_engine(url) | ||
assert engine == expected_backend, ( | ||
f"URL {url!r} should select {expected_backend!r} but got {engine!r}" | ||
) | ||
|
||
# URLs that should raise ValueError (no backend can open them) | ||
invalid_urls = [ | ||
"http://test.opendap.org/opendap/data/nc/coads_climatology.nc.dap", # .dap suffix | ||
"https://example.com/data.dap", # .dap suffix | ||
"http://opendap.example.com/data", # no extension, no DAP indicators | ||
"https://test.opendap.org/dataset", # no extension, no DAP indicators | ||
] | ||
|
||
for url in invalid_urls: | ||
try: | ||
engine = guess_engine(url) | ||
raise AssertionError( | ||
f"URL {url!r} should not be claimed by any backend, but {engine!r} claimed it" | ||
) | ||
except ValueError: | ||
pass # Expected | ||
Comment on lines
+7332
to
+7338
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use |
||
|
||
|
||
@requires_netCDF4 | ||
@pytest.mark.parametrize("str_type", (str, np.str_)) | ||
def test_write_file_from_np_str(str_type: type[str | np.str_], tmpdir: str) -> None: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Intentionally not stripping any query params that might be present in dap query so that h5netcdf does not claim to be able to open it, as it's my undersstanding that it cannot