Skip to content
12 changes: 11 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,16 @@

.. _whats-new:

Bug fixes
~~~~~~~~~

- ``netcdf4`` and ``pydap`` backends now use stricter URL detection to avoid incorrectly claiming
remote URLs. The ``pydap`` backend now only claims URLs with explicit DAP protocol indicators
(``dap2://`` or ``dap4://`` schemes, or ``/dap2/`` or ``/dap4/`` in the URL path). This prevents
both backends from claiming remote Zarr stores and other non-DAP URLs without an explicit
``engine=`` argument. (:pull:`10804`). By `Ian Hunt-Isaak <https://github.com/ianhi>`_.


What's New
==========

Expand All @@ -27,12 +37,12 @@ Breaking changes

Bug fixes
~~~~~~~~~

- Fix error raised when writing scalar variables to Zarr with ``region={}``
(:pull:`10796`).
By `Stephan Hoyer <https://github.com/shoyer>`_.



.. _whats-new.2025.09.1:

v2025.09.1 (September 29, 2025)
Expand Down
14 changes: 10 additions & 4 deletions xarray/backends/h5netcdf_.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,13 +462,19 @@ class H5netcdfBackendEntrypoint(BackendEntrypoint):
supports_groups = True

def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool:
from xarray.core.utils import is_remote_uri

filename_or_obj = _normalize_filename_or_obj(filename_or_obj)
magic_number = try_read_magic_number_from_file_or_path(filename_or_obj)
if magic_number is not None:
return magic_number.startswith(b"\211HDF\r\n\032\n")

# Try to read magic number for local files only
is_remote = isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj)
if not is_remote:
magic_number = try_read_magic_number_from_file_or_path(filename_or_obj)
if magic_number is not None:
return magic_number.startswith(b"\211HDF\r\n\032\n")

if isinstance(filename_or_obj, str | os.PathLike):
_, ext = os.path.splitext(filename_or_obj)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intentionally not stripping any query params that might be present in dap query so that h5netcdf does not claim to be able to open it, as it's my undersstanding that it cannot

_, ext = os.path.splitext(str(filename_or_obj))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this need casting with str? the docs for os.path.splitext note that it accepts path-like objects

return ext in {".nc", ".nc4", ".cdf"}

return False
Expand Down
39 changes: 27 additions & 12 deletions xarray/backends/netCDF4_.py
Original file line number Diff line number Diff line change
Expand Up @@ -701,21 +701,36 @@ class NetCDF4BackendEntrypoint(BackendEntrypoint):
supports_groups = True

def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool:
if isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj):
return True
# Helper to check if magic number is netCDF or HDF5
def _is_netcdf_magic(magic: bytes) -> bool:
return magic.startswith((b"CDF", b"\211HDF\r\n\032\n"))

# Helper to check if extension is netCDF
def _has_netcdf_ext(path: str | os.PathLike, is_remote: bool = False) -> bool:
from xarray.core.utils import strip_uri_params

path = str(path).rstrip("/")
# For remote URIs, strip query parameters and fragments
if is_remote:
path = strip_uri_params(path)
_, ext = os.path.splitext(path)
return ext in {".nc", ".nc4", ".cdf"}

magic_number = (
bytes(filename_or_obj[:8])
if isinstance(filename_or_obj, bytes | memoryview)
else try_read_magic_number_from_path(filename_or_obj)
)
if magic_number is not None:
# netcdf 3 or HDF5
return magic_number.startswith((b"CDF", b"\211HDF\r\n\032\n"))
if isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj):
# For remote URIs, check extension (accounting for query params/fragments)
# Remote netcdf-c can handle both regular URLs and DAP URLs
return _has_netcdf_ext(filename_or_obj, is_remote=True)

if isinstance(filename_or_obj, str | os.PathLike):
_, ext = os.path.splitext(filename_or_obj)
return ext in {".nc", ".nc4", ".cdf"}
# For local paths, check magic number first, then extension
magic_number = try_read_magic_number_from_path(filename_or_obj)
if magic_number is not None:
return _is_netcdf_magic(magic_number)
# No magic number available, fallback to extension
return _has_netcdf_ext(filename_or_obj)

if isinstance(filename_or_obj, bytes | memoryview):
return _is_netcdf_magic(bytes(filename_or_obj[:8]))

return False

Expand Down
19 changes: 18 additions & 1 deletion xarray/backends/pydap_.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import os
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any

Expand Down Expand Up @@ -209,7 +210,23 @@ class PydapBackendEntrypoint(BackendEntrypoint):
url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.PydapBackendEntrypoint.html"

def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool:
return isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj)
if not isinstance(filename_or_obj, str):
return False

# Check for explicit DAP protocol indicators:
# 1. DAP scheme: dap2:// or dap4:// (case-insensitive, may not be recognized by is_remote_uri)
# 2. Remote URI with /dap2/ or /dap4/ in URL path (case-insensitive)
# Note: We intentionally do NOT check for .dap suffix as that would match
# file extensions like .dap which trigger downloads of binary data
url_lower = filename_or_obj.lower()
if url_lower.startswith(("dap2://", "dap4://")):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Mikejmnez is it ok that this will accept both DAP2:// and dap2://?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I tried it and it works with pydap. same for DAP4 v dap4

return True

# For standard remote URIs, check for DAP indicators in path
if is_remote_uri(filename_or_obj):
return "/dap2/" in url_lower or "/dap4/" in url_lower

return False

def open_dataset(
self,
Expand Down
18 changes: 13 additions & 5 deletions xarray/backends/scipy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,12 +330,12 @@ class ScipyBackendEntrypoint(BackendEntrypoint):
"""
Backend for netCDF files based on the scipy package.

It can open ".nc", ".nc4", ".cdf" and ".gz" files but will only be
It can open ".nc", ".cdf", and "nc..gz" files but will only be
selected as the default if the "netcdf4" and "h5netcdf" engines are
not available. It has the advantage that is is a lightweight engine
that has no system requirements (unlike netcdf4 and h5netcdf).

Additionally it can open gizp compressed (".gz") files.
Additionally it can open gzip compressed (".gz") files.

For more information about the underlying library, visit:
https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.netcdf_file.html
Expand All @@ -347,13 +347,15 @@ class ScipyBackendEntrypoint(BackendEntrypoint):
backends.H5netcdfBackendEntrypoint
"""

description = "Open netCDF files (.nc, .nc4, .cdf and .gz) using scipy in Xarray"
description = "Open netCDF files (.nc, .cdf and .nc.gz) using scipy in Xarray"
url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.ScipyBackendEntrypoint.html"

def guess_can_open(
self,
filename_or_obj: T_PathFileOrDataStore,
) -> bool:
from xarray.core.utils import is_remote_uri

filename_or_obj = _normalize_filename_or_obj(filename_or_obj)
magic_number = try_read_magic_number_from_file_or_path(filename_or_obj)
if magic_number is not None and magic_number.startswith(b"\x1f\x8b"):
Expand All @@ -363,8 +365,14 @@ def guess_can_open(
return magic_number.startswith(b"CDF")

if isinstance(filename_or_obj, str | os.PathLike):
_, ext = os.path.splitext(filename_or_obj)
return ext in {".nc", ".nc4", ".cdf", ".gz"}
from pathlib import Path

# scipy can only handle local files
if isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj):
return False

suffix = "".join(Path(filename_or_obj).suffixes)
return suffix in {".nc", ".cdf", ".nc.gz"}

return False

Expand Down
34 changes: 34 additions & 0 deletions xarray/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -728,6 +728,40 @@ def is_remote_uri(path: str) -> bool:
return bool(re.search(r"^[a-z][a-z0-9]*(\://|\:\:)", path))


def strip_uri_params(uri: str) -> str:
"""Strip query parameters and fragments from a URI.

This is useful for extracting the file extension from URLs that
contain query parameters (e.g., OPeNDAP constraint expressions).

Parameters
----------
uri : str
The URI to strip

Returns
-------
str
The URI without query parameters (?) or fragments (#)

Examples
--------
>>> strip_uri_params("http://example.com/file.nc?var=temp&time=0")
'http://example.com/file.nc'
>>> strip_uri_params("http://example.com/file.nc#section")
'http://example.com/file.nc'
>>> strip_uri_params("/local/path/file.nc")
'/local/path/file.nc'
"""
from urllib.parse import urlsplit, urlunsplit

# Use urlsplit to properly parse the URI
# This handles both absolute URLs and relative paths
parsed = urlsplit(uri)
# Reconstruct without query and fragment using urlunsplit
return urlunsplit((parsed.scheme, parsed.netloc, parsed.path, "", ""))


def read_magic_number_from_file(filename_or_obj, count=8) -> bytes:
# check byte header to determine file type
if not isinstance(filename_or_obj, io.IOBase):
Expand Down
80 changes: 79 additions & 1 deletion xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -7167,7 +7167,10 @@ def test_netcdf4_entrypoint(tmp_path: Path) -> None:
_check_guess_can_open_and_open(entrypoint, path, engine="netcdf4", expected=ds)
_check_guess_can_open_and_open(entrypoint, str(path), engine="netcdf4", expected=ds)

assert entrypoint.guess_can_open("http://something/remote")
# Remote URLs without extensions are no longer claimed (stricter detection)
assert not entrypoint.guess_can_open("http://something/remote")
# Remote URLs with netCDF extensions are claimed
assert entrypoint.guess_can_open("http://something/remote.nc")
assert entrypoint.guess_can_open("something-local.nc")
assert entrypoint.guess_can_open("something-local.nc4")
assert entrypoint.guess_can_open("something-local.cdf")
Expand Down Expand Up @@ -7210,6 +7213,10 @@ def test_scipy_entrypoint(tmp_path: Path) -> None:
assert entrypoint.guess_can_open("something-local.nc.gz")
assert not entrypoint.guess_can_open("not-found-and-no-extension")
assert not entrypoint.guess_can_open(b"not-a-netcdf-file")
# Should not claim .gz files that aren't netCDF
assert not entrypoint.guess_can_open("something.zarr.gz")
assert not entrypoint.guess_can_open("something.tar.gz")
assert not entrypoint.guess_can_open("something.txt.gz")


@requires_h5netcdf
Expand Down Expand Up @@ -7260,6 +7267,77 @@ def test_zarr_entrypoint(tmp_path: Path) -> None:
assert not entrypoint.guess_can_open("something.zarr.txt")


@requires_h5netcdf
@requires_netCDF4
@requires_pydap
@requires_zarr
def test_remote_url_backend_auto_detection() -> None:
"""
Test that remote URLs are correctly selected by the backend resolution system.

This tests the fix for issue where netCDF4, h5netcdf, and pydap backends were
claiming ALL remote URLs, preventing remote Zarr stores from being
auto-detected.

See: https://github.com/pydata/xarray/issues/10801
"""
from xarray.backends.plugins import guess_engine

# Test cases: (url, expected_backend)
test_cases = [
# Remote Zarr URLs
("https://example.com/store.zarr", "zarr"),
("http://example.com/data.zarr/", "zarr"),
("s3://bucket/path/to/data.zarr", "zarr"),
# Remote netCDF URLs (non-DAP) - h5netcdf wins (first in order, no query params)
("https://example.com/file.nc", "h5netcdf"),
("http://example.com/data.nc4", "h5netcdf"),
("https://example.com/test.cdf", "h5netcdf"),
("s3://bucket/path/to/data.nc", "h5netcdf"),
Comment on lines +7292 to +7296
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this will need updating after #10817, sorry!

# Remote netCDF URLs with query params - netcdf4 wins
# Note: Query params are typically indicative of DAP URLs (e.g., OPeNDAP constraint expressions),
# so we prefer netcdf4 (which has DAP support) over h5netcdf (which doesn't)
("https://example.com/data.nc?var=temperature&time=0", "netcdf4"),
(
"http://test.opendap.org/opendap/dap4/StaggeredGrid.nc4?dap4.ce=/time[0:1:0]",
"netcdf4",
),
# DAP URLs without extensions - pydap wins
("dap2://opendap.earthdata.nasa.gov/collections/dataset", "pydap"),
("dap4://opendap.earthdata.nasa.gov/collections/dataset", "pydap"),
("DAP2://example.com/dataset", "pydap"), # uppercase scheme
("DAP4://example.com/dataset", "pydap"), # uppercase scheme
("https://example.com/services/DAP2/dataset", "pydap"), # uppercase in path
# DAP URLs with .nc extensions (no query params) - h5netcdf wins (first in order)
("http://test.opendap.org/opendap/dap4/StaggeredGrid.nc4", "h5netcdf"),
("https://example.com/DAP4/data.nc", "h5netcdf"),
("http://example.com/data/Dap4/file.nc", "h5netcdf"),
]

for url, expected_backend in test_cases:
engine = guess_engine(url)
assert engine == expected_backend, (
f"URL {url!r} should select {expected_backend!r} but got {engine!r}"
)

# URLs that should raise ValueError (no backend can open them)
invalid_urls = [
"http://test.opendap.org/opendap/data/nc/coads_climatology.nc.dap", # .dap suffix
"https://example.com/data.dap", # .dap suffix
"http://opendap.example.com/data", # no extension, no DAP indicators
"https://test.opendap.org/dataset", # no extension, no DAP indicators
]

for url in invalid_urls:
try:
engine = guess_engine(url)
raise AssertionError(
f"URL {url!r} should not be claimed by any backend, but {engine!r} claimed it"
)
except ValueError:
pass # Expected
Comment on lines +7332 to +7338
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use pytest.raises(ValueError) for this sort of thing. Ideally, also check the error messages to make sure it's the right error being raised



@requires_netCDF4
@pytest.mark.parametrize("str_type", (str, np.str_))
def test_write_file_from_np_str(str_type: type[str | np.str_], tmpdir: str) -> None:
Expand Down
Loading