diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index f56b2bc1d1c..eff54fe469e 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -37,7 +37,6 @@ dependencies: - pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - pydap - - pydap-server - pytest - pytest-asyncio - pytest-cov diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b7093c0043f..c2435db8a1c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -13,6 +13,9 @@ v2025.10.1 (unreleased) New Features ~~~~~~~~~~~~ +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset` or :py:func:`open_datatree` to download dap4 (opendap) data (:issue:`10628`, :pull:`10629`). + ``batch=True|False`` is a new ``backend_kwarg`` that enables downloading multiple arrays in a single response. In addition, ``checksums`` is added as an optional argument passed to the ``pydap`` backend. + By `Miguel Jimenez-Urias <https://github.com/Mikejmnez>`_. Breaking changes ~~~~~~~~~~~~~~~~ @@ -41,6 +44,7 @@ v2025.10.0 (October 6, 2025) This release reverts a breaking change to Xarray's preferred netCDF backend. 
+ Breaking changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 4fbfe8ee210..b29b92abdeb 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -35,8 +35,10 @@ class PydapArrayWrapper(BackendArray): - def __init__(self, array): + def __init__(self, array, batch=None, checksums=True): self.array = array + self._batch = batch + self._checksums = checksums @property def shape(self) -> tuple[int, ...]: @@ -52,13 +54,19 @@ def __getitem__(self, key): ) def _getitem(self, key): - result = robust_getitem(self.array, key, catch=ValueError) - # in some cases, pydap doesn't squeeze axes automatically like numpy - result = np.asarray(result) - axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) - if result.ndim + len(axis) != self.array.ndim and axis: - result = np.squeeze(result, axis) + if self._batch and hasattr(self.array, "dataset"): + # True only for pydap>3.5.5 + from pydap.client import data_check, get_batch_data + dataset = self.array.dataset + get_batch_data(self.array, checksums=self._checksums, key=key) + result = data_check(np.asarray(dataset[self.array.id].data), key) + else: + result = robust_getitem(self.array, key, catch=ValueError) + result = np.asarray(result.data) + axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) + if result.ndim + len(axis) != self.array.ndim and axis: + result = np.squeeze(result, axis) return result @@ -81,7 +89,15 @@ class PydapDataStore(AbstractDataStore): be useful if the netCDF4 library is not available. 
""" - def __init__(self, dataset, group=None): + def __init__( + self, + dataset, + group=None, + session=None, + batch=None, + protocol=None, + checksums=True, + ): """ Parameters ---------- @@ -91,6 +107,9 @@ def __init__(self, dataset, group=None): """ self.dataset = dataset self.group = group + self._batch = batch + self._protocol = protocol + self._checksums = checksums # true by default @classmethod def open( @@ -103,6 +122,8 @@ def open( timeout=None, verify=None, user_charset=None, + batch=None, + checksums=True, ): from pydap.client import open_url from pydap.net import DEFAULT_TIMEOUT @@ -117,6 +138,7 @@ def open( DeprecationWarning, ) output_grid = False # new default behavior + kwargs = { "url": url, "application": application, @@ -132,22 +154,45 @@ def open( elif hasattr(url, "ds"): # pydap dataset dataset = url.ds - args = {"dataset": dataset} + args = {"dataset": dataset, "checksums": checksums} if group: - # only then, change the default args["group"] = group + if url.startswith(("http", "dap2")): + args["protocol"] = "dap2" + elif url.startswith("dap4"): + args["protocol"] = "dap4" + if batch: + args["batch"] = batch return cls(**args) def open_store_variable(self, var): - data = indexing.LazilyIndexedArray(PydapArrayWrapper(var)) - try: + if hasattr(var, "dims"): dimensions = [ dim.split("/")[-1] if dim.startswith("/") else dim for dim in var.dims ] - except AttributeError: + else: # GridType does not have a dims attribute - instead get `dimensions` # see https://github.com/pydap/pydap/issues/485 dimensions = var.dimensions + if ( + self._protocol == "dap4" + and var.name in dimensions + and hasattr(var, "dataset") # only True for pydap>3.5.5 + ): + if not var.dataset._batch_mode: + # for dap4, always batch all dimensions at once + var.dataset.enable_batch_mode() + data_array = self._get_data_array(var) + data = indexing.LazilyIndexedArray(data_array) + if not self._batch and var.dataset._batch_mode: + # if `batch=False``, restore it for all other 
variables + var.dataset.disable_batch_mode() + else: + # all non-dimension variables + data = indexing.LazilyIndexedArray( + PydapArrayWrapper(var, self._batch, self._checksums) + ) + return Variable(dimensions, data, var.attributes) def get_variables(self): @@ -165,6 +210,7 @@ def get_variables(self): # check the key is not a BaseType or GridType if not isinstance(self.ds[var], GroupType) ] + return FrozenDict((k, self.open_store_variable(self.ds[k])) for k in _vars) def get_attrs(self): @@ -176,18 +222,30 @@ def get_attrs(self): "libdap", "invocation", "dimensions", + "path", + "Maps", ) - attrs = self.ds.attributes - list(map(attrs.pop, opendap_attrs, [None] * 6)) + attrs = dict(self.ds.attributes) + list(map(attrs.pop, opendap_attrs, [None] * 8)) return Frozen(attrs) def get_dimensions(self): - return Frozen(self.ds.dimensions) + return Frozen(sorted(self.ds.dimensions)) @property def ds(self): return get_group(self.dataset, self.group) + def _get_data_array(self, var): + """gets dimension data all at once""" + from pydap.client import get_batch_data + + if not var._is_data_loaded(): + # data has not been deserialized yet + # runs only once per store/hierarchy + get_batch_data(var, checksums=self._checksums) + return self.dataset[var.id].data + class PydapBackendEntrypoint(BackendEntrypoint): """ @@ -231,6 +289,8 @@ def open_dataset( timeout=None, verify=None, user_charset=None, + batch=None, + checksums=True, ) -> Dataset: store = PydapDataStore.open( url=filename_or_obj, @@ -241,6 +301,8 @@ def open_dataset( timeout=timeout, verify=verify, user_charset=user_charset, + batch=batch, + checksums=checksums, ) store_entrypoint = StoreBackendEntrypoint() with close_on_error(store): @@ -273,6 +335,8 @@ def open_datatree( timeout=None, verify=None, user_charset=None, + batch=None, + checksums=True, ) -> DataTree: groups_dict = self.open_groups_as_dict( filename_or_obj, @@ -285,10 +349,12 @@ def open_datatree( decode_timedelta=decode_timedelta, group=group, 
application=None, - session=None, - timeout=None, - verify=None, - user_charset=None, + session=session, + timeout=timeout, + verify=verify, + user_charset=user_charset, + batch=batch, + checksums=checksums, ) return datatree_from_dict_with_io_cleanup(groups_dict) @@ -310,6 +376,8 @@ def open_groups_as_dict( timeout=None, verify=None, user_charset=None, + batch=None, + checksums=True, ) -> dict[str, Dataset]: from xarray.core.treenode import NodePath @@ -321,6 +389,8 @@ def open_groups_as_dict( timeout=timeout, verify=verify, user_charset=user_charset, + batch=batch, + checksums=checksums, ) # Check for a group and make it a parent if it exists diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 996644e5c16..ac422fb04bc 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6473,6 +6473,58 @@ def test_session(self) -> None: ) +@requires_pydap +@network +@pytest.mark.parametrize("protocol", ["dap2", "dap4"]) +@pytest.mark.parametrize("batch", [False, True]) +def test_batchdap4_downloads(tmpdir, protocol, batch) -> None: + """Test that in dap4, all dimensions are downloaded at once""" + import pydap + from pydap.net import create_session + + _version_ = Version(pydap.__version__) + # Create a session with pre-set params in pydap backend, to cache urls + cache_name = tmpdir / "debug" + session = create_session(use_cache=True, cache_kwargs={"cache_name": cache_name}) + session.cache.clear() + url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" + + if protocol == "dap4": + ds = open_dataset( + url.replace("https", protocol), + engine="pydap", + session=session, + decode_times=False, + batch=batch, + ) + if _version_ > Version("3.5.5"): + # total downloads are: + # 1 dmr + 1 dap (dimensions) + assert len(session.cache.urls()) == 2 + # now load the rest of the variables + ds.load() + if batch: + # all non-dimensions are downloaded in a single https request + assert 
len(session.cache.urls()) == 2 + 1 + if not batch: + # each non-dimension array is downloaded with an individual + # https requests + assert len(session.cache.urls()) == 2 + 4 + else: + assert len(session.cache.urls()) == 4 + ds.load() + assert len(session.cache.urls()) == 4 + 4 + elif protocol == "dap2": + ds = open_dataset( + url.replace("https", protocol), + engine="pydap", + session=session, + decode_times=False, + ) + # das + dds + 3 dods urls + assert len(session.cache.urls()) == 5 + + class TestEncodingInvalid: def test_extract_nc4_variable_encoding(self) -> None: var = xr.Variable(("x",), [1, 2, 3], {}, {"foo": "bar"}) diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 6b15e74c2e9..fc58db1b6e8 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -9,6 +9,7 @@ import numpy as np import pytest +from packaging.version import Version import xarray as xr from xarray import DataTree, load_datatree, open_datatree, open_groups @@ -613,7 +614,7 @@ def test_open_groups(self, url=unaligned_datatree_url) -> None: ) as expected: assert_identical(unaligned_dict_of_datasets["/Group1/subgroup1"], expected) - def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: + def test_inherited_coords(self, tmpdir, url=simplegroup_datatree_url) -> None: """Test that `open_datatree` inherits coordinates from root tree. This particular h5 file is a test file that inherits the time coordinate from the root @@ -639,7 +640,19 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: │ Temperature (time, Z, Y, X) float32 ... | Salinity (time, Z, Y, X) float32 ... 
""" - tree = open_datatree(url, engine=self.engine) + import pydap + from pydap.net import create_session + + # Create a session with pre-set retry params in pydap backend, to cache urls + cache_name = tmpdir / "debug" + session = create_session( + use_cache=True, cache_kwargs={"cache_name": cache_name} + ) + session.cache.clear() + + _version_ = Version(pydap.__version__) + + tree = open_datatree(url, engine=self.engine, session=session) assert set(tree.dims) == {"time", "Z", "nv"} assert tree["/SimpleGroup"].coords["time"].dims == ("time",) assert tree["/SimpleGroup"].coords["Z"].dims == ("Z",) @@ -650,6 +663,13 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: list(expected.dims) + ["Z", "nv"] ) + if _version_ > Version("3.5.5"): + # Total downloads are: 1 dmr, + 1 dap url for all dimensions across groups per group + assert len(session.cache.urls()) == 3 + else: + # 1 dmr + 1 dap url per dimension (total there are 4 dimension arrays) + assert len(session.cache.urls()) == 5 + def test_open_groups_to_dict(self, url=all_aligned_child_nodes_url) -> None: aligned_dict_of_datasets = open_groups(url, engine=self.engine) aligned_dt = DataTree.from_dict(aligned_dict_of_datasets)