Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
130566c
update PydapArrayWrapper to support backend batching
Mikejmnez Aug 12, 2025
6e5e2bd
update PydapDataStore to use backend logic in dap4 to batch variables…
Mikejmnez Aug 12, 2025
3b66103
pydap-server is not necessary
Mikejmnez Aug 12, 2025
6e517f7
set `batch=False` as default
Mikejmnez Aug 12, 2025
2dfd4ee
set `batch=False` as default in datatree
Mikejmnez Aug 12, 2025
b6304d0
set `batch=False` as default in open groups as dict
Mikejmnez Aug 12, 2025
5d6af4a
for flaky, install pydap from repo for now
Mikejmnez Aug 12, 2025
ef1fca0
initial tests - quantify cached url
Mikejmnez Aug 13, 2025
fe6d0aa
adds tests to datatree backend to assert multiple dimensions download…
Mikejmnez Aug 13, 2025
ba67bdb
update testing to show number of download urls
Mikejmnez Aug 13, 2025
d379f2d
simplified logic
Mikejmnez Aug 13, 2025
4d9b85d
specify cached session debug name to actually cache urls
Mikejmnez Aug 13, 2025
c8a6d72
fix for mypy
Mikejmnez Aug 13, 2025
6b21bef
user visible changes on `whats-new.rst`
Mikejmnez Aug 13, 2025
284ee1d
impose sorted to `get_dimensions` method
Mikejmnez Aug 13, 2025
1249ef1
reformat `whats-new.rst`
Mikejmnez Aug 13, 2025
4ec9b73
revert to install pydap from conda and not from repo
Mikejmnez Aug 13, 2025
20e64f1
expose checksum as user kwarg
Mikejmnez Aug 13, 2025
842728f
include `checksums` optional argument in `whats-new`
Mikejmnez Aug 13, 2025
476aa46
update to newest release of pydap via pip until conda install is avai…
Mikejmnez Aug 13, 2025
c9786f4
use requests_cache session with retry-params when 500 errors occur
Mikejmnez Aug 13, 2025
d4dd68d
update env yml file to use new pydap release via conda
Mikejmnez Aug 14, 2025
06ee7b4
let `pydap` handle exceptions/warning
Mikejmnez Aug 25, 2025
2d9c4a6
process dims at once, one per group
Mikejmnez Sep 5, 2025
53e1b82
debug
Mikejmnez Sep 9, 2025
fb314bd
revert what's new from previous commit
Mikejmnez Sep 18, 2025
adc6ff5
enable data checker for batched deserialized data
Mikejmnez Sep 18, 2025
e43459b
temporarily install from source for testing - will revert to conda in…
Mikejmnez Sep 18, 2025
742ecba
update `whats new`
Mikejmnez Sep 18, 2025
78a5c4b
update tests
Mikejmnez Sep 18, 2025
0caa288
set `batch=None` as default
Mikejmnez Sep 18, 2025
20c4186
improve handling of dims vs dimensions deprecation warning
Mikejmnez Sep 19, 2025
d2e505b
update to use latest version of pydap
Mikejmnez Sep 26, 2025
85ec17b
update import
Mikejmnez Sep 26, 2025
bf30ed3
update `whats new` docs
Mikejmnez Sep 26, 2025
01fc07c
move cache session to `tmpdir`
Mikejmnez Sep 26, 2025
87092d7
remove added functionality from whats new from newly released version
Mikejmnez Sep 30, 2025
9b8b4a6
add to `whats-new` for next release
Mikejmnez Sep 30, 2025
9d819a2
update `whats new` to describe changes in this PR
Mikejmnez Oct 6, 2025
b0826fe
fix double entry on whats new
Mikejmnez Oct 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion ci/requirements/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ dependencies:
- pre-commit
- pyarrow # pandas raises a deprecation warning without this, breaking doctests
- pydap
- pydap-server
- pytest
- pytest-asyncio
- pytest-cov
Expand Down
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ v2025.10.1 (unreleased)
New Features
~~~~~~~~~~~~

- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`).
``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in a single response. In addition, ``checksums`` is added as an optional argument to be passed to the ``pydap`` backend.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you help me understand -- why would a user not want to enable batch mode if they are using a new enough version of pydap?

By `Miguel Jimenez-Urias <https://github.com/Mikejmnez>`_.

Breaking changes
~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -41,6 +44,7 @@ v2025.10.0 (October 6, 2025)

This release reverts a breaking change to Xarray's preferred netCDF backend.


Breaking changes
~~~~~~~~~~~~~~~~

Expand Down
110 changes: 90 additions & 20 deletions xarray/backends/pydap_.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,10 @@


class PydapArrayWrapper(BackendArray):
def __init__(self, array):
def __init__(self, array, batch=None, checksums=True):
self.array = array
self._batch = batch
self._checksums = checksums

@property
def shape(self) -> tuple[int, ...]:
Expand All @@ -52,13 +54,19 @@ def __getitem__(self, key):
)

def _getitem(self, key):
result = robust_getitem(self.array, key, catch=ValueError)
# in some cases, pydap doesn't squeeze axes automatically like numpy
result = np.asarray(result)
axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types))
if result.ndim + len(axis) != self.array.ndim and axis:
result = np.squeeze(result, axis)
if self._batch and hasattr(self.array, "dataset"):
# True only for pydap>3.5.5
from pydap.client import data_check, get_batch_data

dataset = self.array.dataset
get_batch_data(self.array, checksums=self._checksums, key=key)
result = data_check(np.asarray(dataset[self.array.id].data), key)
else:
result = robust_getitem(self.array, key, catch=ValueError)
result = np.asarray(result.data)
axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types))
if result.ndim + len(axis) != self.array.ndim and axis:
result = np.squeeze(result, axis)
return result


Expand All @@ -81,7 +89,15 @@ class PydapDataStore(AbstractDataStore):
be useful if the netCDF4 library is not available.
"""

def __init__(self, dataset, group=None):
def __init__(
self,
dataset,
group=None,
session=None,
batch=None,
protocol=None,
checksums=True,
):
"""
Parameters
----------
Expand All @@ -91,6 +107,9 @@ def __init__(self, dataset, group=None):
"""
self.dataset = dataset
self.group = group
self._batch = batch
self._protocol = protocol
self._checksums = checksums # true by default

@classmethod
def open(
Expand All @@ -103,6 +122,8 @@ def open(
timeout=None,
verify=None,
user_charset=None,
batch=None,
checksums=True,
):
from pydap.client import open_url
from pydap.net import DEFAULT_TIMEOUT
Expand All @@ -117,6 +138,7 @@ def open(
DeprecationWarning,
)
output_grid = False # new default behavior

kwargs = {
"url": url,
"application": application,
Expand All @@ -132,22 +154,45 @@ def open(
elif hasattr(url, "ds"):
# pydap dataset
dataset = url.ds
args = {"dataset": dataset}
args = {"dataset": dataset, "checksums": checksums}
if group:
# only then, change the default
args["group"] = group
if url.startswith(("http", "dap2")):
args["protocol"] = "dap2"
elif url.startswith("dap4"):
args["protocol"] = "dap4"
if batch:
args["batch"] = batch
return cls(**args)

def open_store_variable(self, var):
data = indexing.LazilyIndexedArray(PydapArrayWrapper(var))
try:
if hasattr(var, "dims"):
dimensions = [
dim.split("/")[-1] if dim.startswith("/") else dim for dim in var.dims
]
except AttributeError:
else:
# GridType does not have a dims attribute - instead get `dimensions`
# see https://github.com/pydap/pydap/issues/485
dimensions = var.dimensions
if (
self._protocol == "dap4"
and var.name in dimensions
and hasattr(var, "dataset") # only True for pydap>3.5.5
):
if not var.dataset._batch_mode:
# for dap4, always batch all dimensions at once
var.dataset.enable_batch_mode()
data_array = self._get_data_array(var)
data = indexing.LazilyIndexedArray(data_array)
if not self._batch and var.dataset._batch_mode:
# if `batch=False``, restore it for all other variables
var.dataset.disable_batch_mode()
else:
# all non-dimension variables
data = indexing.LazilyIndexedArray(
PydapArrayWrapper(var, self._batch, self._checksums)
)

return Variable(dimensions, data, var.attributes)

def get_variables(self):
Expand All @@ -165,6 +210,7 @@ def get_variables(self):
# check the key is not a BaseType or GridType
if not isinstance(self.ds[var], GroupType)
]

return FrozenDict((k, self.open_store_variable(self.ds[k])) for k in _vars)

def get_attrs(self):
Expand All @@ -176,18 +222,30 @@ def get_attrs(self):
"libdap",
"invocation",
"dimensions",
"path",
"Maps",
)
attrs = self.ds.attributes
list(map(attrs.pop, opendap_attrs, [None] * 6))
attrs = dict(self.ds.attributes)
list(map(attrs.pop, opendap_attrs, [None] * 8))
return Frozen(attrs)

def get_dimensions(self):
return Frozen(self.ds.dimensions)
return Frozen(sorted(self.ds.dimensions))
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To potentially address the issues with dimensions in Datatree, and the lat/lon dimensions being inconsistently ordered, I added this sorted to the dimensions list that the backend gets from the Pydap dataset directly. Hopefully this little fix will make it go away, but I will continue checking this issue locally and after merging main into this PR (it has not failed once yet! knocks on wood)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is only dataset level dimensions, not variable level dimensions.

At the dataset level, dimension order doesn't really matter, so I doubt this is going to fix the issue, unfortunately.


@property
def ds(self):
return get_group(self.dataset, self.group)

def _get_data_array(self, var):
"""gets dimension data all at once"""
from pydap.client import get_batch_data

if not var._is_data_loaded():
# data has not been deserialized yet
# runs only once per store/hierarchy
get_batch_data(var, checksums=self._checksums)
return self.dataset[var.id].data


class PydapBackendEntrypoint(BackendEntrypoint):
"""
Expand Down Expand Up @@ -231,6 +289,8 @@ def open_dataset(
timeout=None,
verify=None,
user_charset=None,
batch=None,
checksums=True,
) -> Dataset:
store = PydapDataStore.open(
url=filename_or_obj,
Expand All @@ -241,6 +301,8 @@ def open_dataset(
timeout=timeout,
verify=verify,
user_charset=user_charset,
batch=batch,
checksums=checksums,
)
store_entrypoint = StoreBackendEntrypoint()
with close_on_error(store):
Expand Down Expand Up @@ -273,6 +335,8 @@ def open_datatree(
timeout=None,
verify=None,
user_charset=None,
batch=None,
checksums=True,
) -> DataTree:
groups_dict = self.open_groups_as_dict(
filename_or_obj,
Expand All @@ -285,10 +349,12 @@ def open_datatree(
decode_timedelta=decode_timedelta,
group=group,
application=None,
session=None,
timeout=None,
verify=None,
user_charset=None,
session=session,
timeout=timeout,
verify=application,
user_charset=user_charset,
batch=batch,
checksums=checksums,
)

return datatree_from_dict_with_io_cleanup(groups_dict)
Expand All @@ -310,6 +376,8 @@ def open_groups_as_dict(
timeout=None,
verify=None,
user_charset=None,
batch=None,
checksums=True,
) -> dict[str, Dataset]:
from xarray.core.treenode import NodePath

Expand All @@ -321,6 +389,8 @@ def open_groups_as_dict(
timeout=timeout,
verify=verify,
user_charset=user_charset,
batch=batch,
checksums=checksums,
)

# Check for a group and make it a parent if it exists
Expand Down
52 changes: 52 additions & 0 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -6473,6 +6473,58 @@ def test_session(self) -> None:
)


@requires_pydap
@network
@pytest.mark.parametrize("protocol", ["dap2", "dap4"])
@pytest.mark.parametrize("batch", [False, True])
def test_batchdap4_downloads(tmpdir, protocol, batch) -> None:
"""Test that in dap4, all dimensions are downloaded at once"""
import pydap
from pydap.net import create_session

_version_ = Version(pydap.__version__)
# Create a session with pre-set params in pydap backend, to cache urls
cache_name = tmpdir / "debug"
session = create_session(use_cache=True, cache_kwargs={"cache_name": cache_name})
session.cache.clear()
url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc"

if protocol == "dap4":
ds = open_dataset(
url.replace("https", protocol),
engine="pydap",
session=session,
decode_times=False,
batch=batch,
)
if _version_ > Version("3.5.5"):
# total downloads are:
# 1 dmr + 1 dap (dimensions)
assert len(session.cache.urls()) == 2
# now load the rest of the variables
ds.load()
if batch:
# all non-dimensions are downloaded in a single https requests
assert len(session.cache.urls()) == 2 + 1
if not batch:
# each non-dimension array is downloaded with an individual
# https requests
assert len(session.cache.urls()) == 2 + 4
else:
assert len(session.cache.urls()) == 4
ds.load()
assert len(session.cache.urls()) == 4 + 4
elif protocol == "dap2":
ds = open_dataset(
url.replace("https", protocol),
engine="pydap",
session=session,
decode_times=False,
)
# das + dds + 3 dods urls
assert len(session.cache.urls()) == 5


class TestEncodingInvalid:
def test_extract_nc4_variable_encoding(self) -> None:
var = xr.Variable(("x",), [1, 2, 3], {}, {"foo": "bar"})
Expand Down
24 changes: 22 additions & 2 deletions xarray/tests/test_backends_datatree.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import numpy as np
import pytest
from packaging.version import Version

import xarray as xr
from xarray import DataTree, load_datatree, open_datatree, open_groups
Expand Down Expand Up @@ -613,7 +614,7 @@ def test_open_groups(self, url=unaligned_datatree_url) -> None:
) as expected:
assert_identical(unaligned_dict_of_datasets["/Group1/subgroup1"], expected)

def test_inherited_coords(self, url=simplegroup_datatree_url) -> None:
def test_inherited_coords(self, tmpdir, url=simplegroup_datatree_url) -> None:
"""Test that `open_datatree` inherits coordinates from root tree.

This particular h5 file is a test file that inherits the time coordinate from the root
Expand All @@ -639,7 +640,19 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None:
│ Temperature (time, Z, Y, X) float32 ...
| Salinity (time, Z, Y, X) float32 ...
"""
tree = open_datatree(url, engine=self.engine)
import pydap
from pydap.net import create_session

# Create a session with pre-set retry params in pydap backend, to cache urls
cache_name = tmpdir / "debug"
session = create_session(
use_cache=True, cache_kwargs={"cache_name": cache_name}
)
session.cache.clear()

_version_ = Version(pydap.__version__)

tree = open_datatree(url, engine=self.engine, session=session)
assert set(tree.dims) == {"time", "Z", "nv"}
assert tree["/SimpleGroup"].coords["time"].dims == ("time",)
assert tree["/SimpleGroup"].coords["Z"].dims == ("Z",)
Expand All @@ -650,6 +663,13 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None:
list(expected.dims) + ["Z", "nv"]
)

if _version_ > Version("3.5.5"):
# Total downloads are: 1 dmr, + 1 dap url for all dimensions across groups per group
assert len(session.cache.urls()) == 3
else:
# 1 dmr + 1 dap url per dimension (total there are 4 dimension arrays)
assert len(session.cache.urls()) == 5

def test_open_groups_to_dict(self, url=all_aligned_child_nodes_url) -> None:
aligned_dict_of_datasets = open_groups(url, engine=self.engine)
aligned_dt = DataTree.from_dict(aligned_dict_of_datasets)
Expand Down
Loading