# [pydap backend] enables downloading/processing multiple arrays within single http request #10629
Changes from all commits:
```diff
@@ -35,8 +35,10 @@
 class PydapArrayWrapper(BackendArray):
-    def __init__(self, array):
+    def __init__(self, array, batch=None, checksums=True):
         self.array = array
+        self._batch = batch
+        self._checksums = checksums

     @property
     def shape(self) -> tuple[int, ...]:
```
```diff
@@ -52,13 +54,19 @@ def __getitem__(self, key):
         )

     def _getitem(self, key):
-        result = robust_getitem(self.array, key, catch=ValueError)
-        # in some cases, pydap doesn't squeeze axes automatically like numpy
-        result = np.asarray(result)
-        axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types))
-        if result.ndim + len(axis) != self.array.ndim and axis:
-            result = np.squeeze(result, axis)
+        if self._batch and hasattr(self.array, "dataset"):
+            # True only for pydap>3.5.5
+            from pydap.client import data_check, get_batch_data
+
+            dataset = self.array.dataset
+            get_batch_data(self.array, checksums=self._checksums, key=key)
+            result = data_check(np.asarray(dataset[self.array.id].data), key)
+        else:
+            result = robust_getitem(self.array, key, catch=ValueError)
+            result = np.asarray(result.data)
+            axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types))
+            if result.ndim + len(axis) != self.array.ndim and axis:
+                result = np.squeeze(result, axis)
         return result
```
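For context on what the batched branch buys the user, here is a hedged end-to-end sketch. The URL and variable names are placeholders, and it assumes the new `batch` keyword reaches the backend through `open_dataset`, as the entrypoint changes further down add it:

```python
import xarray as xr

url = "dap4://example.org/path/to/dataset"  # hypothetical endpoint

# batch=True lets the pydap backend download/process multiple arrays
# within a single HTTP request (requires pydap>3.5.5)
ds = xr.open_dataset(url, engine="pydap", batch=True)

# loading several variables can now trigger batched transfers instead
# of one round trip per variable
subset = ds[["temperature", "salinity"]].isel(time=slice(0, 10)).load()
```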
```diff
@@ -81,7 +89,15 @@ class PydapDataStore(AbstractDataStore):
     be useful if the netCDF4 library is not available.
     """

-    def __init__(self, dataset, group=None):
+    def __init__(
+        self,
+        dataset,
+        group=None,
+        session=None,
+        batch=None,
+        protocol=None,
+        checksums=True,
+    ):
         """
         Parameters
         ----------
@@ -91,6 +107,9 @@ def __init__(self, dataset, group=None):
         """
         self.dataset = dataset
         self.group = group
+        self._batch = batch
+        self._protocol = protocol
+        self._checksums = checksums  # true by default

     @classmethod
     def open(
```
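A usage sketch for the store-level API, with keywords as added in this diff (the URL is a placeholder):

```python
import xarray as xr

# hypothetical DAP4 endpoint
store = xr.backends.PydapDataStore.open(
    "dap4://example.org/path/to/dataset",
    batch=True,      # opt in to multi-array requests (pydap>3.5.5)
    checksums=True,  # the default; validates DAP4 response checksums
)
ds = xr.open_dataset(store)
```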
```diff
@@ -103,6 +122,8 @@ def open(
         timeout=None,
         verify=None,
         user_charset=None,
+        batch=None,
+        checksums=True,
     ):
         from pydap.client import open_url
         from pydap.net import DEFAULT_TIMEOUT
@@ -117,6 +138,7 @@ def open(
                 DeprecationWarning,
             )
             output_grid = False  # new default behavior
+
         kwargs = {
             "url": url,
             "application": application,
```
```diff
@@ -132,22 +154,45 @@ def open(
         elif hasattr(url, "ds"):
             # pydap dataset
             dataset = url.ds
-        args = {"dataset": dataset}
+        args = {"dataset": dataset, "checksums": checksums}
         if group:
             # only then, change the default
             args["group"] = group
+        if url.startswith(("http", "dap2")):
+            args["protocol"] = "dap2"
+        elif url.startswith("dap4"):
+            args["protocol"] = "dap4"
+        if batch:
+            args["batch"] = batch
         return cls(**args)

     def open_store_variable(self, var):
-        data = indexing.LazilyIndexedArray(PydapArrayWrapper(var))
-        try:
+        if hasattr(var, "dims"):
             dimensions = [
                 dim.split("/")[-1] if dim.startswith("/") else dim for dim in var.dims
             ]
-        except AttributeError:
+        else:
             # GridType does not have a dims attribute - instead get `dimensions`
             # see https://github.com/pydap/pydap/issues/485
             dimensions = var.dimensions
+        if (
+            self._protocol == "dap4"
+            and var.name in dimensions
+            and hasattr(var, "dataset")  # only True for pydap>3.5.5
+        ):
+            if not var.dataset._batch_mode:
+                # for dap4, always batch all dimensions at once
+                var.dataset.enable_batch_mode()
+            data_array = self._get_data_array(var)
+            data = indexing.LazilyIndexedArray(data_array)
+            if not self._batch and var.dataset._batch_mode:
+                # if `batch=False`, restore it for all other variables
+                var.dataset.disable_batch_mode()
+        else:
+            # all non-dimension variables
+            data = indexing.LazilyIndexedArray(
+                PydapArrayWrapper(var, self._batch, self._checksums)
+            )

         return Variable(dimensions, data, var.attributes)

     def get_variables(self):
```
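To make the dimension path concrete: under DAP4 the store flips the pydap dataset into batch mode before touching any dimension variable, so all coordinates can arrive in one request. A minimal sketch of the same sequence against pydap directly (assumes pydap>3.5.5; the URL and variable names are placeholders, and the call semantics are inferred from this diff rather than from pydap documentation):

```python
from pydap.client import open_url, get_batch_data

ds = open_url("dap4://example.org/dataset")  # hypothetical DAP4 endpoint

# batch mode makes pydap group variable downloads into shared requests
if not ds._batch_mode:
    ds.enable_batch_mode()

var = ds["time"]  # a dimension variable
if not var._is_data_loaded():
    # per the diff's comments, this deserializes the batched payload
    # once per store/hierarchy; later dimension reads hit the cache
    get_batch_data(var, checksums=True)

time_values = ds[var.id].data
```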
```diff
@@ -165,6 +210,7 @@ def get_variables(self):
             # check the key is not a BaseType or GridType
             if not isinstance(self.ds[var], GroupType)
         ]
+
         return FrozenDict((k, self.open_store_variable(self.ds[k])) for k in _vars)

     def get_attrs(self):
```
```diff
@@ -176,18 +222,30 @@ def get_attrs(self):
             "libdap",
             "invocation",
             "dimensions",
+            "path",
+            "Maps",
         )
-        attrs = self.ds.attributes
-        list(map(attrs.pop, opendap_attrs, [None] * 6))
+        attrs = dict(self.ds.attributes)
+        list(map(attrs.pop, opendap_attrs, [None] * 8))
         return Frozen(attrs)

     def get_dimensions(self):
-        return Frozen(self.ds.dimensions)
+        return Frozen(sorted(self.ds.dimensions))

     @property
     def ds(self):
         return get_group(self.dataset, self.group)

+    def _get_data_array(self, var):
+        """gets dimension data all at once"""
+        from pydap.client import get_batch_data
+
+        if not var._is_data_loaded():
+            # data has not been deserialized yet
+            # runs only once per store/hierarchy
+            get_batch_data(var, checksums=self._checksums)
+        return self.dataset[var.id].data
```

> **Review comment** (on the `get_dimensions` change): To potentially address the issues with dimensions in Datatree, and the […]

> **Reply:** This is only dataset-level dimensions, not variable-level dimensions. At the dataset level, dimension order doesn't really matter, so I doubt this is going to fix the issue, unfortunately.
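Aside on the `list(map(attrs.pop, opendap_attrs, [None] * 8))` idiom, since the counter (`6` → `8`) must track the tuple length by hand: it pops each server-side key from the attribute dict, supplying `None` as the default so absent keys don't raise. An equivalent, arguably clearer spelling:

```python
# equivalent to list(map(attrs.pop, opendap_attrs, [None] * 8)),
# and immune to the tuple length changing
for key in opendap_attrs:
    attrs.pop(key, None)  # drop if present, ignore otherwise
```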
```diff
 class PydapBackendEntrypoint(BackendEntrypoint):
     """
@@ -231,6 +289,8 @@ def open_dataset(
         timeout=None,
         verify=None,
         user_charset=None,
+        batch=None,
+        checksums=True,
     ) -> Dataset:
         store = PydapDataStore.open(
             url=filename_or_obj,
@@ -241,6 +301,8 @@ def open_dataset(
             timeout=timeout,
             verify=verify,
             user_charset=user_charset,
+            batch=batch,
+            checksums=checksums,
         )
         store_entrypoint = StoreBackendEntrypoint()
         with close_on_error(store):
```
```diff
@@ -273,6 +335,8 @@ def open_datatree(
         timeout=None,
         verify=None,
         user_charset=None,
+        batch=None,
+        checksums=True,
     ) -> DataTree:
         groups_dict = self.open_groups_as_dict(
             filename_or_obj,
@@ -285,10 +349,12 @@ def open_datatree(
             decode_timedelta=decode_timedelta,
             group=group,
             application=None,
-            session=None,
-            timeout=None,
-            verify=None,
-            user_charset=None,
+            session=session,
+            timeout=timeout,
+            verify=application,
+            user_charset=user_charset,
+            batch=batch,
+            checksums=checksums,
         )

         return datatree_from_dict_with_io_cleanup(groups_dict)
```
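A hedged sketch of the tree-level entry point with the new keywords (placeholder URL; assumes a hierarchical DAP4 dataset):

```python
import xarray as xr

tree = xr.open_datatree(
    "dap4://example.org/hierarchical/dataset",  # hypothetical endpoint
    engine="pydap",
    batch=True,  # batch dimension downloads across groups
)
print(tree)
```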
```diff
@@ -310,6 +376,8 @@ def open_groups_as_dict(
         timeout=None,
         verify=None,
         user_charset=None,
+        batch=None,
+        checksums=True,
     ) -> dict[str, Dataset]:
         from xarray.core.treenode import NodePath

@@ -321,6 +389,8 @@ def open_groups_as_dict(
             timeout=timeout,
             verify=verify,
             user_charset=user_charset,
+            batch=batch,
+            checksums=checksums,
         )

         # Check for a group and make it a parent if it exists
```
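And the dict-of-groups form, which the datatree path above builds on (same placeholder caveats):

```python
import xarray as xr

groups = xr.open_groups(
    "dap4://example.org/hierarchical/dataset",  # hypothetical endpoint
    engine="pydap",
    batch=True,
)
for path, group_ds in groups.items():
    print(path, list(group_ds.data_vars))
```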
> **Review comment:** Can you help me understand -- why would a user not want to enable `batch` mode if they are using a new enough version of pydap?