Commit d6b404d

sharkinsspatial, maxrjones, and pre-commit-ci[bot] authored
Use ManifestStore to_virtual_variable directly in HDFVirtualBackend. (#542)
* Use ManifestStore to_virtual_variable directly in HDFVirtualBackend.
* Update HDFVirtualBackend tests to reflect changes to internals.
* Simplify _FillValue encoding logic.
* Add default_object_store internal func.
* Remove TODO because the lack of duplicate keys is useful.
* Also return prefix.
* Remove config complexity.
* Make _find_matching_store a method.
* Test default store creation with HDF5 reader.
* Improve typing.
* Protect against duplicate config options.
* Mark minio tests.
* Fix test.
* Specify that other schemes aren't supported.
* Make mypy pass.
* Revise codecov config.
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Add http support for default_object_store.
* Re-include relative path normalization.
* Typing fixes.
* Include obstore as hdf backend dependency.
* Pin obstore minimum version. (Co-authored-by: Max Jones <[email protected]>)
* Undo test parallelization changes. (Co-authored-by: Max Jones <[email protected]>)
* Spellcheck. (Co-authored-by: Max Jones <[email protected]>)
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Spellcheck. (Co-authored-by: Max Jones <[email protected]>)
* Include comment about current big endian dtype support.
* Pass indexes arg through to virtual dataset creation.

---

Co-authored-by: Max Jones <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 259ddeb commit d6b404d

6 files changed: +141 -237 lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@ hdf = [
     "hdf5plugin",
     "imagecodecs",
     "imagecodecs-numcodecs==2024.6.1",
+    "obstore>=0.5.1",
 ]

 # kerchunk-based readers
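Assuming the published package exposes the same optional extras as this configuration, installing the HDF reader extra (for example, pip install "virtualizarr[hdf]") now also pulls in obstore>=0.5.1.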

virtualizarr/manifests/store.py

Lines changed: 3 additions & 1 deletion
@@ -150,7 +150,9 @@ def default_object_store(filepath: str) -> ObjectStore:
             virtual_hosted_style_request=False,
             region=_find_bucket_region(bucket),
         )
-
+    if parsed.scheme in ["http", "https"]:
+        base_url = f"{parsed.scheme}://{parsed.netloc}"
+        return obs.store.HTTPStore.from_url(base_url)
     raise NotImplementedError(f"{parsed.scheme} is not yet supported")
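For reference, a minimal sketch of the scheme dispatch this hunk completes (the URLs are placeholders; local paths and s3:// URLs are handled by the branches above this hunk):

from virtualizarr.manifests.store import default_object_store

# http(s) URLs now resolve to an obstore HTTPStore rooted at scheme://netloc;
# the path component is not part of the store's base URL.
store = default_object_store("https://data.example.com/archive/air.nc")

# Any other scheme still raises NotImplementedError.
try:
    default_object_store("ftp://data.example.com/air.nc")
except NotImplementedError as err:
    print(err)  # "ftp is not yet supported"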

virtualizarr/readers/hdf/hdf.py

Lines changed: 66 additions & 207 deletions
@@ -4,9 +4,6 @@
 from pathlib import Path
 from typing import (
     TYPE_CHECKING,
-    Any,
-    Dict,
-    Hashable,
     Iterable,
     List,
     Mapping,
@@ -31,13 +28,9 @@
 from virtualizarr.manifests.store import ObjectStoreRegistry, default_object_store
 from virtualizarr.manifests.utils import create_v3_array_metadata
 from virtualizarr.readers.api import VirtualBackend
-from virtualizarr.readers.hdf.filters import cfcodec_from_dataset, codecs_from_dataset
+from virtualizarr.readers.hdf.filters import codecs_from_dataset
 from virtualizarr.types import ChunkKey
-from virtualizarr.utils import _FsspecFSFromFilepath, soft_import
-from virtualizarr.xarray import (
-    construct_fully_virtual_dataset,
-    construct_virtual_dataset,
-)
+from virtualizarr.utils import soft_import

 h5py = soft_import("h5py", "For reading hdf files", strict=False)

@@ -88,6 +81,22 @@ def _construct_manifest_array(
         attrs = HDFVirtualBackend._extract_attrs(dataset)
         dtype = dataset.dtype

+        # Temporarily disable use of CF->Codecs - TODO re-enable in subsequent PR.
+        # cfcodec = cfcodec_from_dataset(dataset)
+        # if cfcodec:
+        #     codecs.insert(0, cfcodec["codec"])
+        #     dtype = cfcodec["target_dtype"]
+        #     attrs.pop("scale_factor", None)
+        #     attrs.pop("add_offset", None)
+        # else:
+        #     dtype = dataset.dtype
+
+        if "_FillValue" in attrs:
+            encoded_cf_fill_value = HDFVirtualBackend._encode_cf_fill_value(
+                attrs["_FillValue"], dtype
+            )
+            attrs["_FillValue"] = encoded_cf_fill_value
+
         codec_configs = [
             numcodec_config_to_configurable(codec.get_config()) for codec in codecs
         ]
@@ -113,7 +122,7 @@ def _construct_manifest_group(
         filepath: str,
         *,
         group: str | None = None,
-        drop_variables: Optional[List[str]] = None,
+        drop_variables: Optional[Iterable[str]] = None,
     ) -> ManifestGroup:
         """
         Construct a virtual Group from a HDF dataset.
@@ -136,11 +145,16 @@ def _construct_manifest_group(
             group_name = "/"

         manifest_dict = {}
-        non_coordinate_dimesion_vars = HDFVirtualBackend._find_non_coord_dimension_vars(
-            group=g
+        # Several of our test fixtures which use xr.tutorial data have
+        # non-coord dimensions serialized using big endian dtypes which are not
+        # yet supported in zarr-python v3. We'll drop these variables for the
+        # moment until big endian support is included upstream.
+
+        non_coordinate_dimension_vars = (
+            HDFVirtualBackend._find_non_coord_dimension_vars(group=g)
         )
-        drop_variables = list(set(drop_variables + non_coordinate_dimesion_vars))
-        attrs: dict[str, Any] = {}
+        drop_variables = list(set(list(drop_variables) + non_coordinate_dimension_vars))
+        attrs = HDFVirtualBackend._extract_attrs(g)
         for key in g.keys():
             if key not in drop_variables:
                 if isinstance(g[key], h5py.Dataset):
@@ -159,12 +173,16 @@ def _create_manifest_store(
         *,
         store: ObjectStore | None = None,
         group: str | None = None,
+        drop_variables: Iterable[str] | None = None,
     ) -> ManifestStore:
         # Create a group containing dataset level metadata and all the manifest arrays
         if not store:
             store = default_object_store(filepath)  # type: ignore
         manifest_group = HDFVirtualBackend._construct_manifest_group(
-            store=store, filepath=filepath, group=group
+            store=store,
+            filepath=filepath,
+            group=group,
+            drop_variables=drop_variables,
         )
         registry = ObjectStoreRegistry({filepath: store})
         # Convert to a manifest store
@@ -192,40 +210,21 @@ def open_virtual_dataset(
             filepath, fs_root=Path.cwd().as_uri()
         )

-        _drop_vars: list[Hashable] = (
+        _drop_vars: Iterable[str] = (
             [] if drop_variables is None else list(drop_variables)
         )

-        # TODO provide a way to drop a variable _before_ h5py attempts to inspect it?
-        virtual_vars = HDFVirtualBackend._virtual_vars_from_hdf(
-            path=filepath,
-            group=group,
-            reader_options=reader_options,
-        )
-
-        attrs = HDFVirtualBackend._get_group_attrs(
-            path=filepath, reader_options=reader_options, group=group
-        )
-        coordinates_attr = attrs.pop("coordinates", "")
-        coord_names = coordinates_attr.split()
-
-        fully_virtual_dataset = construct_fully_virtual_dataset(
-            virtual_vars=virtual_vars,
-            coord_names=coord_names,
-            attrs=attrs,
-        )
-
-        vds = construct_virtual_dataset(
-            fully_virtual_ds=fully_virtual_dataset,
+        manifest_store = HDFVirtualBackend._create_manifest_store(
             filepath=filepath,
+            drop_variables=_drop_vars,
             group=group,
+        )
+        ds = manifest_store.to_virtual_dataset(
             loadable_variables=loadable_variables,
-            reader_options=reader_options,
-            indexes=indexes,
             decode_times=decode_times,
+            indexes=indexes,
         )
-
-        return vds.drop_vars(_drop_vars)
+        return ds

     @staticmethod
     def _dataset_chunk_manifest(
@@ -346,29 +345,6 @@ def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:

         return [dim.removeprefix(group) for dim in dims]

-    @staticmethod
-    def _extract_cf_fill_value(
-        h5obj: Union[H5Dataset, H5Group],
-    ) -> Optional[FillValueType]:
-        """
-        Convert the _FillValue attribute from an HDF5 group or dataset into
-        encoding.
-
-        Parameters
-        ----------
-        h5obj : h5py.Group or h5py.Dataset
-            An h5py group or dataset.
-        """
-        fillvalue = None
-        for n, v in h5obj.attrs.items():
-            if n == "_FillValue":
-                if isinstance(v, np.ndarray) and v.size == 1:
-                    fillvalue = v.item()
-                else:
-                    fillvalue = v
-        fillvalue = FillValueCoder.encode(fillvalue, h5obj.dtype)  # type: ignore[arg-type]
-        return fillvalue
-
     @staticmethod
     def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
         """
@@ -394,7 +370,7 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
             if n in _HIDDEN_ATTRS:
                 continue
             if n == "_FillValue":
-                continue
+                v = v
             # Fix some attribute values to avoid JSON encoding exceptions...
             if isinstance(v, bytes):
                 v = v.decode("utf-8") or " "
@@ -414,148 +390,6 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
             attrs[n] = v
         return attrs

-    @staticmethod
-    def _dataset_to_variable(
-        path: str,
-        dataset: H5Dataset,
-        group: str,
-    ) -> Optional[xr.Variable]:
-        """
-        Extract an xarray Variable with ManifestArray data from an h5py dataset
-
-        Parameters
-        ----------
-        dataset : h5py.Dataset
-            An h5py dataset.
-        group : str
-            Name of the group containing this h5py.Dataset.
-
-        Returns
-        -------
-        list: xarray.Variable
-            A list of xarray variables.
-        """
-        chunks = dataset.chunks if dataset.chunks else dataset.shape
-        codecs = codecs_from_dataset(dataset)
-        cfcodec = cfcodec_from_dataset(dataset)
-        attrs = HDFVirtualBackend._extract_attrs(dataset)
-        cf_fill_value = HDFVirtualBackend._extract_cf_fill_value(dataset)
-        attrs.pop("_FillValue", None)
-
-        if cfcodec:
-            codecs.insert(0, cfcodec["codec"])
-            dtype = cfcodec["target_dtype"]
-            attrs.pop("scale_factor", None)
-            attrs.pop("add_offset", None)
-        else:
-            dtype = dataset.dtype
-
-        codec_configs = [
-            numcodec_config_to_configurable(codec.get_config()) for codec in codecs
-        ]
-
-        fill_value = dataset.fillvalue.item()
-        metadata = create_v3_array_metadata(
-            shape=dataset.shape,
-            data_type=dtype,
-            chunk_shape=chunks,
-            fill_value=fill_value,
-            codecs=codec_configs,
-        )
-        dims = HDFVirtualBackend._dataset_dims(dataset, group=group)
-        manifest = HDFVirtualBackend._dataset_chunk_manifest(path, dataset)
-        if manifest:
-            marray = ManifestArray(metadata=metadata, chunkmanifest=manifest)
-            variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
-        else:
-            variable = xr.Variable(data=np.empty(dataset.shape), dims=dims, attrs=attrs)
-        if cf_fill_value is not None:
-            variable.encoding["_FillValue"] = cf_fill_value
-        return variable
-
-    @staticmethod
-    def _virtual_vars_from_hdf(
-        path: str,
-        group: Optional[str] = None,
-        drop_variables: Optional[List[str]] = None,
-        reader_options: Optional[dict] = {
-            "storage_options": {"key": "", "secret": "", "anon": True}
-        },
-    ) -> Dict[str, xr.Variable]:
-        """
-        Extract xarray Variables with ManifestArray data from an HDF file or group
-
-        Parameters
-        ----------
-        path: str
-            The path of the hdf5 file.
-        group: str, optional
-            The name of the group for which to extract variables. None refers to the root group.
-        drop_variables: list of str
-            A list of variable names to skip extracting.
-        reader_options: dict
-            A dictionary of reader options passed to fsspec when opening the file.
-
-        Returns
-        -------
-        dict
-            A dictionary of Xarray Variables with the variable names as keys.
-        """
-        if drop_variables is None:
-            drop_variables = []
-
-        open_file = _FsspecFSFromFilepath(
-            filepath=path, reader_options=reader_options
-        ).open_file()
-        f = h5py.File(open_file, mode="r")
-
-        if group is not None and group != "":
-            g = f[group]
-            group_name = group
-            if not isinstance(g, h5py.Group):
-                raise ValueError("The provided group is not an HDF group")
-        else:
-            g = f["/"]
-            group_name = "/"
-
-        variables = {}
-        non_coordinate_dimesion_vars = HDFVirtualBackend._find_non_coord_dimension_vars(
-            group=g
-        )
-        drop_variables = list(set(drop_variables + non_coordinate_dimesion_vars))
-        for key in g.keys():
-            if key not in drop_variables:
-                if isinstance(g[key], h5py.Dataset):
-                    variable = HDFVirtualBackend._dataset_to_variable(
-                        path=path,
-                        dataset=g[key],
-                        group=group_name,
-                    )
-                    if variable is not None:
-                        variables[key] = variable
-        return variables
-
-    @staticmethod
-    def _get_group_attrs(
-        path: str,
-        group: Optional[str] = None,
-        reader_options: Optional[dict] = {
-            "storage_options": {"key": "", "secret": "", "anon": True}
-        },
-    ):
-        open_file = _FsspecFSFromFilepath(
-            filepath=path, reader_options=reader_options
-        ).open_file()
-        f = h5py.File(open_file, mode="r")
-        if group:
-            g = f[group]
-            if not isinstance(g, h5py.Group):
-                raise ValueError("The provided group is not an HDF group")
-        else:
-            g = f
-        attrs = HDFVirtualBackend._extract_attrs(g)
-        return attrs
-
     @staticmethod
     def _find_non_coord_dimension_vars(group: H5Group) -> List[str]:
         dimension_names = []
@@ -569,3 +403,28 @@ def _find_non_coord_dimension_vars(group: H5Group) -> List[str]:
                 non_coordinate_dimension_variables.append(name)

         return non_coordinate_dimension_variables
+
+    @staticmethod
+    def _encode_cf_fill_value(
+        fill_value: Union[np.ndarray, np.generic],
+        target_dtype: np.dtype,
+    ) -> FillValueType:
+        """
+        Convert the _FillValue attribute from an HDF5 group or dataset into
+        one properly encoded for the target dtype.
+
+        Parameters
+        ----------
+        fill_value
+            An ndarray or value.
+        target_dtype
+            The target dtype of the ManifestArray that will use the _FillValue
+        """
+        if isinstance(fill_value, (np.ndarray, np.generic)):
+            if isinstance(fill_value, np.ndarray) and fill_value.size > 1:
+                raise ValueError("Expected a scalar")
+            fillvalue = fill_value.item()
+        else:
+            fillvalue = fill_value
+        encoded_fillvalue = FillValueCoder.encode(fillvalue, target_dtype)
+        return encoded_fillvalue
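
To illustrate the new helper, a small sketch under the assumption that the private method is importable from this module (the values are made up):

import numpy as np

from virtualizarr.readers.hdf.hdf import HDFVirtualBackend

# h5py typically hands back _FillValue as a size-1 ndarray; the helper
# unwraps it to a scalar and re-encodes it for the target dtype.
encoded = HDFVirtualBackend._encode_cf_fill_value(
    np.array([-9999], dtype=np.int32), np.dtype("int32")
)
print(encoded)

# Multi-element arrays are rejected outright.
try:
    HDFVirtualBackend._encode_cf_fill_value(np.array([1, 2]), np.dtype("int32"))
except ValueError as err:
    print(err)  # Expected a scalar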

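End to end, the rewritten open_virtual_dataset now builds a ManifestStore and delegates to its to_virtual_dataset method. A hedged usage sketch (the file and variable names are hypothetical, and the direct module import assumes no package-level re-export):

from virtualizarr.readers.hdf.hdf import HDFVirtualBackend

# drop_variables is now threaded through _create_manifest_store, so dropped
# variables are excluded while the manifest group is built rather than being
# removed from a finished dataset afterwards.
vds = HDFVirtualBackend.open_virtual_dataset(
    "air_temperature.h5",             # hypothetical local HDF5 file
    drop_variables=["obsolete_var"],  # hypothetical variable name
)
print(vds)
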
virtualizarr/tests/test_manifests/test_store.py

Lines changed: 12 additions & 0 deletions
@@ -140,6 +140,18 @@ def test_default_object_store_s3(minio_bucket):
     assert isinstance(store, S3Store)


+@requires_obstore
+@requires_minio
+def test_default_object_store_http(minio_bucket):
+    from obstore.store import HTTPStore
+
+    filepath = minio_bucket["endpoint"]
+    store = default_object_store(
+        filepath,
+    )
+    assert isinstance(store, HTTPStore)
+
+
 @requires_obstore
 def test_default_object_store_local(tmpdir):
     from obstore.store import LocalStore
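
The new test assumes a MinIO endpoint supplied by the minio_bucket fixture, which is why it carries both requires_obstore and requires_minio; a hypothetical targeted invocation would be something like pytest virtualizarr/tests/test_manifests/test_store.py -k default_object_store.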
