44from pathlib import Path
55from typing import (
66 TYPE_CHECKING ,
7- Any ,
8- Dict ,
9- Hashable ,
107 Iterable ,
118 List ,
129 Mapping ,
3128from virtualizarr .manifests .store import ObjectStoreRegistry , default_object_store
3229from virtualizarr .manifests .utils import create_v3_array_metadata
3330from virtualizarr .readers .api import VirtualBackend
34- from virtualizarr .readers .hdf .filters import cfcodec_from_dataset , codecs_from_dataset
31+ from virtualizarr .readers .hdf .filters import codecs_from_dataset
3532from virtualizarr .types import ChunkKey
36- from virtualizarr .utils import _FsspecFSFromFilepath , soft_import
37- from virtualizarr .xarray import (
38- construct_fully_virtual_dataset ,
39- construct_virtual_dataset ,
40- )
33+ from virtualizarr .utils import soft_import
4134
4235h5py = soft_import ("h5py" , "For reading hdf files" , strict = False )
4336
@@ -88,6 +81,22 @@ def _construct_manifest_array(
8881 attrs = HDFVirtualBackend ._extract_attrs (dataset )
8982 dtype = dataset .dtype
9083
84+ # Temporarily disable use of CF->Codecs - TODO re-enable in subsequent PR.
85+ # cfcodec = cfcodec_from_dataset(dataset)
86+ # if cfcodec:
87+ # codecs.insert(0, cfcodec["codec"])
88+ # dtype = cfcodec["target_dtype"]
89+ # attrs.pop("scale_factor", None)
90+ # attrs.pop("add_offset", None)
91+ # else:
92+ # dtype = dataset.dtype
93+
94+ if "_FillValue" in attrs :
95+ encoded_cf_fill_value = HDFVirtualBackend ._encode_cf_fill_value (
96+ attrs ["_FillValue" ], dtype
97+ )
98+ attrs ["_FillValue" ] = encoded_cf_fill_value
99+
91100 codec_configs = [
92101 numcodec_config_to_configurable (codec .get_config ()) for codec in codecs
93102 ]
@@ -113,7 +122,7 @@ def _construct_manifest_group(
113122 filepath : str ,
114123 * ,
115124 group : str | None = None ,
116- drop_variables : Optional [List [str ]] = None ,
125+ drop_variables : Optional [Iterable [str ]] = None ,
117126 ) -> ManifestGroup :
118127 """
119128 Construct a virtual Group from a HDF dataset.
@@ -136,11 +145,16 @@ def _construct_manifest_group(
136145 group_name = "/"
137146
138147 manifest_dict = {}
139- non_coordinate_dimesion_vars = HDFVirtualBackend ._find_non_coord_dimension_vars (
140- group = g
148+ # Several of our test fixtures which use xr.tutorial data have
149+ # non coord dimensions serialized using big endian dtypes which are not
150+ # yet supported in zarr-python v3. We'll drop these variables for the
151+ # moment until big endian support is included upstream.
152+
153+ non_coordinate_dimension_vars = (
154+ HDFVirtualBackend ._find_non_coord_dimension_vars (group = g )
141155 )
142- drop_variables = list (set (drop_variables + non_coordinate_dimesion_vars ))
143- attrs : dict [ str , Any ] = {}
156+ drop_variables = list (set (list ( drop_variables ) + non_coordinate_dimension_vars ))
157+ attrs = HDFVirtualBackend . _extract_attrs ( g )
144158 for key in g .keys ():
145159 if key not in drop_variables :
146160 if isinstance (g [key ], h5py .Dataset ):
@@ -159,12 +173,16 @@ def _create_manifest_store(
159173 * ,
160174 store : ObjectStore | None = None ,
161175 group : str | None = None ,
176+ drop_variables : Iterable [str ] | None = None ,
162177 ) -> ManifestStore :
163178 # Create a group containing dataset level metadata and all the manifest arrays
164179 if not store :
165180 store = default_object_store (filepath ) # type: ignore
166181 manifest_group = HDFVirtualBackend ._construct_manifest_group (
167- store = store , filepath = filepath , group = group
182+ store = store ,
183+ filepath = filepath ,
184+ group = group ,
185+ drop_variables = drop_variables ,
168186 )
169187 registry = ObjectStoreRegistry ({filepath : store })
170188 # Convert to a manifest store
@@ -192,40 +210,21 @@ def open_virtual_dataset(
192210 filepath , fs_root = Path .cwd ().as_uri ()
193211 )
194212
195- _drop_vars : list [ Hashable ] = (
213+ _drop_vars : Iterable [ str ] = (
196214 [] if drop_variables is None else list (drop_variables )
197215 )
198216
199- # TODO provide a way to drop a variable _before_ h5py attempts to inspect it?
200- virtual_vars = HDFVirtualBackend ._virtual_vars_from_hdf (
201- path = filepath ,
202- group = group ,
203- reader_options = reader_options ,
204- )
205-
206- attrs = HDFVirtualBackend ._get_group_attrs (
207- path = filepath , reader_options = reader_options , group = group
208- )
209- coordinates_attr = attrs .pop ("coordinates" , "" )
210- coord_names = coordinates_attr .split ()
211-
212- fully_virtual_dataset = construct_fully_virtual_dataset (
213- virtual_vars = virtual_vars ,
214- coord_names = coord_names ,
215- attrs = attrs ,
216- )
217-
218- vds = construct_virtual_dataset (
219- fully_virtual_ds = fully_virtual_dataset ,
217+ manifest_store = HDFVirtualBackend ._create_manifest_store (
220218 filepath = filepath ,
219+ drop_variables = _drop_vars ,
221220 group = group ,
221+ )
222+ ds = manifest_store .to_virtual_dataset (
222223 loadable_variables = loadable_variables ,
223- reader_options = reader_options ,
224- indexes = indexes ,
225224 decode_times = decode_times ,
225+ indexes = indexes ,
226226 )
227-
228- return vds .drop_vars (_drop_vars )
227+ return ds
229228
230229 @staticmethod
231230 def _dataset_chunk_manifest (
@@ -346,29 +345,6 @@ def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:
346345
347346 return [dim .removeprefix (group ) for dim in dims ]
348347
349- @staticmethod
350- def _extract_cf_fill_value (
351- h5obj : Union [H5Dataset , H5Group ],
352- ) -> Optional [FillValueType ]:
353- """
354- Convert the _FillValue attribute from an HDF5 group or dataset into
355- encoding.
356-
357- Parameters
358- ----------
359- h5obj : h5py.Group or h5py.Dataset
360- An h5py group or dataset.
361- """
362- fillvalue = None
363- for n , v in h5obj .attrs .items ():
364- if n == "_FillValue" :
365- if isinstance (v , np .ndarray ) and v .size == 1 :
366- fillvalue = v .item ()
367- else :
368- fillvalue = v
369- fillvalue = FillValueCoder .encode (fillvalue , h5obj .dtype ) # type: ignore[arg-type]
370- return fillvalue
371-
372348 @staticmethod
373349 def _extract_attrs (h5obj : Union [H5Dataset , H5Group ]):
374350 """
@@ -394,7 +370,7 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
394370 if n in _HIDDEN_ATTRS :
395371 continue
396372 if n == "_FillValue" :
397- continue
373+ v = v
398374 # Fix some attribute values to avoid JSON encoding exceptions...
399375 if isinstance (v , bytes ):
400376 v = v .decode ("utf-8" ) or " "
@@ -414,148 +390,6 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
414390 attrs [n ] = v
415391 return attrs
416392
417- @staticmethod
418- def _dataset_to_variable (
419- path : str ,
420- dataset : H5Dataset ,
421- group : str ,
422- ) -> Optional [xr .Variable ]:
423- """
424- Extract an xarray Variable with ManifestArray data from an h5py dataset
425-
426- Parameters
427- ----------
428- dataset : h5py.Dataset
429- An h5py dataset.
430- group : str
431- Name of the group containing this h5py.Dataset.
432-
433- Returns
434- -------
435- list: xarray.Variable
436- A list of xarray variables.
437- """
438- chunks = dataset .chunks if dataset .chunks else dataset .shape
439- codecs = codecs_from_dataset (dataset )
440- cfcodec = cfcodec_from_dataset (dataset )
441- attrs = HDFVirtualBackend ._extract_attrs (dataset )
442- cf_fill_value = HDFVirtualBackend ._extract_cf_fill_value (dataset )
443- attrs .pop ("_FillValue" , None )
444-
445- if cfcodec :
446- codecs .insert (0 , cfcodec ["codec" ])
447- dtype = cfcodec ["target_dtype" ]
448- attrs .pop ("scale_factor" , None )
449- attrs .pop ("add_offset" , None )
450- else :
451- dtype = dataset .dtype
452-
453- codec_configs = [
454- numcodec_config_to_configurable (codec .get_config ()) for codec in codecs
455- ]
456-
457- fill_value = dataset .fillvalue .item ()
458- metadata = create_v3_array_metadata (
459- shape = dataset .shape ,
460- data_type = dtype ,
461- chunk_shape = chunks ,
462- fill_value = fill_value ,
463- codecs = codec_configs ,
464- )
465- dims = HDFVirtualBackend ._dataset_dims (dataset , group = group )
466- manifest = HDFVirtualBackend ._dataset_chunk_manifest (path , dataset )
467- if manifest :
468- marray = ManifestArray (metadata = metadata , chunkmanifest = manifest )
469- variable = xr .Variable (data = marray , dims = dims , attrs = attrs )
470- else :
471- variable = xr .Variable (data = np .empty (dataset .shape ), dims = dims , attrs = attrs )
472- if cf_fill_value is not None :
473- variable .encoding ["_FillValue" ] = cf_fill_value
474- return variable
475-
476- @staticmethod
477- def _virtual_vars_from_hdf (
478- path : str ,
479- group : Optional [str ] = None ,
480- drop_variables : Optional [List [str ]] = None ,
481- reader_options : Optional [dict ] = {
482- "storage_options" : {"key" : "" , "secret" : "" , "anon" : True }
483- },
484- ) -> Dict [str , xr .Variable ]:
485- """
486- Extract xarray Variables with ManifestArray data from an HDF file or group
487-
488- Parameters
489- ----------
490- path: str
491- The path of the hdf5 file.
492- group: str, optional
493- The name of the group for which to extract variables. None refers to the root group.
494- drop_variables: list of str
495- A list of variable names to skip extracting.
496- reader_options: dict
497- A dictionary of reader options passed to fsspec when opening the file.
498-
499- Returns
500- -------
501- dict
502- A dictionary of Xarray Variables with the variable names as keys.
503- """
504- if drop_variables is None :
505- drop_variables = []
506-
507- open_file = _FsspecFSFromFilepath (
508- filepath = path , reader_options = reader_options
509- ).open_file ()
510- f = h5py .File (open_file , mode = "r" )
511-
512- if group is not None and group != "" :
513- g = f [group ]
514- group_name = group
515- if not isinstance (g , h5py .Group ):
516- raise ValueError ("The provided group is not an HDF group" )
517- else :
518- g = f ["/" ]
519- group_name = "/"
520-
521- variables = {}
522- non_coordinate_dimesion_vars = HDFVirtualBackend ._find_non_coord_dimension_vars (
523- group = g
524- )
525- drop_variables = list (set (drop_variables + non_coordinate_dimesion_vars ))
526- for key in g .keys ():
527- if key not in drop_variables :
528- if isinstance (g [key ], h5py .Dataset ):
529- variable = HDFVirtualBackend ._dataset_to_variable (
530- path = path ,
531- dataset = g [key ],
532- group = group_name ,
533- )
534- if variable is not None :
535- variables [key ] = variable
536- return variables
537-
538- @staticmethod
539- def _get_group_attrs (
540- path : str ,
541- group : Optional [str ] = None ,
542- reader_options : Optional [dict ] = {
543- "storage_options" : {"key" : "" , "secret" : "" , "anon" : True }
544- },
545- ):
546- open_file = _FsspecFSFromFilepath (
547- filepath = path , reader_options = reader_options
548- ).open_file ()
549- f = h5py .File (open_file , mode = "r" )
550- if group :
551- g = f [group ]
552- if not isinstance (g , h5py .Group ):
553- raise ValueError ("The provided group is not an HDF group" )
554- else :
555- g = f
556- attrs = HDFVirtualBackend ._extract_attrs (g )
557- return attrs
558-
559393 @staticmethod
560394 def _find_non_coord_dimension_vars (group : H5Group ) -> List [str ]:
561395 dimension_names = []
@@ -569,3 +403,28 @@ def _find_non_coord_dimension_vars(group: H5Group) -> List[str]:
569403 non_coordinate_dimension_variables .append (name )
570404
571405 return non_coordinate_dimension_variables
406+
407+ @staticmethod
408+ def _encode_cf_fill_value (
409+ fill_value : Union [np .ndarray , np .generic ],
410+ target_dtype : np .dtype ,
411+ ) -> FillValueType :
412+ """
413+ Convert the _FillValue attribute from an HDF5 group or dataset into
414+ one properly encoded for the target dtype.
415+
416+ Parameters
417+ ----------
418+ fill_value
419+ An ndarray or value.
420+ target_dtype
421+ The target dtype of the ManifestArray that will use the _FillValue
422+ """
423+ if isinstance (fill_value , (np .ndarray , np .generic )):
424+ if isinstance (fill_value , np .ndarray ) and fill_value .size > 1 :
425+ raise ValueError ("Expected a scalar" )
426+ fillvalue = fill_value .item ()
427+ else :
428+ fillvalue = fill_value
429+ encoded_fillvalue = FillValueCoder .encode (fillvalue , target_dtype )
430+ return encoded_fillvalue
0 commit comments