@@ -4,6 +4,7 @@
 from pathlib import Path
 from typing import (
     TYPE_CHECKING,
+    Any,
     Dict,
     Hashable,
     Iterable,
@@ -27,6 +28,8 @@
     ChunkEntry,
     ChunkManifest,
     ManifestArray,
+    ManifestGroup,
+    ManifestStore,
 )
 from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri
 from virtualizarr.manifests.utils import create_v3_array_metadata
@@ -41,6 +44,7 @@
 if TYPE_CHECKING:
     from h5py import Dataset as H5Dataset
     from h5py import Group as H5Group
+    from obstore.store import ObjectStore
 
 FillValueType = Union[
     int,
@@ -58,6 +62,111 @@
 
 
 class HDFVirtualBackend(VirtualBackend):
+    @staticmethod
+    def _construct_manifest_array(
+        path: str,
+        dataset: H5Dataset,
+        group: str,
+    ) -> ManifestArray:
+        """
+        Construct a ManifestArray from an h5py dataset.
+        Parameters
+        ----------
+        path : str
+            The path of the hdf5 file.
+        dataset : h5py.Dataset
+            An h5py dataset.
+        group : str
+            Name of the group containing this h5py.Dataset.
+        Returns
+        -------
+        ManifestArray
+        """
+        chunks = dataset.chunks if dataset.chunks else dataset.shape
+        codecs = codecs_from_dataset(dataset)
+        attrs = HDFVirtualBackend._extract_attrs(dataset)
+        dtype = dataset.dtype
+
+        codec_configs = [
+            numcodec_config_to_configurable(codec.get_config()) for codec in codecs
+        ]
+
+        fill_value = dataset.fillvalue.item()
+        dims = tuple(HDFVirtualBackend._dataset_dims(dataset, group=group))
+        metadata = create_v3_array_metadata(
+            shape=dataset.shape,
+            data_type=dtype,
+            chunk_shape=chunks,
+            fill_value=fill_value,
+            codecs=codec_configs,
+            dimension_names=dims,
+            attributes=attrs,
+        )
+
+        manifest = HDFVirtualBackend._dataset_chunk_manifest(path, dataset)
+        return ManifestArray(metadata=metadata, chunkmanifest=manifest)
+
+    @staticmethod
+    def _construct_manifest_group(
+        store: ObjectStore,
+        filepath: str,
+        *,
+        group: str | None = None,
+        drop_variables: Optional[List[str]] = None,
+    ) -> ManifestGroup:
+        """
+        Construct a virtual Group from an HDF dataset.
+        """
+        from virtualizarr.utils import ObstoreReader
+
+        if drop_variables is None:
+            drop_variables = []
+
+        reader = ObstoreReader(store=store, path=filepath)
+        f = h5py.File(reader, mode="r")
+
+        if group is not None and group != "":
+            g = f[group]
+            group_name = group
+            if not isinstance(g, h5py.Group):
+                raise ValueError("The provided group is not an HDF group")
+        else:
+            g = f["/"]
+            group_name = "/"
+
+        manifest_dict = {}
+        non_coordinate_dimension_vars = HDFVirtualBackend._find_non_coord_dimension_vars(
+            group=g
+        )
+        drop_variables = list(set(drop_variables + non_coordinate_dimension_vars))
+        attrs: dict[str, Any] = {}
+        for key in g.keys():
+            if key not in drop_variables:
+                if isinstance(g[key], h5py.Dataset):
+                    variable = HDFVirtualBackend._construct_manifest_array(
+                        path=filepath,
+                        dataset=g[key],
+                        group=group_name,
+                    )
+                    if variable is not None:
+                        manifest_dict[key] = variable
+        return ManifestGroup(arrays=manifest_dict, attributes=attrs)
+
+    @staticmethod
+    def _create_manifest_store(
+        filepath: str,
+        *,
+        prefix: str,
+        store: ObjectStore,
+        group: str | None = None,
+    ) -> ManifestStore:
+        # Create a group containing dataset level metadata and all the manifest arrays
+        manifest_group = HDFVirtualBackend._construct_manifest_group(
+            store=store, filepath=filepath, group=group
+        )
+        # Convert to a manifest store
+        return ManifestStore(stores={prefix: store}, group=manifest_group)
+
     @staticmethod
     def open_virtual_dataset(
         filepath: str,
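Taken together, `_construct_manifest_array` builds one `ManifestArray` per HDF5 dataset, `_construct_manifest_group` collects them into a `ManifestGroup`, and `_create_manifest_store` wraps that group in a `ManifestStore` keyed by the store prefix. A minimal sketch of the wiring for reviewers, assuming obstore's `LocalStore` and a made-up local file; the import path, file path, and prefix forms here are illustrative assumptions, not part of this diff:

```python
# Hedged usage sketch (not from this diff): wire an obstore store to the
# new private helpers. File location, prefix, and path forms are made up.
from obstore.store import LocalStore

from virtualizarr.readers.hdf import HDFVirtualBackend

store = LocalStore(prefix="/data")  # assumed: LocalStore rooted at /data
manifest_store = HDFVirtualBackend._create_manifest_store(
    filepath="air.nc",      # read via ObstoreReader(store=store, path=filepath)
    prefix="file:///data",  # key under which the ManifestStore can find `store`
    store=store,
    group=None,             # None (or "") walks the root group
)
```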
@@ -119,7 +228,7 @@ def open_virtual_dataset(
     def _dataset_chunk_manifest(
         path: str,
         dataset: H5Dataset,
-    ) -> Optional[ChunkManifest]:
+    ) -> ChunkManifest:
         """
         Generate ChunkManifest for HDF5 dataset.
 
@@ -138,7 +247,7 @@ def _dataset_chunk_manifest(
         dsid = dataset.id
         if dataset.chunks is None:
             if dsid.get_offset() is None:
-                return None
+                chunk_manifest = ChunkManifest(entries={}, shape=dataset.shape)
             else:
                 key_list = [0] * (len(dataset.shape) or 1)
                 key = ".".join(map(str, key_list))
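The return-type change above means an unallocated contiguous dataset now yields an empty manifest rather than `None`. A hedged sketch of what that fallback constructs, using the same `ChunkManifest` kwargs as the diff (the shape value is made up):

```python
# Empty-manifest fallback, mirroring the kwargs used in this diff;
# the shape is illustrative.
from virtualizarr.manifests import ChunkManifest

empty = ChunkManifest(entries={}, shape=(720, 1440))
# No chunk references exist, but callers like _construct_manifest_array can
# now pass the result straight to ManifestArray instead of special-casing
# a None return.
```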
@@ -149,42 +258,42 @@ def _dataset_chunk_manifest(
             chunk_key = ChunkKey(key)
             chunk_entries = {chunk_key: chunk_entry}
             chunk_manifest = ChunkManifest(entries=chunk_entries)
-            return chunk_manifest
         else:
             num_chunks = dsid.get_num_chunks()
             if num_chunks == 0:
-                raise ValueError("The dataset is chunked but contains no chunks")
-            shape = tuple(
-                math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks)
-            )
-            paths = np.empty(shape, dtype=np.dtypes.StringDType)  # type: ignore
-            offsets = np.empty(shape, dtype=np.uint64)
-            lengths = np.empty(shape, dtype=np.uint64)
-
-            def get_key(blob):
-                return tuple(
-                    [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)]
+                chunk_manifest = ChunkManifest(entries={}, shape=dataset.shape)
+            else:
+                shape = tuple(
+                    math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks)
                 )
+                paths = np.empty(shape, dtype=np.dtypes.StringDType)  # type: ignore
+                offsets = np.empty(shape, dtype=np.uint64)
+                lengths = np.empty(shape, dtype=np.uint64)
+
+                def get_key(blob):
+                    return tuple(
+                        [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)]
+                    )
 
-            def add_chunk_info(blob):
-                key = get_key(blob)
-                paths[key] = path
-                offsets[key] = blob.byte_offset
-                lengths[key] = blob.size
+                def add_chunk_info(blob):
+                    key = get_key(blob)
+                    paths[key] = path
+                    offsets[key] = blob.byte_offset
+                    lengths[key] = blob.size
 
-            has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
-            if has_chunk_iter:
-                dsid.chunk_iter(add_chunk_info)
-            else:
-                for index in range(num_chunks):
-                    add_chunk_info(dsid.get_chunk_info(index))
+                has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
+                if has_chunk_iter:
+                    dsid.chunk_iter(add_chunk_info)
+                else:
+                    for index in range(num_chunks):
+                        add_chunk_info(dsid.get_chunk_info(index))
 
-            chunk_manifest = ChunkManifest.from_arrays(
-                paths=paths,  # type: ignore
-                offsets=offsets,
-                lengths=lengths,
-            )
-            return chunk_manifest
+                chunk_manifest = ChunkManifest.from_arrays(
+                    paths=paths,  # type: ignore
+                    offsets=offsets,
+                    lengths=lengths,
+                )
+        return chunk_manifest
 
     @staticmethod
     def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:
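For reference, the chunk-grid arithmetic that the `shape` computation and `get_key` perform in the hunk above, worked through on made-up numbers outside h5py:

```python
# Worked example of the chunk-index arithmetic (values are illustrative).
import math

dataset_shape, dataset_chunks = (10, 10), (4, 4)

# Number of chunks per axis: ceil(array extent / chunk extent).
grid_shape = tuple(
    math.ceil(a / b) for a, b in zip(dataset_shape, dataset_chunks)
)
assert grid_shape == (3, 3)  # 10 / 4 rounds up to 3 chunks per axis

# h5py reports each stored chunk's element offset; integer division by the
# chunk extent recovers its index in the chunk grid.
chunk_offset = (4, 8)
key = tuple(a // b for a, b in zip(chunk_offset, dataset_chunks))
assert key == (1, 2)
```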