|
1 | 1 | from __future__ import annotations |
2 | 2 |
|
3 | 3 | import asyncio |
4 | | -import base64 |
5 | | -import itertools |
6 | 4 | import json |
7 | 5 | import logging |
8 | 6 | import warnings |
@@ -100,315 +98,6 @@ def _parse_async_node( |
100 | 98 | raise TypeError(f"Unknown node type, got {type(node)}") |
101 | 99 |
|
102 | 100 |
|
@dataclass(frozen=True)
class ConsolidatedMetadata:
    """
    Consolidated Metadata for this Group.

    This stores the metadata of child nodes below this group. Any child groups
    will have their consolidated metadata set appropriately.

    Two representations are used:

    - *nested* (in memory, ``self.metadata``): only the immediate children of
      the group are keys; deeper nodes live in the ``consolidated_metadata``
      of their immediate parent group.
    - *flat* (serialized, see ``to_dict`` / ``flattened_metadata``): every
      descendant is a key, using its ``/``-joined path relative to this group.
    """

    metadata: dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata]
    kind: Literal["inline"] = "inline"
    must_understand: Literal[False] = False

    def to_dict(self) -> dict[str, JSON]:
        """Serialize to a dictionary, using the flat representation for ``metadata``."""
        return {
            "kind": self.kind,
            "must_understand": self.must_understand,
            "metadata": {k: v.to_dict() for k, v in self.flattened_metadata.items()},
        }

    @classmethod
    def from_dict(cls, data: dict[str, JSON]) -> ConsolidatedMetadata:
        """
        Build consolidated metadata from its (flat) dictionary form.

        Raises
        ------
        ValueError
            If ``data["kind"]`` is anything other than ``"inline"``.
        TypeError
            If ``data["metadata"]`` is not a dict, or one of its values is not a dict.
        """
        data = dict(data)

        kind = data.get("kind")
        if kind != "inline":
            raise ValueError(f"Consolidated metadata kind='{kind}' is not supported.")

        raw_metadata = data.get("metadata")
        if not isinstance(raw_metadata, dict):
            raise TypeError(f"Unexpected type for 'metadata': {type(raw_metadata)}")

        metadata: dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata] = {}
        for k, v in raw_metadata.items():
            if not isinstance(v, dict):
                raise TypeError(
                    f"Invalid value for metadata items. key='{k}', type='{type(v).__name__}'"
                )

            # zarr_format is present in v2 and v3.
            zarr_format = parse_zarr_format(v["zarr_format"])

            if zarr_format == 3:
                node_type = parse_node_type(v.get("node_type", None))
                if node_type == "group":
                    metadata[k] = GroupMetadata.from_dict(v)
                elif node_type == "array":
                    metadata[k] = ArrayV3Metadata.from_dict(v)
                else:
                    assert_never(node_type)
            elif zarr_format == 2:
                # v2 documents carry no node_type; the presence of "shape" is
                # what distinguishes an array from a group here.
                if "shape" in v:
                    metadata[k] = ArrayV2Metadata.from_dict(v)
                else:
                    metadata[k] = GroupMetadata.from_dict(v)
            else:
                assert_never(zarr_format)

        # Reorganize the flat mapping in place into the nested representation.
        cls._flat_to_nested(metadata)

        return cls(metadata=metadata)

    @staticmethod
    def _group_keys_by_parent(keys: list[str]) -> dict[str, list[str]]:
        """
        Group flat keys by parent path, shallowest parents first.

        The parent of a key is everything before its last ``/``; a key without
        a ``/`` is its own parent. Keys are visited in order of increasing depth,
        so the returned dict lists every parent before any of its descendants.

        Keys are accumulated per parent rather than passed to
        ``itertools.groupby``: ``groupby`` only merges *adjacent* items, and
        ordering by depth does not make siblings adjacent (e.g. ``a/x``, ``b/y``,
        ``a/z``), which would silently drop all but the last run per parent.
        """
        grouped: dict[str, list[str]] = {}
        for key in sorted(keys, key=lambda k: k.count("/")):
            grouped.setdefault(key.rsplit("/", 1)[0], []).append(key)
        return grouped

    @staticmethod
    def _flat_to_nested(
        metadata: dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata],
    ) -> None:
        """
        Convert a flat metadata representation to a nested one.

        ``metadata`` is modified in place; nothing is returned.

        Notes
        -----
        Flat metadata is used when persisting the consolidated metadata. The keys
        include the full path, not just the node name. The key prefixes can be
        used to determine which nodes are children of which other nodes.

        Nested metadata is used in-memory. The outermost level will only have the
        *immediate* children of the Group. All nested child groups will be stored
        under the consolidated metadata of their immediate parent.
        """
        # We have a flat mapping from {k: v} where the keys include the *full*
        # path segment:
        #  {
        #    "a/b": { group_metadata },
        #    "a/b/array-0": { array_metadata },
        #    "a/b/array-1": { array_metadata },
        #  }
        #
        # We want to reorganize the metadata such that each Group contains the
        # array metadata of its immediate children.
        # In the example, the group at `a/b` will have consolidated metadata
        # for its children `array-0` and `array-1`.
        grouped = ConsolidatedMetadata._group_keys_by_parent(list(metadata))

        # We go top down and directly manipulate metadata.
        for key, children_keys in grouped.items():
            # key is a key like "a", "a/b", "a/b/c".
            # The basic idea is to find the immediate parent (so "", "a", or "a/b")
            # and update that node's consolidated metadata to include the metadata
            # in children_keys.
            *ancestors, name = key.split("/")
            parent = metadata

            for part in ancestors:
                # e.g. a/b/c has a parent "a/b". Walk through to get
                # metadata["a"]["b"].
                # We can assume that parent[part] here is a group, otherwise we
                # wouldn't have a node with this `part` prefix. We can also assume
                # that the parent node will have consolidated metadata, because
                # we're walking top to bottom.
                parent = parent[part].consolidated_metadata.metadata  # type: ignore[union-attr]

            node = parent[name]

            if isinstance(node, ArrayV2Metadata | ArrayV3Metadata):
                # These are already present, either thanks to being an array in the
                # root, or by being collected as a child of a group below.
                continue

            # We pop from metadata, since we're *moving* this under group.
            # A top-level key is grouped under itself, hence the `!= key` filter.
            children = {
                child_key.split("/")[-1]: metadata.pop(child_key)
                for child_key in children_keys
                if child_key != key
            }
            parent[name] = replace(
                node, consolidated_metadata=ConsolidatedMetadata(metadata=children)
            )

    @property
    def flattened_metadata(self) -> dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata]:
        """
        Return the flattened representation of Consolidated Metadata.

        The returned dictionary will have a key for each child node in the hierarchy
        under this group. Under the default (nested) representation available through
        ``self.metadata``, the dictionary only contains keys for immediate children.

        The keys of the dictionary will include the full path to a child node from
        the current group, where segments are joined by ``/``.

        Groups in the result never carry their children: a group that had
        consolidated metadata gets an empty ``ConsolidatedMetadata``, and a group
        without any gets ``None``.

        Examples
        --------
        >>> cm = ConsolidatedMetadata(
        ...     metadata={
        ...         "group-0": GroupMetadata(
        ...             consolidated_metadata=ConsolidatedMetadata(
        ...                 {
        ...                     "group-0-0": GroupMetadata(),
        ...                 }
        ...             )
        ...         ),
        ...         "group-1": GroupMetadata(),
        ...     }
        ... )
        >>> cm.flattened_metadata
        {'group-0': GroupMetadata(attributes={}, zarr_format=3, consolidated_metadata=ConsolidatedMetadata(metadata={}, kind='inline', must_understand=False), node_type='group'),
         'group-0/group-0-0': GroupMetadata(attributes={}, zarr_format=3, consolidated_metadata=None, node_type='group'),
         'group-1': GroupMetadata(attributes={}, zarr_format=3, consolidated_metadata=None, node_type='group')}
        """
        flat: dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata] = {}

        def flatten(
            key: str, group: GroupMetadata | ArrayV2Metadata | ArrayV3Metadata
        ) -> dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata]:
            # Returns {full_path: node} for `group` itself and all its descendants.
            children: dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata] = {}
            if isinstance(group, ArrayV2Metadata | ArrayV3Metadata):
                children[key] = group
            else:
                if group.consolidated_metadata and group.consolidated_metadata.metadata is not None:
                    # Record the group itself with its children stripped out;
                    # the children are emitted under their own full keys below.
                    children[key] = replace(
                        group, consolidated_metadata=ConsolidatedMetadata(metadata={})
                    )
                    for name, val in group.consolidated_metadata.metadata.items():
                        full_key = f"{key}/{name}"
                        if isinstance(val, GroupMetadata):
                            children.update(flatten(full_key, val))
                        else:
                            children[full_key] = val
                else:
                    children[key] = replace(group, consolidated_metadata=None)
            return children

        for k, v in self.metadata.items():
            flat.update(flatten(k, v))

        return flat
297 | | - |
298 | | - |
@dataclass(frozen=True)
class GroupMetadata(Metadata):
    """
    Metadata for a Group.

    Holds the user attributes, the zarr format version, and optionally the
    consolidated metadata of the nodes below the group. ``node_type`` is fixed
    to ``"group"`` and cannot be passed to the constructor.
    """

    attributes: dict[str, Any] = field(default_factory=dict)
    zarr_format: ZarrFormat = 3
    consolidated_metadata: ConsolidatedMetadata | None = None
    node_type: Literal["group"] = field(default="group", init=False)

    def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
        """
        Serialize this metadata to the buffers that should be written to storage.

        For ``zarr_format == 3`` a single document is returned under the
        ``ZARR_JSON`` key. Otherwise the group and attributes documents are
        returned under ``ZGROUP_JSON`` and ``ZATTRS_JSON``, plus a consolidated
        document under ``ZMETADATA_V2_JSON`` when consolidated metadata is set.

        Parameters
        ----------
        prototype
            Its ``buffer.from_bytes`` is used to wrap each encoded JSON document.

        Returns
        -------
        dict
            Mapping from storage key to encoded buffer.
        """
        json_indent = config.get("json_indent")
        if self.zarr_format == 3:
            # NOTE(review): the configured json_indent is not applied to this
            # document (nor to the consolidated v2 document below) - confirm
            # whether that is intentional.
            return {
                ZARR_JSON: prototype.buffer.from_bytes(
                    json.dumps(_replace_special_floats(self.to_dict()), cls=V3JsonEncoder).encode()
                )
            }
        else:
            items = {
                ZGROUP_JSON: prototype.buffer.from_bytes(
                    json.dumps({"zarr_format": self.zarr_format}, indent=json_indent).encode()
                ),
                ZATTRS_JSON: prototype.buffer.from_bytes(
                    json.dumps(self.attributes, indent=json_indent).encode()
                ),
            }
            if self.consolidated_metadata:
                # Build the consolidated document: this group's own entries first,
                # then one attributes entry plus one array-or-group entry per
                # descendant, keyed by "<path>/<document name>".
                d = {
                    ZGROUP_JSON: {"zarr_format": self.zarr_format},
                    ZATTRS_JSON: self.attributes,
                }
                consolidated_metadata = self.consolidated_metadata.to_dict()["metadata"]
                assert isinstance(consolidated_metadata, dict)
                for k, v in consolidated_metadata.items():
                    # Attributes are stored in their own document, so they are
                    # split off from the node's metadata dict.
                    attrs = v.pop("attributes", None)
                    d[f"{k}/{ZATTRS_JSON}"] = _replace_special_floats(attrs)
                    if "shape" in v:
                        # it's an array
                        if isinstance(v.get("fill_value", None), np.void):
                            # np.void fill values are written as base64 text.
                            v["fill_value"] = base64.standard_b64encode(
                                cast(bytes, v["fill_value"])
                            ).decode("ascii")
                        else:
                            v = _replace_special_floats(v)
                        d[f"{k}/{ZARRAY_JSON}"] = v
                    else:
                        d[f"{k}/{ZGROUP_JSON}"] = {
                            "zarr_format": self.zarr_format,
                            "consolidated_metadata": {
                                "metadata": {},
                                "must_understand": False,
                                "kind": "inline",
                            },
                        }

                items[ZMETADATA_V2_JSON] = prototype.buffer.from_bytes(
                    json.dumps(
                        {"metadata": d, "zarr_consolidated_format": 1},
                        cls=V3JsonEncoder,
                    ).encode()
                )

            return items

    def __init__(
        self,
        attributes: dict[str, Any] | None = None,
        zarr_format: ZarrFormat = 3,
        consolidated_metadata: ConsolidatedMetadata | None = None,
    ) -> None:
        """
        Create group metadata, parsing ``attributes`` and ``zarr_format``.

        ``consolidated_metadata`` is stored as given.
        """
        attributes_parsed = parse_attributes(attributes)
        zarr_format_parsed = parse_zarr_format(zarr_format)

        # The dataclass is frozen, so fields are set through object.__setattr__.
        object.__setattr__(self, "attributes", attributes_parsed)
        object.__setattr__(self, "zarr_format", zarr_format_parsed)
        object.__setattr__(self, "consolidated_metadata", consolidated_metadata)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> GroupMetadata:
        """
        Build group metadata from its dictionary form.

        ``data`` is not modified. A ``node_type`` entry, if present, must be
        ``"group"``; it is dropped before construction. A truthy
        ``consolidated_metadata`` entry is converted with
        ``ConsolidatedMetadata.from_dict``.

        Raises
        ------
        ValueError
            If ``data["node_type"]`` is present and is not ``"group"``.
        """
        data = dict(data)

        # Pop unconditionally: __init__ does not accept node_type. Doing the pop
        # inside an `assert` would skip it entirely when Python runs with -O,
        # and the leftover key would then break `cls(**data)` below.
        node_type = data.pop("node_type", None)
        if node_type not in ("group", None):
            raise ValueError(f"Invalid value for 'node_type'. Expected 'group'. Got '{node_type}'.")

        consolidated_metadata = data.pop("consolidated_metadata", None)
        if consolidated_metadata:
            data["consolidated_metadata"] = ConsolidatedMetadata.from_dict(consolidated_metadata)

        zarr_format = data.get("zarr_format")
        if zarr_format == 2 or zarr_format is None:
            # zarr v2 allowed arbitrary keys here.
            # We don't want the GroupMetadata constructor to fail just because someone put an
            # extra key in the metadata.
            expected = {x.name for x in fields(cls)}
            data = {k: v for k, v in data.items() if k in expected}

        return cls(**data)

    def to_dict(self) -> dict[str, Any]:
        """
        Serialize to a dictionary.

        The ``consolidated_metadata`` entry is ``None`` unless consolidated
        metadata is set, in which case it is that object's own ``to_dict()``.
        """
        # Consolidated metadata is cleared before asdict() and re-added through
        # its own to_dict(), so it is serialized in its flat form rather than
        # by asdict()'s generic recursion.
        result = asdict(replace(self, consolidated_metadata=None))
        if self.consolidated_metadata:
            result["consolidated_metadata"] = self.consolidated_metadata.to_dict()
        return result
401 | | - |
402 | | - |
@dataclass(frozen=True)
class ImplicitGroupMarker(GroupMetadata):
    """
    Placeholder metadata standing in for an implicit group.

    Used only during group creation, to represent a group that should be
    created only when it does not already exist in storage. The class adds
    no fields or behavior of its own; it differs from ``GroupMetadata`` by
    type alone.
    """
410 | | - |
411 | | - |
412 | 101 | @dataclass(frozen=True) |
413 | 102 | class AsyncGroup: |
414 | 103 | """ |
|
0 commit comments