diff --git a/changes/3019.misc.rst b/changes/3019.misc.rst new file mode 100644 index 0000000000..aa1de969df --- /dev/null +++ b/changes/3019.misc.rst @@ -0,0 +1 @@ +Move ``GroupMetadata`` and related classes to a group-specific module inside the ``metadata`` module. \ No newline at end of file diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 59261cca8a..ab1fcaf9aa 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -33,11 +33,15 @@ ) from zarr.core.group import ( AsyncGroup, + create_hierarchy, +) +from zarr.core.metadata import ( + ArrayMetadataDict, + ArrayV2Metadata, + ArrayV3Metadata, ConsolidatedMetadata, GroupMetadata, - create_hierarchy, ) -from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata from zarr.core.metadata.v2 import _default_compressor, _default_filters from zarr.errors import NodeTypeValidationError from zarr.storage._common import make_store_path diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index cf4c36cc22..ebc6e7a1f0 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -99,6 +99,7 @@ ArrayV2MetadataDict, ArrayV3Metadata, ArrayV3MetadataDict, + GroupMetadata, T_ArrayMetadata, ) from zarr.core.metadata.v2 import ( @@ -3761,7 +3762,7 @@ async def chunks_initialized( def _build_parents( node: AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup, ) -> list[AsyncGroup]: - from zarr.core.group import AsyncGroup, GroupMetadata + from zarr.core.group import AsyncGroup store = node.store_path.store path = node.store_path.path diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 5c470e29ca..3063ce7a99 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1,16 +1,14 @@ from __future__ import annotations import asyncio -import base64 -import itertools import json import logging import warnings from collections import defaultdict from collections.abc import Iterator, Mapping -from dataclasses 
import asdict, dataclass, field, fields, replace +from dataclasses import dataclass, replace from itertools import accumulate -from typing import TYPE_CHECKING, Literal, TypeVar, assert_never, cast, overload +from typing import TYPE_CHECKING, Literal, TypeVar, cast, overload import numpy as np import numpy.typing as npt @@ -18,7 +16,6 @@ import zarr.api.asynchronous as async_api from zarr._compat import _deprecate_positional_args -from zarr.abc.metadata import Metadata from zarr.abc.store import Store, set_or_delete from zarr.core._info import GroupInfo from zarr.core.array import ( @@ -44,14 +41,17 @@ ZMETADATA_V2_JSON, ChunkCoords, DimensionNames, - NodeType, ShapeLike, ZarrFormat, parse_shapelike, ) from zarr.core.config import config -from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata -from zarr.core.metadata.v3 import V3JsonEncoder, _replace_special_floats +from zarr.core.metadata import ( + ArrayV2Metadata, + ArrayV3Metadata, + GroupMetadata, + ImplicitGroupMarker, +) from zarr.core.sync import SyncMixin, sync from zarr.errors import ContainsArrayError, ContainsGroupError, MetadataValidationError from zarr.storage import StoreLike, StorePath @@ -69,7 +69,7 @@ from typing import Any from zarr.core.array_spec import ArrayConfig, ArrayConfigLike - from zarr.core.buffer import Buffer, BufferPrototype + from zarr.core.buffer import Buffer from zarr.core.chunk_key_encodings import ChunkKeyEncodingLike from zarr.core.common import MemoryOrder @@ -78,32 +78,6 @@ DefaultT = TypeVar("DefaultT") -def parse_zarr_format(data: Any) -> ZarrFormat: - """Parse the zarr_format field from metadata.""" - if data in (2, 3): - return cast(ZarrFormat, data) - msg = f"Invalid zarr_format. Expected one of 2 or 3. Got {data}." 
- raise ValueError(msg) - - -def parse_node_type(data: Any) -> NodeType: - """Parse the node_type field from metadata.""" - if data in ("array", "group"): - return cast(Literal["array", "group"], data) - raise MetadataValidationError("node_type", "array or group", data) - - -# todo: convert None to empty dict -def parse_attributes(data: Any) -> dict[str, Any]: - """Parse the attributes field from metadata.""" - if data is None: - return {} - elif isinstance(data, dict) and all(isinstance(k, str) for k in data): - return data - msg = f"Expected dict with string keys. Got {type(data)} instead." - raise TypeError(msg) - - @overload def _parse_async_node(node: AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]) -> Array: ... @@ -124,315 +98,6 @@ def _parse_async_node( raise TypeError(f"Unknown node type, got {type(node)}") -@dataclass(frozen=True) -class ConsolidatedMetadata: - """ - Consolidated Metadata for this Group. - - This stores the metadata of child nodes below this group. Any child groups - will have their consolidated metadata set appropriately. 
- """ - - metadata: dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata] - kind: Literal["inline"] = "inline" - must_understand: Literal[False] = False - - def to_dict(self) -> dict[str, JSON]: - return { - "kind": self.kind, - "must_understand": self.must_understand, - "metadata": {k: v.to_dict() for k, v in self.flattened_metadata.items()}, - } - - @classmethod - def from_dict(cls, data: dict[str, JSON]) -> ConsolidatedMetadata: - data = dict(data) - - kind = data.get("kind") - if kind != "inline": - raise ValueError(f"Consolidated metadata kind='{kind}' is not supported.") - - raw_metadata = data.get("metadata") - if not isinstance(raw_metadata, dict): - raise TypeError(f"Unexpected type for 'metadata': {type(raw_metadata)}") - - metadata: dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata] = {} - if raw_metadata: - for k, v in raw_metadata.items(): - if not isinstance(v, dict): - raise TypeError( - f"Invalid value for metadata items. key='{k}', type='{type(v).__name__}'" - ) - - # zarr_format is present in v2 and v3. - zarr_format = parse_zarr_format(v["zarr_format"]) - - if zarr_format == 3: - node_type = parse_node_type(v.get("node_type", None)) - if node_type == "group": - metadata[k] = GroupMetadata.from_dict(v) - elif node_type == "array": - metadata[k] = ArrayV3Metadata.from_dict(v) - else: - assert_never(node_type) - elif zarr_format == 2: - if "shape" in v: - metadata[k] = ArrayV2Metadata.from_dict(v) - else: - metadata[k] = GroupMetadata.from_dict(v) - else: - assert_never(zarr_format) - - cls._flat_to_nested(metadata) - - return cls(metadata=metadata) - - @staticmethod - def _flat_to_nested( - metadata: dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata], - ) -> None: - """ - Convert a flat metadata representation to a nested one. - - Notes - ----- - Flat metadata is used when persisting the consolidated metadata. The keys - include the full path, not just the node name. 
The key prefixes can be - used to determine which nodes are children of which other nodes. - - Nested metadata is used in-memory. The outermost level will only have the - *immediate* children of the Group. All nested child groups will be stored - under the consolidated metadata of their immediate parent. - """ - # We have a flat mapping from {k: v} where the keys include the *full* - # path segment: - # { - # "/a/b": { group_metadata }, - # "/a/b/array-0": { array_metadata }, - # "/a/b/array-1": { array_metadata }, - # } - # - # We want to reorganize the metadata such that each Group contains the - # array metadata of its immediate children. - # In the example, the group at `/a/b` will have consolidated metadata - # for its children `array-0` and `array-1`. - # - # metadata = dict(metadata) - - keys = sorted(metadata, key=lambda k: k.count("/")) - grouped = { - k: list(v) for k, v in itertools.groupby(keys, key=lambda k: k.rsplit("/", 1)[0]) - } - - # we go top down and directly manipulate metadata. - for key, children_keys in grouped.items(): - # key is a key like "a", "a/b", "a/b/c" - # The basic idea is to find the immediate parent (so "", "a", or "a/b") - # and update that node's consolidated metadata to include the metadata - # in children_keys - *prefixes, name = key.split("/") - parent = metadata - - while prefixes: - # e.g. a/b/c has a parent "a/b". Walk through to get - # metadata["a"]["b"] - part = prefixes.pop(0) - # we can assume that parent[part] here is a group - # otherwise we wouldn't have a node with this `part` prefix. - # We can also assume that the parent node will have consolidated metadata, - # because we're walking top to bottom. 
- parent = parent[part].consolidated_metadata.metadata # type: ignore[union-attr] - - node = parent[name] - children_keys = list(children_keys) - - if isinstance(node, ArrayV2Metadata | ArrayV3Metadata): - # These are already present, either thanks to being an array in the - # root, or by being collected as a child in the else clause - continue - children_keys = list(children_keys) - # We pop from metadata, since we're *moving* this under group - children = { - child_key.split("/")[-1]: metadata.pop(child_key) - for child_key in children_keys - if child_key != key - } - parent[name] = replace( - node, consolidated_metadata=ConsolidatedMetadata(metadata=children) - ) - - @property - def flattened_metadata(self) -> dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata]: - """ - Return the flattened representation of Consolidated Metadata. - - The returned dictionary will have a key for each child node in the hierarchy - under this group. Under the default (nested) representation available through - ``self.metadata``, the dictionary only contains keys for immediate children. - - The keys of the dictionary will include the full path to a child node from - the current group, where segments are joined by ``/``. - - Examples - -------- - >>> cm = ConsolidatedMetadata( - ... metadata={ - ... "group-0": GroupMetadata( - ... consolidated_metadata=ConsolidatedMetadata( - ... { - ... "group-0-0": GroupMetadata(), - ... } - ... ) - ... ), - ... "group-1": GroupMetadata(), - ... } - ... 
) - {'group-0': GroupMetadata(attributes={}, zarr_format=3, consolidated_metadata=None, node_type='group'), - 'group-0/group-0-0': GroupMetadata(attributes={}, zarr_format=3, consolidated_metadata=None, node_type='group'), - 'group-1': GroupMetadata(attributes={}, zarr_format=3, consolidated_metadata=None, node_type='group')} - """ - metadata = {} - - def flatten( - key: str, group: GroupMetadata | ArrayV2Metadata | ArrayV3Metadata - ) -> dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata]: - children: dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata] = {} - if isinstance(group, ArrayV2Metadata | ArrayV3Metadata): - children[key] = group - else: - if group.consolidated_metadata and group.consolidated_metadata.metadata is not None: - children[key] = replace( - group, consolidated_metadata=ConsolidatedMetadata(metadata={}) - ) - for name, val in group.consolidated_metadata.metadata.items(): - full_key = f"{key}/{name}" - if isinstance(val, GroupMetadata): - children.update(flatten(full_key, val)) - else: - children[full_key] = val - else: - children[key] = replace(group, consolidated_metadata=None) - return children - - for k, v in self.metadata.items(): - metadata.update(flatten(k, v)) - - return metadata - - -@dataclass(frozen=True) -class GroupMetadata(Metadata): - """ - Metadata for a Group. 
- """ - - attributes: dict[str, Any] = field(default_factory=dict) - zarr_format: ZarrFormat = 3 - consolidated_metadata: ConsolidatedMetadata | None = None - node_type: Literal["group"] = field(default="group", init=False) - - def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: - json_indent = config.get("json_indent") - if self.zarr_format == 3: - return { - ZARR_JSON: prototype.buffer.from_bytes( - json.dumps(_replace_special_floats(self.to_dict()), cls=V3JsonEncoder).encode() - ) - } - else: - items = { - ZGROUP_JSON: prototype.buffer.from_bytes( - json.dumps({"zarr_format": self.zarr_format}, indent=json_indent).encode() - ), - ZATTRS_JSON: prototype.buffer.from_bytes( - json.dumps(self.attributes, indent=json_indent).encode() - ), - } - if self.consolidated_metadata: - d = { - ZGROUP_JSON: {"zarr_format": self.zarr_format}, - ZATTRS_JSON: self.attributes, - } - consolidated_metadata = self.consolidated_metadata.to_dict()["metadata"] - assert isinstance(consolidated_metadata, dict) - for k, v in consolidated_metadata.items(): - attrs = v.pop("attributes", None) - d[f"{k}/{ZATTRS_JSON}"] = _replace_special_floats(attrs) - if "shape" in v: - # it's an array - if isinstance(v.get("fill_value", None), np.void): - v["fill_value"] = base64.standard_b64encode( - cast(bytes, v["fill_value"]) - ).decode("ascii") - else: - v = _replace_special_floats(v) - d[f"{k}/{ZARRAY_JSON}"] = v - else: - d[f"{k}/{ZGROUP_JSON}"] = { - "zarr_format": self.zarr_format, - "consolidated_metadata": { - "metadata": {}, - "must_understand": False, - "kind": "inline", - }, - } - - items[ZMETADATA_V2_JSON] = prototype.buffer.from_bytes( - json.dumps( - {"metadata": d, "zarr_consolidated_format": 1}, - cls=V3JsonEncoder, - ).encode() - ) - - return items - - def __init__( - self, - attributes: dict[str, Any] | None = None, - zarr_format: ZarrFormat = 3, - consolidated_metadata: ConsolidatedMetadata | None = None, - ) -> None: - attributes_parsed = 
parse_attributes(attributes) - zarr_format_parsed = parse_zarr_format(zarr_format) - - object.__setattr__(self, "attributes", attributes_parsed) - object.__setattr__(self, "zarr_format", zarr_format_parsed) - object.__setattr__(self, "consolidated_metadata", consolidated_metadata) - - @classmethod - def from_dict(cls, data: dict[str, Any]) -> GroupMetadata: - data = dict(data) - assert data.pop("node_type", None) in ("group", None) - consolidated_metadata = data.pop("consolidated_metadata", None) - if consolidated_metadata: - data["consolidated_metadata"] = ConsolidatedMetadata.from_dict(consolidated_metadata) - - zarr_format = data.get("zarr_format") - if zarr_format == 2 or zarr_format is None: - # zarr v2 allowed arbitrary keys here. - # We don't want the GroupMetadata constructor to fail just because someone put an - # extra key in the metadata. - expected = {x.name for x in fields(cls)} - data = {k: v for k, v in data.items() if k in expected} - - return cls(**data) - - def to_dict(self) -> dict[str, Any]: - result = asdict(replace(self, consolidated_metadata=None)) - if self.consolidated_metadata: - result["consolidated_metadata"] = self.consolidated_metadata.to_dict() - return result - - -@dataclass(frozen=True) -class ImplicitGroupMarker(GroupMetadata): - """ - Marker for an implicit group. 
Instances of this class are only used in the context of group - creation as a placeholder to represent groups that should only be created if they do not - already exist in storage - """ - - @dataclass(frozen=True) class AsyncGroup: """ diff --git a/src/zarr/core/metadata/__init__.py b/src/zarr/core/metadata/__init__.py index 43b5ec98fe..a0aa0f48ed 100644 --- a/src/zarr/core/metadata/__init__.py +++ b/src/zarr/core/metadata/__init__.py @@ -1,5 +1,6 @@ from typing import TypeAlias, TypeVar +from .group import ConsolidatedMetadata, GroupMetadata, ImplicitGroupMarker from .v2 import ArrayV2Metadata, ArrayV2MetadataDict from .v3 import ArrayV3Metadata, ArrayV3MetadataDict @@ -14,4 +15,7 @@ "ArrayV2MetadataDict", "ArrayV3Metadata", "ArrayV3MetadataDict", + "ConsolidatedMetadata", + "GroupMetadata", + "ImplicitGroupMarker", ] diff --git a/src/zarr/core/metadata/group.py b/src/zarr/core/metadata/group.py new file mode 100644 index 0000000000..23a8b80bd6 --- /dev/null +++ b/src/zarr/core/metadata/group.py @@ -0,0 +1,369 @@ +from __future__ import annotations + +import base64 +import itertools +import json +from dataclasses import asdict, dataclass, field, fields, replace +from typing import TYPE_CHECKING, Literal, assert_never, cast + +import numpy as np + +from zarr.abc.metadata import Metadata +from zarr.core.common import ( + JSON, + ZARR_JSON, + ZARRAY_JSON, + ZATTRS_JSON, + ZGROUP_JSON, + ZMETADATA_V2_JSON, + NodeType, + ZarrFormat, +) +from zarr.core.config import config +from zarr.core.metadata.v2 import ArrayV2Metadata +from zarr.core.metadata.v3 import ( + ArrayV3Metadata, + V3JsonEncoder, + _replace_special_floats, +) +from zarr.errors import MetadataValidationError + +if TYPE_CHECKING: + from typing import Any + + from zarr.core.buffer import Buffer, BufferPrototype + + +def parse_zarr_format(data: Any) -> ZarrFormat: + """Parse the zarr_format field from metadata.""" + if data in (2, 3): + return cast(ZarrFormat, data) + msg = f"Invalid zarr_format. 
Expected one of 2 or 3. Got {data}." + raise ValueError(msg) + + +def parse_node_type(data: Any) -> NodeType: + """Parse the node_type field from metadata.""" + if data in ("array", "group"): + return cast(Literal["array", "group"], data) + raise MetadataValidationError("node_type", "array or group", data) + + +# todo: convert None to empty dict +def parse_attributes(data: Any) -> dict[str, Any]: + """Parse the attributes field from metadata.""" + if data is None: + return {} + elif isinstance(data, dict) and all(isinstance(k, str) for k in data): + return data + msg = f"Expected dict with string keys. Got {type(data)} instead." + raise TypeError(msg) + + +@dataclass(frozen=True) +class GroupMetadata(Metadata): + """ + Metadata for a Group. + """ + + attributes: dict[str, Any] = field(default_factory=dict) + zarr_format: ZarrFormat = 3 + consolidated_metadata: ConsolidatedMetadata | None = None + node_type: Literal["group"] = field(default="group", init=False) + + def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: + json_indent = config.get("json_indent") + if self.zarr_format == 3: + return { + ZARR_JSON: prototype.buffer.from_bytes( + json.dumps(_replace_special_floats(self.to_dict()), cls=V3JsonEncoder).encode() + ) + } + else: + items = { + ZGROUP_JSON: prototype.buffer.from_bytes( + json.dumps({"zarr_format": self.zarr_format}, indent=json_indent).encode() + ), + ZATTRS_JSON: prototype.buffer.from_bytes( + json.dumps(self.attributes, indent=json_indent).encode() + ), + } + if self.consolidated_metadata: + d = { + ZGROUP_JSON: {"zarr_format": self.zarr_format}, + ZATTRS_JSON: self.attributes, + } + consolidated_metadata = self.consolidated_metadata.to_dict()["metadata"] + assert isinstance(consolidated_metadata, dict) + for k, v in consolidated_metadata.items(): + attrs = v.pop("attributes", None) + d[f"{k}/{ZATTRS_JSON}"] = _replace_special_floats(attrs) + if "shape" in v: + # it's an array + if isinstance(v.get("fill_value", None), 
np.void): + v["fill_value"] = base64.standard_b64encode( + cast(bytes, v["fill_value"]) + ).decode("ascii") + else: + v = _replace_special_floats(v) + d[f"{k}/{ZARRAY_JSON}"] = v + else: + d[f"{k}/{ZGROUP_JSON}"] = { + "zarr_format": self.zarr_format, + "consolidated_metadata": { + "metadata": {}, + "must_understand": False, + "kind": "inline", + }, + } + + items[ZMETADATA_V2_JSON] = prototype.buffer.from_bytes( + json.dumps( + {"metadata": d, "zarr_consolidated_format": 1}, + cls=V3JsonEncoder, + ).encode() + ) + + return items + + def __init__( + self, + attributes: dict[str, Any] | None = None, + zarr_format: ZarrFormat = 3, + consolidated_metadata: ConsolidatedMetadata | None = None, + ) -> None: + attributes_parsed = parse_attributes(attributes) + zarr_format_parsed = parse_zarr_format(zarr_format) + + object.__setattr__(self, "attributes", attributes_parsed) + object.__setattr__(self, "zarr_format", zarr_format_parsed) + object.__setattr__(self, "consolidated_metadata", consolidated_metadata) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> GroupMetadata: + data = dict(data) + assert data.pop("node_type", None) in ("group", None) + consolidated_metadata = data.pop("consolidated_metadata", None) + if consolidated_metadata: + data["consolidated_metadata"] = ConsolidatedMetadata.from_dict(consolidated_metadata) + + zarr_format = data.get("zarr_format") + if zarr_format == 2 or zarr_format is None: + # zarr v2 allowed arbitrary keys here. + # We don't want the GroupMetadata constructor to fail just because someone put an + # extra key in the metadata. 
+ expected = {x.name for x in fields(cls)} + data = {k: v for k, v in data.items() if k in expected} + + return cls(**data) + + def to_dict(self) -> dict[str, Any]: + result = asdict(replace(self, consolidated_metadata=None)) + if self.consolidated_metadata: + result["consolidated_metadata"] = self.consolidated_metadata.to_dict() + return result + + +@dataclass(frozen=True) +class ConsolidatedMetadata: + """ + Consolidated Metadata for this Group. + + This stores the metadata of child nodes below this group. Any child groups + will have their consolidated metadata set appropriately. + """ + + metadata: dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata] + kind: Literal["inline"] = "inline" + must_understand: Literal[False] = False + + def to_dict(self) -> dict[str, JSON]: + return { + "kind": self.kind, + "must_understand": self.must_understand, + "metadata": {k: v.to_dict() for k, v in self.flattened_metadata.items()}, + } + + @classmethod + def from_dict(cls, data: dict[str, JSON]) -> ConsolidatedMetadata: + data = dict(data) + + kind = data.get("kind") + if kind != "inline": + raise ValueError(f"Consolidated metadata kind='{kind}' is not supported.") + + raw_metadata = data.get("metadata") + if not isinstance(raw_metadata, dict): + raise TypeError(f"Unexpected type for 'metadata': {type(raw_metadata)}") + + metadata: dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata] = {} + if raw_metadata: + for k, v in raw_metadata.items(): + if not isinstance(v, dict): + raise TypeError( + f"Invalid value for metadata items. key='{k}', type='{type(v).__name__}'" + ) + + # zarr_format is present in v2 and v3. 
+ zarr_format = parse_zarr_format(v["zarr_format"]) + + if zarr_format == 3: + node_type = parse_node_type(v.get("node_type", None)) + if node_type == "group": + metadata[k] = GroupMetadata.from_dict(v) + elif node_type == "array": + metadata[k] = ArrayV3Metadata.from_dict(v) + else: + assert_never(node_type) + elif zarr_format == 2: + if "shape" in v: + metadata[k] = ArrayV2Metadata.from_dict(v) + else: + metadata[k] = GroupMetadata.from_dict(v) + else: + assert_never(zarr_format) + + cls._flat_to_nested(metadata) + + return cls(metadata=metadata) + + @staticmethod + def _flat_to_nested( + metadata: dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata], + ) -> None: + """ + Convert a flat metadata representation to a nested one. + + Notes + ----- + Flat metadata is used when persisting the consolidated metadata. The keys + include the full path, not just the node name. The key prefixes can be + used to determine which nodes are children of which other nodes. + + Nested metadata is used in-memory. The outermost level will only have the + *immediate* children of the Group. All nested child groups will be stored + under the consolidated metadata of their immediate parent. + """ + # We have a flat mapping from {k: v} where the keys include the *full* + # path segment: + # { + # "/a/b": { group_metadata }, + # "/a/b/array-0": { array_metadata }, + # "/a/b/array-1": { array_metadata }, + # } + # + # We want to reorganize the metadata such that each Group contains the + # array metadata of its immediate children. + # In the example, the group at `/a/b` will have consolidated metadata + # for its children `array-0` and `array-1`. + # + # metadata = dict(metadata) + + keys = sorted(metadata, key=lambda k: k.count("/")) + grouped = { + k: list(v) for k, v in itertools.groupby(keys, key=lambda k: k.rsplit("/", 1)[0]) + } + + # we go top down and directly manipulate metadata. 
+ for key, children_keys in grouped.items(): + # key is a key like "a", "a/b", "a/b/c" + # The basic idea is to find the immediate parent (so "", "a", or "a/b") + # and update that node's consolidated metadata to include the metadata + # in children_keys + *prefixes, name = key.split("/") + parent = metadata + + while prefixes: + # e.g. a/b/c has a parent "a/b". Walk through to get + # metadata["a"]["b"] + part = prefixes.pop(0) + # we can assume that parent[part] here is a group + # otherwise we wouldn't have a node with this `part` prefix. + # We can also assume that the parent node will have consolidated metadata, + # because we're walking top to bottom. + parent = parent[part].consolidated_metadata.metadata # type: ignore[union-attr] + + node = parent[name] + children_keys = list(children_keys) + + if isinstance(node, ArrayV2Metadata | ArrayV3Metadata): + # These are already present, either thanks to being an array in the + # root, or by being collected as a child in the else clause + continue + children_keys = list(children_keys) + # We pop from metadata, since we're *moving* this under group + children = { + child_key.split("/")[-1]: metadata.pop(child_key) + for child_key in children_keys + if child_key != key + } + parent[name] = replace( + node, consolidated_metadata=ConsolidatedMetadata(metadata=children) + ) + + @property + def flattened_metadata(self) -> dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata]: + """ + Return the flattened representation of Consolidated Metadata. + + The returned dictionary will have a key for each child node in the hierarchy + under this group. Under the default (nested) representation available through + ``self.metadata``, the dictionary only contains keys for immediate children. + + The keys of the dictionary will include the full path to a child node from + the current group, where segments are joined by ``/``. + + Examples + -------- + >>> cm = ConsolidatedMetadata( + ... metadata={ + ... 
"group-0": GroupMetadata( + ... consolidated_metadata=ConsolidatedMetadata( + ... { + ... "group-0-0": GroupMetadata(), + ... } + ... ) + ... ), + ... "group-1": GroupMetadata(), + ... } + ... ) + {'group-0': GroupMetadata(attributes={}, zarr_format=3, consolidated_metadata=None, node_type='group'), + 'group-0/group-0-0': GroupMetadata(attributes={}, zarr_format=3, consolidated_metadata=None, node_type='group'), + 'group-1': GroupMetadata(attributes={}, zarr_format=3, consolidated_metadata=None, node_type='group')} + """ + metadata = {} + + def flatten( + key: str, group: GroupMetadata | ArrayV2Metadata | ArrayV3Metadata + ) -> dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata]: + children: dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata] = {} + if isinstance(group, ArrayV2Metadata | ArrayV3Metadata): + children[key] = group + else: + if group.consolidated_metadata and group.consolidated_metadata.metadata is not None: + children[key] = replace( + group, consolidated_metadata=ConsolidatedMetadata(metadata={}) + ) + for name, val in group.consolidated_metadata.metadata.items(): + full_key = f"{key}/{name}" + if isinstance(val, GroupMetadata): + children.update(flatten(full_key, val)) + else: + children[full_key] = val + else: + children[key] = replace(group, consolidated_metadata=None) + return children + + for k, v in self.metadata.items(): + metadata.update(flatten(k, v)) + + return metadata + + +@dataclass(frozen=True) +class ImplicitGroupMarker(GroupMetadata): + """ + Marker for an implicit group. 
Instances of this class are only used in the context of group + creation as a placeholder to represent groups that should only be created if they do not + already exist in storage + """ diff --git a/src/zarr/core/sync_group.py b/src/zarr/core/sync_group.py index 39d8a17992..3cddb4e080 100644 --- a/src/zarr/core/sync_group.py +++ b/src/zarr/core/sync_group.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -from zarr.core.group import Group, GroupMetadata, _parse_async_node +from zarr.core.group import Group, _parse_async_node from zarr.core.group import create_hierarchy as create_hierarchy_async from zarr.core.group import create_nodes as create_nodes_async from zarr.core.group import create_rooted_hierarchy as create_rooted_hierarchy_async @@ -15,7 +15,7 @@ from zarr.abc.store import Store from zarr.core.array import Array from zarr.core.common import ZarrFormat - from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata + from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata, GroupMetadata def create_nodes( diff --git a/tests/test_group.py b/tests/test_group.py index b4dace2568..150b6dc612 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -24,9 +24,6 @@ from zarr.core.buffer import default_buffer_prototype from zarr.core.config import config as zarr_config from zarr.core.group import ( - ConsolidatedMetadata, - GroupMetadata, - ImplicitGroupMarker, _build_metadata_v3, _get_roots, _parse_hierarchy_dict, @@ -35,7 +32,12 @@ create_rooted_hierarchy, get_node, ) -from zarr.core.metadata.v3 import ArrayV3Metadata +from zarr.core.metadata import ( + ArrayV3Metadata, + ConsolidatedMetadata, + GroupMetadata, + ImplicitGroupMarker, +) from zarr.core.sync import _collect_aiterator, sync from zarr.errors import ContainsArrayError, ContainsGroupError, MetadataValidationError from zarr.storage import LocalStore, MemoryStore, StorePath, ZipStore diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 
a179982e94..f4b1d3992a 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -18,8 +18,7 @@ open_consolidated, ) from zarr.core.buffer import cpu, default_buffer_prototype -from zarr.core.group import ConsolidatedMetadata, GroupMetadata -from zarr.core.metadata import ArrayV3Metadata +from zarr.core.metadata import ArrayV3Metadata, ConsolidatedMetadata, GroupMetadata from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.storage import StorePath diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index 08b9cb2507..80ab3d7ff6 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -10,8 +10,7 @@ import zarr.storage from zarr.core.buffer import cpu from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.group import ConsolidatedMetadata, GroupMetadata -from zarr.core.metadata import ArrayV2Metadata +from zarr.core.metadata import ArrayV2Metadata, ConsolidatedMetadata, GroupMetadata from zarr.core.metadata.v2 import parse_zarr_format if TYPE_CHECKING: diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index a47cbf43bb..e4d765e3dc 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -11,7 +11,7 @@ from zarr.core.buffer import default_buffer_prototype from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.config import config -from zarr.core.group import GroupMetadata, parse_node_type +from zarr.core.metadata.group import GroupMetadata, parse_node_type from zarr.core.metadata.v3 import ( ArrayV3Metadata, DataType,