diff --git a/changes/3288.misc.rst b/changes/3288.misc.rst new file mode 100644 index 0000000000..af7119487f --- /dev/null +++ b/changes/3288.misc.rst @@ -0,0 +1 @@ +Sort dictionary keys before returning consolidated metadata to ensure deterministic output. diff --git a/docs/user-guide/consolidated_metadata.rst b/docs/user-guide/consolidated_metadata.rst index 4cd72dbc74..05a3aa7fb4 100644 --- a/docs/user-guide/consolidated_metadata.rst +++ b/docs/user-guide/consolidated_metadata.rst @@ -45,7 +45,7 @@ that can be used.: >>> consolidated = zarr.open_group(store=store) >>> consolidated_metadata = consolidated.metadata.consolidated_metadata.metadata >>> from pprint import pprint - >>> pprint(dict(sorted(consolidated_metadata.items()))) + >>> pprint(dict(consolidated_metadata.items())) {'a': ArrayV3Metadata(shape=(1,), data_type=Float64(endianness='little'), chunk_grid=RegularChunkGrid(chunk_shape=(1,)), @@ -100,6 +100,14 @@ With nested groups, the consolidated metadata is available on the children, recu >>> consolidated['child'].metadata.consolidated_metadata ConsolidatedMetadata(metadata={'child': GroupMetadata(attributes={'kind': 'grandchild'}, zarr_format=3, consolidated_metadata=ConsolidatedMetadata(metadata={}, kind='inline', must_understand=False), node_type='group')}, kind='inline', must_understand=False) +.. versionadded:: 3.1.1 + + The keys in the consolidated metadata are sorted prior to writing. Keys are + sorted in ascending order by path depth, where a path is defined as a sequence + of strings joined by ``"/"``. For keys with the same path depth, ties are broken + by lexicographic order of the NFKC-normalized, case-folded keys. This behaviour + ensures deterministic metadata output for a given group.
+ Synchronization and Concurrency ------------------------------- diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index a398aa01aa..f18c723a76 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -4,6 +4,7 @@ import itertools import json import logging +import unicodedata import warnings from collections import defaultdict from dataclasses import asdict, dataclass, field, fields, replace @@ -141,7 +142,16 @@ def to_dict(self) -> dict[str, JSON]: return { "kind": self.kind, "must_understand": self.must_understand, - "metadata": {k: v.to_dict() for k, v in self.flattened_metadata.items()}, + "metadata": { + k: v.to_dict() + for k, v in sorted( + self.flattened_metadata.items(), + key=lambda item: ( + item[0].count("/"), + unicodedata.normalize("NFKC", item[0]).casefold(), + ), + ) + }, } @classmethod diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index ea2f834bb6..19eba4fb86 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -467,6 +467,35 @@ def test_to_dict_empty(self): } assert result == expected + @pytest.mark.parametrize("zarr_format", [2, 3]) + async def test_to_dict_order( + self, memory_store: zarr.storage.MemoryStore, zarr_format: ZarrFormat + ) -> None: + with zarr.config.set(default_zarr_format=zarr_format): + g = await group(store=memory_store) + + # Create groups in non-lexicographic order + dtype = "float32" + await g.create_array(name="b", shape=(1,), dtype=dtype) + child = await g.create_group("c", attributes={"key": "child"}) + await g.create_array(name="a", shape=(1,), dtype=dtype) + + await child.create_array("e", shape=(1,), dtype=dtype) + await child.create_array("d", shape=(1,), dtype=dtype) + + # Consolidate metadata and re-open store + await zarr.api.asynchronous.consolidate_metadata(memory_store) + g2 = await zarr.api.asynchronous.open_group(store=memory_store) + + assert 
list(g2.metadata.consolidated_metadata.metadata) == ["a", "b", "c"] + assert list(g2.metadata.consolidated_metadata.flattened_metadata) == [ + "a", + "b", + "c", + "c/d", + "c/e", + ] + @pytest.mark.parametrize("zarr_format", [2, 3]) async def test_open_consolidated_raises_async(self, zarr_format: ZarrFormat): store = zarr.storage.MemoryStore()