Skip to content

Commit 5533ac7

Browse files
committed
trust the metadata
1 parent 1cf5bd4 commit 5533ac7

File tree

2 files changed

+29
-91
lines changed

2 files changed

+29
-91
lines changed

src/zarr/core/group.py

Lines changed: 23 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -98,24 +98,6 @@ def _parse_async_node(
9898
raise TypeError(f"Unknown node type, got {type(node)}")
9999

100100

101-
class _MixedConsolidatedMetadataException(Exception):
102-
"""
103-
A custom, *internal* exception for when we encounter mixed consolidated metadata.
104-
105-
Typically, Consolidated Metadata will explicitly indicate that there are no
106-
additional children under a group with ``ConsolidatedMetadata(metadata={})``,
107-
as opposed to ``metadata=None``. This is the behavior of ``consolidated_metadata``.
108-
We rely on that "fact" to do I/O-free getitem: when a group's consolidated metadata
109-
doesn't contain a child we can raise a ``KeyError`` without consulting the backing
110-
store.
111-
112-
Users can potentially get themselves in situations where there's "mixed" consolidated
113-
metadata. For now, we'll raise this error, catch it internally, and silently fall back
114-
to the store (which will either succeed or raise its own KeyError, slowly). We might
115-
want to expose this in the future, in which case rename it add it to zarr.errors.
116-
"""
117-
118-
119101
@dataclass(frozen=True)
120102
class ConsolidatedMetadata:
121103
"""
@@ -626,15 +608,7 @@ async def getitem(
626608

627609
# Consolidated metadata lets us avoid some I/O operations so try that first.
628610
if self.metadata.consolidated_metadata is not None:
629-
try:
630-
return self._getitem_consolidated(store_path, key, prefix=self.name)
631-
except _MixedConsolidatedMetadataException:
632-
logger.info(
633-
"Mixed consolidated and unconsolidated metadata. key=%s, store_path=%s",
634-
key,
635-
store_path,
636-
)
637-
# now fall back to the non-consolidated variant
611+
return self._getitem_consolidated(store_path, key, prefix=self.name)
638612

639613
# Note:
640614
# in zarr-python v2, we first check if `key` references an Array, else if `key` references
@@ -691,9 +665,6 @@ def _getitem_consolidated(
691665
# getitem, in the special case where we have consolidated metadata.
692666
# Note that this is a regular def (non async) function.
693667
# This shouldn't do any additional I/O.
694-
# All callers *must* catch _MixedConsolidatedMetadataException to ensure
695-
# that we correctly handle the case where we need to fall back to doing
696-
# additional I/O.
697668

698669
# the caller needs to verify this!
699670
assert self.metadata.consolidated_metadata is not None
@@ -708,14 +679,16 @@ def _getitem_consolidated(
708679
if isinstance(metadata, ArrayV2Metadata | ArrayV3Metadata):
709680
# we've indexed into an array with group["array/subarray"]. Invalid.
710681
raise KeyError(key)
682+
if metadata.consolidated_metadata is None:
683+
# we've indexed into a group without consolidated metadata.
684+
# This isn't normal; typically, consolidated metadata
685+
# will include explicit markers for when there are no child
686+
# nodes as metadata={}.
687+
# We have some freedom in exactly how we interpret this case.
688+
# For now, we treat None as the same as {}, i.e. we don't
689+
# have any children.
690+
raise KeyError(key)
711691
try:
712-
if metadata.consolidated_metadata is None:
713-
# we've indexed into a group without consolidated metadata.
714-
# Note that the `None` case is different from `metadata={}`
715-
# where we explicitly know we have no children. In the None
716-
# case we have to fall back to non-consolidated metadata.
717-
raise _MixedConsolidatedMetadataException(key)
718-
719692
metadata = metadata.consolidated_metadata.metadata[indexer]
720693
except KeyError as e:
721694
# The Group Metadata has consolidated metadata, but the key
@@ -981,11 +954,7 @@ async def create_array(
981954

982955
@deprecated("Use AsyncGroup.create_array instead.")
983956
async def create_dataset(
984-
self,
985-
name: str,
986-
*,
987-
shape: ShapeLike,
988-
**kwargs: Any,
957+
self, name: str, **kwargs: Any
989958
) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
990959
"""Create an array.
991960
@@ -996,8 +965,6 @@ async def create_dataset(
996965
----------
997966
name : str
998967
Array name.
999-
shape : int or tuple of ints
1000-
Array shape.
1001968
kwargs : dict
1002969
Additional arguments passed to :func:`zarr.AsyncGroup.create_array`.
1003970
@@ -1008,7 +975,7 @@ async def create_dataset(
1008975
.. deprecated:: 3.0.0
1009976
The h5py compatibility methods will be removed in 3.1.0. Use `AsyncGroup.create_array` instead.
1010977
"""
1011-
return await self.create_array(name, shape=shape, **kwargs)
978+
return await self.create_array(name, **kwargs)
1012979

1013980
@deprecated("Use AsyncGroup.require_array instead.")
1014981
async def require_dataset(
@@ -1190,17 +1157,9 @@ async def _members(
11901157
if self.metadata.consolidated_metadata is not None:
11911158
# we should be able to do members without any additional I/O
11921159
members = self._members_consolidated(max_depth, current_depth)
1193-
1194-
try:
1195-
# we already have this in memory, so fine to build this list
1196-
# and catch the exception if needed.
1197-
members_ = list(members)
1198-
except _MixedConsolidatedMetadataException:
1199-
pass
1200-
else:
1201-
for member in members_:
1202-
yield member
1203-
return
1160+
members_ = list(members)
1161+
for member in members_:
1162+
yield member
12041163

12051164
if not self.store_path.store.supports_listing:
12061165
msg = (
@@ -1258,25 +1217,13 @@ def _members_consolidated(
12581217
]:
12591218
consolidated_metadata = self.metadata.consolidated_metadata
12601219

1261-
if consolidated_metadata is None:
1262-
raise _MixedConsolidatedMetadataException(prefix)
12631220
# we kind of just want the top-level keys.
12641221
if consolidated_metadata is not None:
12651222
for key in consolidated_metadata.metadata.keys():
1266-
try:
1267-
obj = self._getitem_consolidated(
1268-
self.store_path, key, prefix=self.name
1269-
) # Metadata -> Group/Array
1270-
key = f"{prefix}/{key}".lstrip("/")
1271-
except _MixedConsolidatedMetadataException:
1272-
logger.info(
1273-
"Mixed consolidated and unconsolidated metadata. key=%s, depth=%d, prefix=%s",
1274-
key,
1275-
current_depth,
1276-
prefix,
1277-
)
1278-
# This isn't an async def function so we need to re-raise up one more level.
1279-
raise
1223+
obj = self._getitem_consolidated(
1224+
self.store_path, key, prefix=self.name
1225+
) # Metadata -> Group/Array
1226+
key = f"{prefix}/{key}".lstrip("/")
12801227
yield key, obj
12811228

12821229
if ((max_depth is None) or (current_depth < max_depth)) and isinstance(
@@ -1711,13 +1658,7 @@ def create_dataset(self, name: str, **kwargs: Any) -> Array:
17111658
return Array(self._sync(self._async_group.create_dataset(name, **kwargs)))
17121659

17131660
@deprecated("Use Group.require_array instead.")
1714-
def require_dataset(
1715-
self,
1716-
name: str,
1717-
*,
1718-
shape: ShapeLike,
1719-
**kwargs: Any,
1720-
) -> Array:
1661+
def require_dataset(self, name: str, **kwargs: Any) -> Array:
17211662
"""Obtain an array, creating if it doesn't exist.
17221663
17231664
Arrays are known as "datasets" in HDF5 terminology. For compatibility
@@ -1744,15 +1685,9 @@ def require_dataset(
17441685
.. deprecated:: 3.0.0
17451686
The h5py compatibility methods will be removed in 3.1.0. Use `Group.require_array` instead.
17461687
"""
1747-
return Array(self._sync(self._async_group.require_array(name, shape=shape, **kwargs)))
1688+
return Array(self._sync(self._async_group.require_array(name, **kwargs)))
17481689

1749-
def require_array(
1750-
self,
1751-
name: str,
1752-
*,
1753-
shape: ShapeLike,
1754-
**kwargs: Any,
1755-
) -> Array:
1690+
def require_array(self, name: str, **kwargs: Any) -> Array:
17561691
"""Obtain an array, creating if it doesn't exist.
17571692
17581693
@@ -1774,7 +1709,7 @@ def require_array(
17741709
-------
17751710
a : Array
17761711
"""
1777-
return Array(self._sync(self._async_group.require_array(name, shape=shape, **kwargs)))
1712+
return Array(self._sync(self._async_group.require_array(name, **kwargs)))
17781713

17791714
def empty(self, *, name: str, shape: ChunkCoords, **kwargs: Any) -> Array:
17801715
return Array(self._sync(self._async_group.empty(name=name, shape=shape, **kwargs)))

tests/v3/test_group.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from zarr.abc.store import Store
1616
from zarr.core.buffer import default_buffer_prototype
1717
from zarr.core.common import JSON, ZarrFormat
18-
from zarr.core.group import ConsolidatedMetadata, GroupMetadata, _MixedConsolidatedMetadataException
18+
from zarr.core.group import ConsolidatedMetadata, GroupMetadata
1919
from zarr.core.sync import sync
2020
from zarr.errors import ContainsArrayError, ContainsGroupError
2121
from zarr.storage import LocalStore, MemoryStore, StorePath, ZipStore
@@ -347,12 +347,15 @@ def test_group_getitem(store: Store, zarr_format: ZarrFormat, consolidated: bool
347347
)
348348

349349
# test the implementation directly
350-
with pytest.raises(_MixedConsolidatedMetadataException):
350+
with pytest.raises(KeyError):
351351
group._async_group._getitem_consolidated(
352352
group.store_path, "subgroup/subarray", prefix="/"
353353
)
354354

355-
assert group["subgroup/subarray"] == subsubarray
355+
with pytest.raises(KeyError):
356+
# We've chosen to trust the consolidted metadata, which doesn't
357+
# contain this array
358+
group["subgroup/subarray"]
356359

357360
with pytest.raises(KeyError, match="subarray/subsubarray"):
358361
group["subarray/subsubarray"]

0 commit comments

Comments
 (0)