Skip to content

Commit f4f8200

Browse files
committed
Fixed consolidated Group getitem with multi-part key
This fixes `Group.__getitem__` when indexing with a key like 'subgroup/array'. The basic idea is to rewrite the indexing operation as `group['subgroup']['array']` by splitting the key and doing each operation independently. This is fine for consolidated metadata which doesn't need to do IO. There's a complication around unconsolidated metadata, though. What if we encounter a node where `Group.getitem` returns a sub Group without consolidated metadata. Then we need to fall back to non-consolidated metadata. We've written _getitem_consolidated as a regular (non-async) function so we need to pop back up to the async caller and have *it* fall back. Closes #2358
1 parent 9bbfd88 commit f4f8200

File tree

2 files changed

+109
-25
lines changed

2 files changed

+109
-25
lines changed

src/zarr/core/group.py

Lines changed: 95 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,24 @@ def _parse_async_node(
9797
raise TypeError(f"Unknown node type, got {type(node)}")
9898

9999

100+
class _MixedConsolidatedMetadataException(Exception):
101+
"""
102+
A custom, *internal* exception for when we encounter mixed consolidated metadata.
103+
104+
Typically, Consolidated Metadata will explicitly indicate that there are no
105+
additional children under a group with ``ConsolidatedMetadata(metadata={})``,
106+
as opposed to ``metadata=None``. This is the behavior of ``consolidated_metadata``.
107+
We rely on that "fact" to do I/O-free getitem: when a group's consolidated metadata
108+
doesn't contain a child we can raise a ``KeyError`` without consulting the backing
109+
store.
110+
111+
Users can potentially get themselves in situations where there's "mixed" consolidated
112+
metadata. For now, we'll raise this error, catch it internally, and silently fall back
113+
to the store (which will either succeed or raise its own KeyError, slowly). We might
114+
want to expose this in the future, in which case rename it add it to zarr.errors.
115+
"""
116+
117+
100118
@dataclass(frozen=True)
101119
class ConsolidatedMetadata:
102120
"""
@@ -571,7 +589,10 @@ def _from_bytes_v2(
571589

572590
@classmethod
573591
def _from_bytes_v3(
574-
cls, store_path: StorePath, zarr_json_bytes: Buffer, use_consolidated: bool | None
592+
cls,
593+
store_path: StorePath,
594+
zarr_json_bytes: Buffer,
595+
use_consolidated: bool | None,
575596
) -> AsyncGroup:
576597
group_metadata = json.loads(zarr_json_bytes.to_bytes())
577598
if use_consolidated and group_metadata.get("consolidated_metadata") is None:
@@ -604,14 +625,22 @@ async def getitem(
604625

605626
# Consolidated metadata lets us avoid some I/O operations so try that first.
606627
if self.metadata.consolidated_metadata is not None:
607-
return self._getitem_consolidated(store_path, key, prefix=self.name)
628+
try:
629+
return self._getitem_consolidated(store_path, key, prefix=self.name)
630+
except _MixedConsolidatedMetadataException:
631+
logger.info(
632+
"Mixed consolidated and unconsolidated metadata. key=%s, store_path=%s",
633+
key,
634+
store_path,
635+
)
636+
# now fall back to the non-consolidated variant
608637

609638
# Note:
610639
# in zarr-python v2, we first check if `key` references an Array, else if `key` references
611640
# a group,using standalone `contains_array` and `contains_group` functions. These functions
612641
# are reusable, but for v3 they would perform redundant I/O operations.
613642
# Not clear how much of that strategy we want to keep here.
614-
elif self.metadata.zarr_format == 3:
643+
if self.metadata.zarr_format == 3:
615644
zarr_json_bytes = await (store_path / ZARR_JSON).get()
616645
if zarr_json_bytes is None:
617646
raise KeyError(key)
@@ -661,18 +690,39 @@ def _getitem_consolidated(
661690
# getitem, in the special case where we have consolidated metadata.
662691
# Note that this is a regular def (non async) function.
663692
# This shouldn't do any additional I/O.
693+
# All callers *must* catch _MixedConsolidatedMetadataException to ensure
694+
# that we correctly handle the case where we need to fall back to doing
695+
# additional I/O.
664696

665697
# the caller needs to verify this!
666698
assert self.metadata.consolidated_metadata is not None
667699

668-
try:
669-
metadata = self.metadata.consolidated_metadata.metadata[key]
670-
except KeyError as e:
671-
# The Group Metadata has consolidated metadata, but the key
672-
# isn't present. We trust this to mean that the key isn't in
673-
# the hierarchy, and *don't* fall back to checking the store.
674-
msg = f"'{key}' not found in consolidated metadata."
675-
raise KeyError(msg) from e
700+
# we support nested getitems like group/subgroup/array
701+
indexers = key.split("/")
702+
indexers.reverse()
703+
metadata: ArrayV2Metadata | ArrayV3Metadata | GroupMetadata = self.metadata
704+
705+
while indexers:
706+
indexer = indexers.pop()
707+
if isinstance(metadata, ArrayV2Metadata | ArrayV3Metadata):
708+
# we've indexed into an array with group["array/subarray"]. Invalid.
709+
raise KeyError(key)
710+
try:
711+
if metadata.consolidated_metadata is None:
712+
# we've indexed into a group without consolidated metadata.
713+
# Note that the `None` case is different from `metadata={}`
714+
# where we explicitly know we have no children. In the None
715+
# case we have to fall back to non-consolidated metadata.
716+
raise _MixedConsolidatedMetadataException(key)
717+
assert metadata.consolidated_metadata is not None
718+
719+
metadata = metadata.consolidated_metadata.metadata[indexer]
720+
except KeyError as e:
721+
# The Group Metadata has consolidated metadata, but the key
722+
# isn't present. We trust this to mean that the key isn't in
723+
# the hierarchy, and *don't* fall back to checking the store.
724+
msg = f"'{key}' not found in consolidated metadata."
725+
raise KeyError(msg) from e
676726

677727
# update store_path to ensure that AsyncArray/Group.name is correct
678728
if prefix != "/":
@@ -1087,7 +1137,8 @@ async def members(
10871137
self,
10881138
max_depth: int | None = 0,
10891139
) -> AsyncGenerator[
1090-
tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup], None
1140+
tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup],
1141+
None,
10911142
]:
10921143
"""
10931144
Returns an AsyncGenerator over the arrays and groups contained in this group.
@@ -1118,15 +1169,20 @@ async def members(
11181169
async def _members(
11191170
self, max_depth: int | None, current_depth: int
11201171
) -> AsyncGenerator[
1121-
tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup], None
1172+
tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup],
1173+
None,
11221174
]:
11231175
if self.metadata.consolidated_metadata is not None:
11241176
# we should be able to do members without any additional I/O
1125-
members = self._members_consolidated(max_depth, current_depth)
1126-
1127-
for member in members:
1128-
yield member
1129-
return
1177+
try:
1178+
members = self._members_consolidated(max_depth, current_depth)
1179+
except _MixedConsolidatedMetadataException:
1180+
# we've already logged this. We'll fall back to the non-consolidated version.
1181+
pass
1182+
else:
1183+
for member in members:
1184+
yield member
1185+
return
11301186

11311187
if not self.store_path.store.supports_listing:
11321188
msg = (
@@ -1177,17 +1233,28 @@ async def _members(
11771233
def _members_consolidated(
11781234
self, max_depth: int | None, current_depth: int, prefix: str = ""
11791235
) -> Generator[
1180-
tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup], None
1236+
tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup],
1237+
None,
11811238
]:
11821239
consolidated_metadata = self.metadata.consolidated_metadata
11831240

11841241
# we kind of just want the top-level keys.
11851242
if consolidated_metadata is not None:
11861243
for key in consolidated_metadata.metadata.keys():
1187-
obj = self._getitem_consolidated(
1188-
self.store_path, key, prefix=self.name
1189-
) # Metadata -> Group/Array
1190-
key = f"{prefix}/{key}".lstrip("/")
1244+
try:
1245+
obj = self._getitem_consolidated(
1246+
self.store_path, key, prefix=self.name
1247+
) # Metadata -> Group/Array
1248+
key = f"{prefix}/{key}".lstrip("/")
1249+
except _MixedConsolidatedMetadataException:
1250+
logger.info(
1251+
"Mixed consolidated and unconsolidated metadata. key=%s, depth=%d, prefix=%s",
1252+
key,
1253+
current_depth,
1254+
prefix,
1255+
)
1256+
# This isn't an async def function so we need to re-raise up one more level.
1257+
raise
11911258
yield key, obj
11921259

11931260
if ((max_depth is None) or (current_depth < max_depth)) and isinstance(
@@ -1262,7 +1329,11 @@ async def full(
12621329
self, *, name: str, shape: ChunkCoords, fill_value: Any | None, **kwargs: Any
12631330
) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
12641331
return await async_api.full(
1265-
shape=shape, fill_value=fill_value, store=self.store_path, path=name, **kwargs
1332+
shape=shape,
1333+
fill_value=fill_value,
1334+
store=self.store_path,
1335+
path=name,
1336+
**kwargs,
12661337
)
12671338

12681339
async def empty_like(

tests/v3/test_group.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -300,18 +300,31 @@ def test_group_getitem(store: Store, zarr_format: ZarrFormat, consolidated: bool
300300
group = Group.from_store(store, zarr_format=zarr_format)
301301
subgroup = group.create_group(name="subgroup")
302302
subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,))
303+
subsubarray = subgroup.create_array(name="subarray", shape=(10,), chunk_shape=(10,))
303304

304305
if consolidated:
305306
group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format)
307+
# we're going to assume that `group.metadata` is correct, and reuse that to focus
308+
# on indexing in this test. Other tests verify the correctness of group.metadata
306309
object.__setattr__(
307-
subgroup.metadata, "consolidated_metadata", ConsolidatedMetadata(metadata={})
310+
subgroup.metadata,
311+
"consolidated_metadata",
312+
ConsolidatedMetadata(
313+
metadata={"subarray": group.metadata.consolidated_metadata.metadata["subarray"]}
314+
),
308315
)
309316

310317
assert group["subgroup"] == subgroup
311318
assert group["subarray"] == subarray
319+
assert subgroup["subarray"] == subsubarray
320+
# assert group["subgroup/subarray"] == subsubarray
321+
312322
with pytest.raises(KeyError):
313323
group["nope"]
314324

325+
with pytest.raises(KeyError, match="subarray/subsubarray"):
326+
group["subarray/subsubarray"]
327+
315328

316329
def test_group_get_with_default(store: Store, zarr_format: ZarrFormat) -> None:
317330
group = Group.from_store(store, zarr_format=zarr_format)

0 commit comments

Comments
 (0)