Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 45 additions & 43 deletions virtualizarr/parsers/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pathlib import Path
from typing import TYPE_CHECKING, Any, cast

import numpy as np
import zarr
from obspec_utils.registry import ObjectStoreRegistry
from zarr.api.asynchronous import open_group as open_group_async
Expand All @@ -19,8 +20,10 @@
ManifestGroup,
ManifestStore,
)
from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri
from virtualizarr.vendor.zarr.core.common import _concurrent_map
from virtualizarr.manifests.manifest import (
parse_manifest_index,
validate_and_normalize_path_to_uri,
)

if TYPE_CHECKING:
import zarr
Expand Down Expand Up @@ -90,33 +93,43 @@ async def _handle_scalar_array(


async def _build_chunk_mapping(
chunk_keys: list[str], zarr_array: ZarrArrayType, path: str, prefix: str
zarr_array: ZarrArrayType, path: str, prefix: str
) -> dict[str, dict[str, Any]]:
"""
Build chunk mapping from a list of chunk keys.
Build chunk mapping by listing the object store with obstore.

Uses obstore's list_async with Arrow output to get chunk paths and sizes
in a single Rust-level call, avoiding per-chunk getsize calls.

Parameters
----------
chunk_keys
List of storage keys for chunks.
zarr_array
The Zarr array.
path
Base path for constructing chunk paths.
prefix
Prefix to strip from chunk keys.
Prefix to list and strip from chunk keys.

Returns
-------
dict
Mapping of normalized chunk coordinates to storage locations.
"""

size_map: dict[str, int] = {}
stream = zarr_array.store.store.list_async(prefix=prefix, return_arrow=True)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just grabbing the underlying obstore store is an interesting idea...

async for batch in stream:
size_map.update(
zip(batch.column("path").to_pylist(), batch.column("size").to_pylist())
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this zipping of pylists creating a Python dict? We want to avoid that.

)

# filter out metadata files
chunk_keys = [k for k in size_map if not k.split("/")[-1].startswith(".")]

if not chunk_keys:
return {}

lengths = await _concurrent_map(
[(k,) for k in chunk_keys], zarr_array.store.getsize
)
lengths = [size_map[k] for k in chunk_keys]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we really want to work hard to avoid creating any python lists / dicts at all

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

instead we want obstore -> arrow -> numpy

via https://arrow.apache.org/docs/python/numpy.html#arrow-to-numpy

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the hardest part of this is dealing with logic for missing keys - arrow might return these as nulls, but the to_numpy conversion doesn't support nulls?

Any operations we do should either be as pyarrow arrays or as numpy arrays, never as python collections

dict_keys = _normalize_chunk_keys(chunk_keys, prefix)
paths = [join_url(path, k) for k in chunk_keys]
offsets = [0] * len(lengths)
Expand Down Expand Up @@ -158,24 +171,7 @@ async def get_chunk_mapping(
scalar_key = f"{prefix}0"
return await _handle_scalar_array(zarr_array, path, scalar_key)

# List all keys under the array prefix, filtering out metadata files
prefix_keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)]
if not prefix_keys:
return {}

metadata_files = {".zarray", ".zattrs", ".zgroup", ".zmetadata"}
chunk_keys = []
for key_tuple in prefix_keys:
key = key_tuple[0]
file_name = (
key[len(prefix) :]
if prefix and key.startswith(prefix)
else key.split("/")[-1]
)
if file_name not in metadata_files:
chunk_keys.append(key)

return await _build_chunk_mapping(chunk_keys, zarr_array, path, prefix)
return await _build_chunk_mapping(zarr_array, path, prefix)

def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata:
"""Convert V2 metadata to V3 format."""
Expand Down Expand Up @@ -272,12 +268,7 @@ async def get_chunk_mapping(

# List chunk keys under the c/ subdirectory
prefix = f"{name}/c/" if name else "c/"
prefix_keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)]
if not prefix_keys:
return {}

chunk_keys = [x[0] for x in prefix_keys]
return await _build_chunk_mapping(chunk_keys, zarr_array, path, prefix)
return await _build_chunk_mapping(zarr_array, path, prefix)

def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata:
"""Return V3 metadata as-is (no conversion needed)."""
Expand Down Expand Up @@ -322,17 +313,28 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan
"""
strategy = get_strategy(zarr_array)
chunk_map = await strategy.get_chunk_mapping(zarr_array, path)
chunk_grid_shape = zarr_array._chunk_grid_shape

if not chunk_map:
import math

if zarr_array.shape and zarr_array.chunks:
chunk_grid_shape = tuple(
math.ceil(s / c) for s, c in zip(zarr_array.shape, zarr_array.chunks)
)
return ChunkManifest(chunk_map, shape=chunk_grid_shape)

return ChunkManifest(chunk_map)
return ChunkManifest(chunk_map, shape=chunk_grid_shape)

# Pre-allocate N-D numpy arrays shaped like the chunk grid.
# Empty string paths indicate missing chunks (sparse arrays).
paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType())
offsets_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
lengths_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))

for key, entry in chunk_map.items():
idx = parse_manifest_index(key)
paths_arr[idx] = entry["path"]
offsets_arr[idx] = entry["offset"]
lengths_arr[idx] = entry["length"]

return ChunkManifest.from_arrays(
paths=paths_arr,
offsets=offsets_arr,
lengths=lengths_arr,
)


def get_metadata(zarr_array: ZarrArrayType) -> ArrayV3Metadata:
Expand Down
64 changes: 28 additions & 36 deletions virtualizarr/tests/test_parsers/test_zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,9 @@ def test_empty_array_chunk_mapping(tmpdir, zarr_format):
"""Test chunk mapping for arrays with no chunks written yet."""
import asyncio

# Create an array but don't write any data
from obstore.store import LocalStore as ObsLocalStore
from zarr.storage import ObjectStore

filepath = f"{tmpdir}/empty.zarr"
zarr.create(
shape=(10, 10),
Expand All @@ -188,12 +190,13 @@ def test_empty_array_chunk_mapping(tmpdir, zarr_format):
)

async def get_chunk_map():
zarr_array = await open_array(store=filepath, mode="r")
obs_store = ObsLocalStore(prefix=filepath)
zarr_store = ObjectStore(store=obs_store)
zarr_array = await open_array(store=zarr_store, mode="r")
strategy = get_strategy(zarr_array)
return await strategy.get_chunk_mapping(zarr_array, filepath)

chunk_map = asyncio.run(get_chunk_map())
# Empty arrays should return empty chunk map
assert chunk_map == {}


Expand Down Expand Up @@ -306,34 +309,31 @@ def test_build_chunk_manifest_empty_with_shape():
"""Test build_chunk_manifest when chunk_map is empty but array has shape and chunks."""
import asyncio

# Create an array but don't write data
store = zarr.storage.MemoryStore()
zarr.create(shape=(10, 10), chunks=(5, 5), dtype="int8", store=store, zarr_format=3)
from obstore.store import MemoryStore as ObsMemoryStore
from zarr.storage import ObjectStore

obs_store = ObsMemoryStore()
zarr_store = ObjectStore(store=obs_store)
zarr.create(
shape=(10, 10), chunks=(5, 5), dtype="int8", store=zarr_store, zarr_format=3
)

async def get_manifest():
zarr_array = await open_array(store=store, mode="r")
zarr_array = await open_array(store=zarr_store, mode="r")
return await build_chunk_manifest(zarr_array, "test://path")

manifest = asyncio.run(get_manifest())
# Should create manifest with proper chunk grid shape even if empty
assert manifest.shape_chunk_grid == (2, 2) # 10/5 = 2 chunks per dimension
assert manifest.shape_chunk_grid == (2, 2)


@zarr_versions()
def test_sparse_array_with_missing_chunks(tmpdir, zarr_format):
"""Test that arrays with some missing chunks (sparse arrays) are handled correctly.

This test verifies that VirtualiZarr correctly handles the case where some chunks
exist but others are missing. Zarr allows this for sparse data, and when chunks
are missing, Zarr returns the fill_value for those regions. VirtualiZarr should
preserve this sparsity in the manifest rather than generating entries for all
possible chunks based on the chunk grid.
"""
"""Test that arrays with some missing chunks (sparse arrays) are handled correctly."""
import asyncio

from virtualizarr.parsers.zarr import build_chunk_manifest
from obstore.store import LocalStore as ObsLocalStore
from zarr.storage import ObjectStore

# Create a zarr array with a 3x3 chunk grid (9 possible chunks)
filepath = f"{tmpdir}/sparse.zarr"
arr = zarr.create(
shape=(30, 30),
Expand All @@ -344,36 +344,28 @@ def test_sparse_array_with_missing_chunks(tmpdir, zarr_format):
fill_value=np.nan,
)

# Only write data to some chunks, leaving others missing (sparse)
# Write to chunks (0,0), (1,1), and (2,2) - a diagonal pattern
arr[0:10, 0:10] = 1.0 # chunk 0.0
arr[10:20, 10:20] = 2.0 # chunk 1.1
arr[20:30, 20:30] = 3.0 # chunk 2.2
# Chunks (0,1), (0,2), (1,0), (1,2), (2,0), (2,1) are intentionally left unwritten

async def get_manifest():
zarr_array = await open_array(store=filepath, mode="r")
obs_store = ObsLocalStore(prefix=filepath)
zarr_store = ObjectStore(store=obs_store)
zarr_array = await open_array(store=zarr_store, mode="r")
return await build_chunk_manifest(zarr_array, filepath)

manifest = asyncio.run(get_manifest())

# The manifest should only contain the 3 chunks we actually wrote
assert len(manifest.dict()) == 3, f"Expected 3 chunks, got {len(manifest.dict())}"

# Verify the expected chunks are present
assert "0.0" in manifest.dict(), "Chunk 0.0 should be present"
assert "1.1" in manifest.dict(), "Chunk 1.1 should be present"
assert "2.2" in manifest.dict(), "Chunk 2.2 should be present"
assert len(manifest.dict()) == 3
assert "0.0" in manifest.dict()
assert "1.1" in manifest.dict()
assert "2.2" in manifest.dict()

# Verify missing chunks are not in the manifest
missing_chunks = ["0.1", "0.2", "1.0", "1.2", "2.0", "2.1"]
for chunk_key in missing_chunks:
assert chunk_key not in manifest.dict(), (
f"Chunk {chunk_key} should not be present (it's missing/sparse)"
)
assert chunk_key not in manifest.dict()

# The chunk grid shape should still reflect the full array dimensions
assert manifest.shape_chunk_grid == (3, 3), "Chunk grid should be 3x3"
assert manifest.shape_chunk_grid == (3, 3)


@zarr_versions()
Expand Down
Empty file removed virtualizarr/vendor/__init__.py
Empty file.
Empty file.
Empty file.
34 changes: 0 additions & 34 deletions virtualizarr/vendor/zarr/core/common.py

This file was deleted.

Loading