Skip to content

feat: Implement ZEP 8 URL syntax support for zarr-python #3369

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 142 additions & 0 deletions examples/zep8_url_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
"""
ZEP 8 URL Syntax Demo

This example demonstrates the new ZEP 8 URL syntax support in zarr-python.
ZEP 8 URLs allow chaining multiple storage adapters using the pipe (|) character.

Examples:
- file:/tmp/data.zip|zip: # Access ZIP file
- s3://bucket/data.zip|zip:|zarr3: # S3 → ZIP → Zarr v3
- memory:|zarr2:group/array # Memory → Zarr v2
"""

import json
import tempfile
import zipfile
from pathlib import Path

import numpy as np

import zarr


def demo_basic_zep8() -> None:
    """Write a small array through a ``memory:`` URL, then reopen and print it."""
    print("=== Basic ZEP 8 URL Demo ===")

    # Write phase: open the URL in write mode and fill the five-element array.
    print("1. Creating test data with memory: URL")
    writer = zarr.open_array("memory:test1", mode="w", shape=(5,), dtype="i4")
    writer[:] = [1, 2, 3, 4, 5]
    written = list(writer[:])
    print(f"Created array: {written}")

    # Read phase: reopen the very same URL, this time read-only.
    reader = zarr.open_array("memory:test1", mode="r")
    read_back = list(reader[:])
    print(f"Read array: {read_back}")
    print()


def demo_zip_chaining() -> None:
    """Build a throwaway ZIP archive and show the ZEP 8 URL that addresses it.

    The archive is written into a temporary directory (removed when the
    function returns) and contains a raw data blob plus a simplified
    ``zarr.json`` metadata document. The ZIP is then opened with
    ``zarr.storage.ZipStore`` and its contents are listed. Any failure while
    opening or listing is reported as a note instead of aborting the demo.
    """
    print("=== ZIP Chaining Demo ===")

    with tempfile.TemporaryDirectory() as tmpdir:
        zip_path = Path(tmpdir) / "test_data.zip"

        # Create a ZIP file with some zarr data
        print(f"2. Creating ZIP file at {zip_path}")
        with zipfile.ZipFile(zip_path, "w") as zf:
            # Create some test array data manually
            array_data = np.array([10, 20, 30, 40, 50])
            zf.writestr("array/data", array_data.tobytes())

            # Basic metadata (simplified)
            metadata = {
                "zarr_format": 3,
                "shape": [5],
                "chunk_grid": {"type": "regular", "chunk_shape": [5]},
                "data_type": {"name": "int64", "endian": "little"},
                "codecs": [{"name": "bytes", "endian": "little"}],
            }
            # Serialize with the JSON encoder rather than rewriting the quotes
            # of the dict's repr: the repr trick yields invalid JSON as soon
            # as a value is a bool, None, or a string containing a quote.
            zf.writestr("array/zarr.json", json.dumps(metadata))

        print(f"Created ZIP file: {zip_path}")

        # Now access via ZEP 8 URL
        print("3. Accessing ZIP contents via ZEP 8 URL")
        try:
            zip_url = f"file:{zip_path}|zip:"
            print(f"Using URL: {zip_url}")

            # List contents (this would work with a proper zarr structure)
            store = zarr.storage.ZipStore(zip_path)
            print(f"ZIP contents: {list(store.list())}")

            print("✅ ZIP chaining demo completed successfully")
        except Exception as e:
            # Deliberately broad: this is a best-effort demo step, so any
            # failure is surfaced as a note and the script keeps going.
            print(f"Note: {e}")
            print("(ZIP chaining requires proper zarr metadata structure)")
    print()


def demo_format_specification() -> None:
    """Create one array per zarr format by naming the format in the URL.

    A ``|zarr3:`` URL is tried first, then a ``|zarr2:`` URL. If either step
    raises, the error is printed as a note and the remaining steps are skipped.
    """
    print("=== Zarr Format Specification Demo ===")

    # Create arrays with different zarr formats via URL
    print("4. Creating arrays with zarr format specifications")

    def _write_then_read(url: str, values: list[float]) -> list:
        # Open a fresh 3-element float32 array at `url`, fill it, read it back.
        target = zarr.open_array(url, mode="w", shape=(3,), dtype="f4")
        target[:] = values
        return list(target[:])

    try:
        # Zarr v3 format (explicitly specified)
        v3_contents = _write_then_read("memory:test_v3|zarr3:", [1.1, 2.2, 3.3])
        print(f"Zarr v3 array: {v3_contents}")

        # Zarr v2 format (explicitly specified)
        v2_contents = _write_then_read("memory:test_v2|zarr2:", [4.4, 5.5, 6.6])
        print(f"Zarr v2 array: {v2_contents}")

        print("✅ Format specification demo completed successfully")
    except Exception as e:
        print(f"Note: {e}")
        print("(Format specification requires full ZEP 8 implementation)")
    print()


def demo_complex_chaining() -> None:
    """Print a handful of conceptual multi-stage ZEP 8 URLs and what they show.

    Nothing is opened or resolved here; the URLs are only displayed.
    """
    print("=== Complex Chaining Demo ===")

    print("5. Complex chaining examples (conceptual)")

    # These are examples of what ZEP 8 enables:
    sample_urls = (
        "s3://mybucket/data.zip|zip:subdir/|zarr3:",
        "https://example.com/dataset.tar.gz|tar.gz:|zarr2:group/array",
        "file:/data/archive.7z|7z:experiments/|zarr3:results",
        "memory:cache|zarr3:temp/analysis",
    )
    # One URL per output line, each behind a single leading space.
    print(*(f" {url}" for url in sample_urls), sep="\n")

    print("These URLs demonstrate the power of ZEP 8:")
    takeaways = (
        " - Chain multiple storage layers",
        " - Specify zarr format versions",
        " - Navigate within nested structures",
        " - Support both local and remote sources",
    )
    for takeaway in takeaways:
        print(takeaway)
    print()


if __name__ == "__main__":
    # Script entry point: banner first, then every demo in order, then the
    # closing pointers to the specification.
    print("ZEP 8 URL Syntax Demo for zarr-python")
    print("=" * 50)

    demos = (
        demo_basic_zep8,
        demo_zip_chaining,
        demo_format_specification,
        demo_complex_chaining,
    )
    for run_demo in demos:
        run_demo()

    print("Demo completed! 🎉")
    print("\nZEP 8 URL syntax enables powerful storage chaining capabilities.")
    print("See https://zarr-specs.readthedocs.io/en/zep8/zep8.html for full specification.")
3 changes: 3 additions & 0 deletions src/zarr/abc/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from zarr.abc.store_adapter import StoreAdapter, URLSegment

__all__ = ["StoreAdapter", "URLSegment"]
196 changes: 196 additions & 0 deletions src/zarr/abc/store_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
"""
Store adapter interface for ZEP 8 URL syntax support.

This module defines the protocol that store implementations must follow
to be usable in ZEP 8 URL chains.
"""

from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from typing import Any

from zarr.abc.store import Store

__all__ = ["StoreAdapter", "URLSegment"]


@dataclass(frozen=True)
class URLSegment:
    """
    Represents a segment in a ZEP 8 URL chain.

    A segment must carry a scheme, an adapter name, or both; construction
    fails otherwise. When an adapter name is given it must consist solely of
    ASCII letters, digits, ``_`` and ``-``, and must not start with ``_`` or
    ``-``.

    Examples:
    - "zip:" -> URLSegment(scheme=None, adapter="zip", path="")
    - "s3://bucket/data" -> URLSegment(scheme="s3", adapter=None, path="bucket/data")
    - "zip:inner/path" -> URLSegment(scheme=None, adapter="zip", path="inner/path")

    Raises
    ------
    ZEP8URLError
        If neither ``scheme`` nor ``adapter`` is set, or if ``adapter`` is not
        a valid adapter name.
    """

    scheme: str | None = None
    """The URL scheme (e.g., 's3', 'file', 'https') for the first segment."""

    adapter: str | None = None
    """The store adapter name (e.g., 'zip', 'icechunk', 'zarr3')."""

    path: str = ""
    """Path component for the segment."""

    def __post_init__(self) -> None:
        """Validate the URL segment."""
        import re

        problem: str | None = None
        if not self.scheme and not self.adapter:
            problem = "URL segment must have either scheme or adapter"
        # fullmatch, not match with "^...$": "$" also matches just before a
        # trailing newline, which would let a name such as "zip\n" through.
        elif self.adapter and not re.fullmatch(r"[a-zA-Z0-9][a-zA-Z0-9_-]*", self.adapter):
            problem = f"Invalid adapter name: {self.adapter}"

        if problem is not None:
            # Imported only on the failure path, keeping valid construction
            # free of the extra import.
            from zarr.storage._zep8 import ZEP8URLError

            raise ZEP8URLError(problem)


class StoreAdapter(ABC):
    """
    Abstract base class for store adapters that can be resolved from ZEP 8 URLs.

    Store adapters enable stores to participate in ZEP 8 URL chains by implementing
    the from_url_segment class method. This allows stores to be created from URL
    components and optionally wrap or chain with other stores.

    Every subclass must set ``adapter_name`` to a non-empty string; this is
    checked when the subclass is defined (see ``__init_subclass__``).

    Examples
    --------
    A memory adapter that creates in-memory storage:

    >>> class MemoryAdapter(StoreAdapter):
    ...     adapter_name = "memory"
    ...
    ...     @classmethod
    ...     async def from_url_segment(cls, segment, preceding_url, **kwargs):
    ...         from zarr.storage import MemoryStore
    ...         return await MemoryStore.open()

    An icechunk adapter that uses native icechunk storage:

    >>> class IcechunkAdapter(StoreAdapter):
    ...     adapter_name = "icechunk"
    ...
    ...     @classmethod
    ...     async def from_url_segment(cls, segment, preceding_url, **kwargs):
    ...         import icechunk
    ...         if preceding_url.startswith('s3://'):
    ...             storage = icechunk.s3_storage(bucket='...', prefix='...')
    ...         elif preceding_url.startswith('file:'):
    ...             storage = icechunk.local_filesystem_storage(path='...')
    ...         repo = icechunk.Repository.open_existing(storage)
    ...         return repo.readonly_session('main').store
    """

    # Class-level registration info
    adapter_name: str
    """The name used to identify this adapter in URLs (e.g., 'zip', 'icechunk')."""

    @classmethod
    @abstractmethod
    async def from_url_segment(
        cls,
        segment: URLSegment,
        preceding_url: str,
        **kwargs: Any,
    ) -> Store:
        """
        Create a store from a URL segment and preceding URL.

        This method is the core of the store adapter interface. It receives
        a URL segment and the full preceding URL, allowing each adapter to
        use its native storage implementations.

        Parameters
        ----------
        segment : URLSegment
            The URL segment containing adapter name and optional path.
        preceding_url : str
            The full URL before this adapter segment (e.g., 'file:/path', 's3://bucket/key').
            This allows the adapter to use its native storage implementations.
        **kwargs : Any
            Additional keyword arguments from the URL resolution context,
            such as storage_options, mode, etc.

        Returns
        -------
        Store
            A configured store instance ready for use.

        Raises
        ------
        ValueError
            If required parameters are missing or invalid.
        NotImplementedError
            If the adapter cannot handle the given configuration.

        Notes
        -----
        This design allows each adapter to interpret the preceding URL using its own
        native storage backends. For example:
        - Icechunk adapter can use icechunk.s3_storage() for s3:// URLs
        - ZIP adapter can use fsspec for remote file access
        - Each adapter maintains full control over its storage layer

        Examples
        --------
        For URL "file:/tmp/repo|icechunk:branch:main":
        - segment.adapter = "icechunk"
        - segment.path = "branch:main"
        - preceding_url = "file:/tmp/repo"
        """
        ...

    @classmethod
    def can_handle_scheme(cls, scheme: str) -> bool:
        """
        Check if this adapter can handle a given URL scheme.

        This method allows adapters to indicate they can handle
        specific URL schemes directly, even when not in a ZEP 8 chain.
        The base implementation always answers ``False``.

        Parameters
        ----------
        scheme : str
            The URL scheme to check (e.g., 's3', 'https', 'file').

        Returns
        -------
        bool
            True if this adapter can handle the scheme.
        """
        return False

    @classmethod
    def get_supported_schemes(cls) -> list[str]:
        """
        Get list of URL schemes this adapter supports.

        The base implementation returns an empty list.

        Returns
        -------
        list[str]
            List of supported URL schemes.
        """
        return []

    def __init_subclass__(cls, **kwargs: Any) -> None:
        """
        Validate adapter implementation on subclass creation.

        Raises
        ------
        TypeError
            If ``adapter_name`` is missing, falsy, or not a string.
        ValueError
            If ``adapter_name`` does not start with an ASCII letter or contains
            characters other than ASCII letters, digits, ``_`` and ``-``.
        """
        super().__init_subclass__(**kwargs)

        # Ensure adapter_name is defined
        if not hasattr(cls, "adapter_name") or not cls.adapter_name:
            raise TypeError(f"StoreAdapter subclass {cls.__name__} must define 'adapter_name'")

        # Validate adapter_name format
        if not isinstance(cls.adapter_name, str):
            raise TypeError(f"adapter_name must be a string, got {type(cls.adapter_name)}")

        import re

        # fullmatch, not match with "^...$": "$" also matches just before a
        # trailing newline, which would accept a name such as "zip\n".
        # NOTE(review): URLSegment's adapter pattern also allows a leading
        # digit (e.g. "7z") while this one requires a letter - confirm which
        # rule is intended.
        if not re.fullmatch(r"[a-zA-Z][a-zA-Z0-9_-]*", cls.adapter_name):
            raise ValueError(f"Invalid adapter_name format: {cls.adapter_name}")
31 changes: 28 additions & 3 deletions src/zarr/api/asynchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
)
from zarr.storage import StorePath
from zarr.storage._common import make_store_path
from zarr.storage._zep8 import URLStoreResolver, is_zep8_url

if TYPE_CHECKING:
from collections.abc import Iterable
Expand All @@ -59,9 +60,33 @@
from zarr.core.chunk_key_encodings import ChunkKeyEncoding
from zarr.storage import StoreLike

# TODO: this type could use some more thought
ArrayLike = AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | Array | npt.NDArray[Any]
PathLike = str

def _parse_zep8_zarr_format(store: str) -> tuple[str, int | None]:
    """
    Separate a store string from the zarr format named inside it.

    Parameters
    ----------
    store : str
        Store location; only strings accepted by ``is_zep8_url`` are inspected.

    Returns
    -------
    tuple[str, int | None]
        ``(store_url_without_format, zarr_format)``. When ``store`` is not a
        ZEP 8 URL, or the resolver reports no format, the input is returned
        unchanged together with ``None``. Otherwise every ``"|zarr2:"`` and
        ``"|zarr3:"`` substring is stripped from the returned URL.
    """
    if is_zep8_url(store):
        detected_format = URLStoreResolver().extract_zarr_format(store)

        if detected_format:
            # Remove zarr format from URL for store creation.
            # Simple removal - in real implementation would properly parse/reconstruct
            # NOTE(review): text that followed a removed marker stays attached
            # to the preceding segment - confirm callers expect that.
            cleaned = store
            for marker in ("|zarr2:", "|zarr3:"):
                cleaned = cleaned.replace(marker, "")
            return cleaned, detected_format

    return store, None


ArrayLike = AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | Array | npt.NDArray[Any]
PathLike = str

__all__ = [
"array",
Expand Down
Loading
Loading