From c84fe506b9352a7668b00b9f4f35a841710651ef Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Mon, 20 Jan 2025 14:30:22 -0600 Subject: [PATCH 1/3] DOC: Add memory-mapping example to storage guide Fixes #1245 Add documentation and tests for memory-mapped store, focusing on efficient access to small slices from large uncompressed chunks. --- docs/user-guide/storage.rst | 31 +++++++++++++ tests/test_store/test_mmap.py | 85 +++++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 tests/test_store/test_mmap.py diff --git a/docs/user-guide/storage.rst b/docs/user-guide/storage.rst index 46505271b4..927cd7d3ba 100644 --- a/docs/user-guide/storage.rst +++ b/docs/user-guide/storage.rst @@ -99,6 +99,37 @@ Zarr data (metadata and chunks) to a dictionary.: >>> zarr.create_array(store=store, shape=(2,), dtype='float64') +Memory-Mapped Store +~~~~~~~~~~~~~~~~~~~~ + +For performance optimization when working with uncompressed data, you can create a memory-mapped store by subclassing :class:`zarr.storage.LocalStore`. +Memory mapping allows direct access to portions of chunk data without loading entire chunks into memory, which can be beneficial when you need to +read small slices from large chunks.: + + >>> import mmap + >>> from zarr.storage import LocalStore + >>> + >>> class MemoryMappedDirectoryStore(LocalStore): + ... def _fromfile(self, fn): + ... with open(fn, "rb") as fh: + ... return memoryview(mmap.mmap(fh.fileno(), 0, prot=mmap.PROT_READ)) + >>> + >>> # Create a memory-mapped store + >>> store = MemoryMappedDirectoryStore('data/example.zarr') + >>> z = zarr.open_array(store=store) + +For example, if you have an array with large 1000x1000 chunks and frequently need to access small 100x100 sections, +memory mapping can provide efficient access by mapping only the needed portions into memory +rather than loading entire chunks.: + + >>> # Create an array with large chunks + >>> z = zarr.create_array('data/example.zarr', shape=(10000, 10000), chunks=(1000, 1000), dtype='float64') + >>> # Later, open with memory mapping for efficient chunk access + >>> mmap_store = MemoryMappedDirectoryStore('data/example.zarr') + >>> z = zarr.open_array(store=mmap_store) + >>> # Access specific chunks efficiently + >>> chunk_data = z[500:600, 500:600] # Only maps the needed chunks into memory + .. 
_user-guide-custom-stores: Developing custom stores diff --git a/tests/test_store/test_mmap.py b/tests/test_store/test_mmap.py new file mode 100644 index 0000000000..0f7a81bc45 --- /dev/null +++ b/tests/test_store/test_mmap.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import mmap +import pytest + +import zarr +from zarr.core.buffer import Buffer, cpu +from zarr.storage import LocalStore +from zarr.testing.store import StoreTests + +if TYPE_CHECKING: + import pathlib + + +class MemoryMappedDirectoryStore(LocalStore): + def _fromfile(self, fn): + with open(fn, "rb") as fh: + return memoryview(mmap.mmap(fh.fileno(), 0, prot=mmap.PROT_READ)) + + +class TestMemoryMappedDirectoryStore(StoreTests[MemoryMappedDirectoryStore, cpu.Buffer]): + store_cls = MemoryMappedDirectoryStore + buffer_cls = cpu.Buffer + + async def get(self, store: MemoryMappedDirectoryStore, key: str) -> Buffer: + return self.buffer_cls.from_bytes((store.root / key).read_bytes()) + + async def set(self, store: MemoryMappedDirectoryStore, key: str, value: Buffer) -> None: + parent = (store.root / key).parent + if not parent.exists(): + parent.mkdir(parents=True) + (store.root / key).write_bytes(value.to_bytes()) + + @pytest.fixture + def store_kwargs(self, tmpdir) -> dict[str, str]: + return {"root": str(tmpdir)} + + def test_store_repr(self, store: MemoryMappedDirectoryStore) -> None: + assert str(store) == f"file://{store.root.as_posix()}" + + def test_store_supports_writes(self, store: MemoryMappedDirectoryStore) -> None: + assert store.supports_writes + + def test_store_supports_partial_writes(self, store: MemoryMappedDirectoryStore) -> None: + assert store.supports_partial_writes + + def test_store_supports_listing(self, store: MemoryMappedDirectoryStore) -> None: + assert store.supports_listing + + async def test_empty_with_empty_subdir(self, store: MemoryMappedDirectoryStore) -> None: + assert await store.is_empty("") + (store.root / "foo/bar").mkdir(parents=True) + assert await store.is_empty("") + + def test_creates_new_directory(self, tmp_path: pathlib.Path): + target = tmp_path.joinpath("a", "b", "c") + assert not target.exists() + + store = self.store_cls(root=target) + zarr.group(store=store) + + async def test_mmap_slice_reads(self, store: MemoryMappedDirectoryStore) -> None: + """Test reading slices with memory mapping""" + # Create array with large chunks + z = zarr.create_array(store=store, shape=(2000, 2000), chunks=(1000, 1000), + dtype='float64') + # Write test data + data = zarr.full(shape=(2000, 2000), chunks=(1000, 1000), fill_value=42.0, + dtype='float64') + z[:] = data[:] + + # Test reading various slices + slices = [ + # Within single chunk + (slice(100, 200), slice(100, 200)), + # Across chunk boundaries + (slice(900, 1100), slice(900, 1100)), + # Full chunk + (slice(0, 1000), slice(0, 1000)) + ] + + for test_slice in slices: + assert (z[test_slice] == data[test_slice]).all() From 83a641844e5dd5481fd6583f801b14dfb7ec3e89 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Mon, 20 Jan 2025 14:40:00 -0600 Subject: [PATCH 2/3] style: fix pre-commit issues --- docs/user-guide/storage.rst | 6 +++--- tests/test_store/test_mmap.py | 16 +++++++--------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/docs/user-guide/storage.rst b/docs/user-guide/storage.rst index 927cd7d3ba..bb7f9b9fe0 100644 --- a/docs/user-guide/storage.rst +++ b/docs/user-guide/storage.rst @@ -102,8 +102,8 @@ Zarr data (metadata and chunks) to a dictionary.: Memory-Mapped Store 
~~~~~~~~~~~~~~~~~~~~ -For performance optimization when working with uncompressed data, you can create a memory-mapped store by subclassing :class:`zarr.storage.LocalStore`. -Memory mapping allows direct access to portions of chunk data without loading entire chunks into memory, which can be beneficial when you need to +For performance optimization when working with uncompressed data, you can create a memory-mapped store by subclassing :class:`zarr.storage.LocalStore`. +Memory mapping allows direct access to portions of chunk data without loading entire chunks into memory, which can be beneficial when you need to read small slices from large chunks.: >>> import mmap @@ -119,7 +119,7 @@ read small slices from large chunks.: >>> z = zarr.open_array(store=store) For example, if you have an array with large 1000x1000 chunks and frequently need to access small 100x100 sections, -memory mapping can provide efficient access by mapping only the needed portions into memory +memory mapping can provide efficient access by mapping only the needed portions into memory, rather than loading entire chunks.: >>> # Create an array with large chunks diff --git a/tests/test_store/test_mmap.py b/tests/test_store/test_mmap.py index 0f7a81bc45..1919e26f90 100644 --- a/tests/test_store/test_mmap.py +++ b/tests/test_store/test_mmap.py @@ -1,8 +1,8 @@ from __future__ import annotations +import mmap from typing import TYPE_CHECKING -import mmap import pytest import zarr @@ -15,7 +15,7 @@ class MemoryMappedDirectoryStore(LocalStore): - def _fromfile(self, fn): + def _fromfile(self, fn: str) -> memoryview: with open(fn, "rb") as fh: return memoryview(mmap.mmap(fh.fileno(), 0, prot=mmap.PROT_READ)) @@ -64,13 +64,11 @@ def test_creates_new_directory(self, tmp_path: pathlib.Path): async def test_mmap_slice_reads(self, store: MemoryMappedDirectoryStore) -> None: """Test reading slices with memory mapping""" # Create array with large chunks - z = zarr.create_array(store=store, shape=(2000, 2000), chunks=(1000, 1000), - dtype='float64') + z = zarr.create_array(store=store, shape=(2000, 2000), chunks=(1000, 1000), dtype="float64") # Write test data - data = zarr.full(shape=(2000, 2000), chunks=(1000, 1000), fill_value=42.0, - dtype='float64') + data = zarr.full(shape=(2000, 2000), chunks=(1000, 1000), fill_value=42.0, dtype="float64") z[:] = data[:] - + # Test reading various slices slices = [ # Within single chunk @@ -78,8 +76,8 @@ async def test_mmap_slice_reads(self, store: MemoryMappedDirectoryStore) -> None # Across chunk boundaries (slice(900, 1100), slice(900, 1100)), # Full chunk - (slice(0, 1000), slice(0, 1000)) + (slice(0, 1000), slice(0, 1000)), ] - + for test_slice in slices: assert (z[test_slice] == data[test_slice]).all() From aa01fb6f9e088c17250a5b90b902a6bae4f334b6 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Mon, 20 Jan 2025 14:44:01 -0600 Subject: [PATCH 3/3] DOC: fix trailing whitespace in memory-mapping example --- docs/user-guide/storage.rst | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/docs/user-guide/storage.rst b/docs/user-guide/storage.rst index bb7f9b9fe0..793a10f515 100644 --- a/docs/user-guide/storage.rst +++ b/docs/user-guide/storage.rst @@ -114,21 +114,18 @@ read small slices from large chunks.: ... with open(fn, "rb") as fh: ... 
return memoryview(mmap.mmap(fh.fileno(), 0, prot=mmap.PROT_READ)) >>> - >>> # Create a memory-mapped store - >>> store = MemoryMappedDirectoryStore('data/example.zarr') - >>> z = zarr.open_array(store=store) - -For example, if you have an array with large 1000x1000 chunks and frequently need to access small 100x100 sections, -memory mapping can provide efficient access by mapping only the needed portions into memory, -rather than loading entire chunks.: - >>> # Create an array with large chunks >>> z = zarr.create_array('data/example.zarr', shape=(10000, 10000), chunks=(1000, 1000), dtype='float64') - >>> # Later, open with memory mapping for efficient chunk access + >>> z[:] = 42 # Fill with test data + >>> + >>> # Open with memory mapping for efficient access >>> mmap_store = MemoryMappedDirectoryStore('data/example.zarr') >>> z = zarr.open_array(store=mmap_store) - >>> # Access specific chunks efficiently - >>> chunk_data = z[500:600, 500:600] # Only maps the needed chunks into memory + >>> + >>> # Access small slices efficiently + >>> chunk_data = z[500:600, 500:600] # Only maps the needed portion into memory + >>> chunk_data[0, 0] # Verify data + 42.0 .. _user-guide-custom-stores:
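
Note on portability: `mmap.PROT_READ`, used in the `_fromfile` override above, is only available on POSIX systems. A minimal cross-platform sketch of the same subclass, assuming the `_fromfile` hook shown in the patches above and not itself part of the patch series, would pass `access=mmap.ACCESS_READ` instead, which Python's `mmap` module accepts on both POSIX and Windows:

    import mmap

    from zarr.storage import LocalStore


    class MemoryMappedDirectoryStore(LocalStore):
        """LocalStore variant that memory-maps chunk files instead of reading them fully."""

        def _fromfile(self, fn: str) -> memoryview:
            # access=mmap.ACCESS_READ requests a read-only mapping and works on
            # both POSIX and Windows, unlike prot=mmap.PROT_READ.
            with open(fn, "rb") as fh:
                return memoryview(mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ))

Returning a `memoryview` over the mapping lets a small slice of an uncompressed chunk be handed to downstream consumers without copying the whole chunk; pages backing the unused parts of the chunk are never faulted into memory.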