Commit f9c8c09

move cache store to experimental, fix bugs
1 parent 1f200ed commit f9c8c09

File tree: 4 files changed, +493 -19 lines


docs/user-guide/cachingstore.md

Lines changed: 293 additions & 0 deletions
@@ -0,0 +1,293 @@

# CacheStore guide

The `zarr.storage.CacheStore` provides a dual-store caching implementation
that can be wrapped around any Zarr store to improve performance for repeated data access.
This is particularly useful when working with remote stores (e.g., S3, HTTP), where network
latency can significantly impact data access speed.

The CacheStore uses a separate Store instance as its cache backend, providing persistent
caching with time-based expiration, size-based eviction, and flexible cache storage options.
It automatically evicts the least recently used items when the cache reaches its maximum size.

> **Note:** The CacheStore is a wrapper store that maintains compatibility with the full
> `zarr.abc.store.Store` API while adding transparent caching functionality.

## Basic Usage

Creating a CacheStore requires both a source store and a cache store. The cache store
can be any Store implementation, providing flexibility in cache persistence:

```python
import zarr
import zarr.storage
import numpy as np

# Create a local store and a separate cache store
source_store = zarr.storage.LocalStore('test.zarr')
cache_store = zarr.storage.MemoryStore()  # In-memory cache
cached_store = zarr.storage.CacheStore(
    store=source_store,
    cache_store=cache_store,
    max_size=256*1024*1024  # 256MB cache
)

# Create an array using the cached store
zarr_array = zarr.zeros((100, 100), chunks=(10, 10), dtype='f8', store=cached_store, mode='w')

# Write some data to force chunk creation
zarr_array[:] = np.random.random((100, 100))
```

The dual-store architecture allows you to use different store types for source and cache,
such as a remote store for source data and a local store for persistent caching.

## Performance Benefits

The CacheStore provides significant performance improvements for repeated data access:

```python
import time

# Benchmark reading with cache
start = time.time()
for _ in range(100):
    _ = zarr_array[:]
elapsed_cache = time.time() - start

# Compare with direct store access (without cache)
zarr_array_nocache = zarr.open('test.zarr', mode='r')
start = time.time()
for _ in range(100):
    _ = zarr_array_nocache[:]
elapsed_nocache = time.time() - start

# Cache provides speedup for repeated access
speedup = elapsed_nocache / elapsed_cache
```

Cache effectiveness is particularly pronounced with repeated access to the same data chunks.
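
To see the effect concretely, the measured ratio can be reported alongside the current cache
state. The small follow-up snippet below reuses the `speedup` and `cached_store` variables
defined above and only relies on the documented `cache_info()` keys:

```python
# Report the measured speedup and what the cache currently holds
info = cached_store.cache_info()
print(f"speedup from caching: {speedup:.1f}x")
print(f"cached keys: {info['cached_keys']}, cached bytes: {info['current_size']}")
```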

## Remote Store Caching

The CacheStore is most beneficial when used with remote stores where network latency
is a significant factor. You can use different store types for source and cache:

```python
from zarr.storage import FsspecStore, LocalStore

# Create a remote store (S3 example) - for demonstration only
remote_store = FsspecStore.from_url('s3://bucket/data.zarr', storage_options={'anon': True})

# Use a local store for persistent caching
local_cache_store = LocalStore('cache_data')

# Create cached store with persistent local cache
cached_store = zarr.storage.CacheStore(
    store=remote_store,
    cache_store=local_cache_store,
    max_size=512*1024*1024  # 512MB cache
)

# Open array through cached store
z = zarr.open(cached_store)
```

The first access to any chunk will be slow (network retrieval), but subsequent accesses
to the same chunk will be served from the local cache, providing a dramatic speedup.
The cache persists between sessions when using a LocalStore for the cache backend.
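
Because the cache backend is an ordinary store, a later session can point a new CacheStore at
the same local cache directory and reuse chunks fetched earlier. The following is a minimal
sketch of that pattern; the bucket URL and the `cache_data` path are placeholders carried over
from the example above:

```python
import zarr
import zarr.storage
from zarr.storage import FsspecStore, LocalStore

# New session: reattach to the cache directory populated earlier
remote_store = FsspecStore.from_url('s3://bucket/data.zarr', storage_options={'anon': True})
warm_cache = LocalStore('cache_data')  # same path as the previous session

cached_store = zarr.storage.CacheStore(
    store=remote_store,
    cache_store=warm_cache,
    max_size=512*1024*1024
)

# Chunks already present in 'cache_data' are served locally instead of being
# downloaded again (subject to max_age_seconds expiration).
z = zarr.open(cached_store)
```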

## Cache Configuration

The CacheStore can be configured with several parameters:

**max_size**: Controls the maximum size of cached data in bytes

```python
# 256MB cache with size limit
cache = zarr.storage.CacheStore(
    store=source_store,
    cache_store=cache_store,
    max_size=256*1024*1024
)

# Unlimited cache size (use with caution)
cache = zarr.storage.CacheStore(
    store=source_store,
    cache_store=cache_store,
    max_size=None
)
```

**max_age_seconds**: Controls time-based cache expiration

```python
# Cache expires after 1 hour
cache = zarr.storage.CacheStore(
    store=source_store,
    cache_store=cache_store,
    max_age_seconds=3600
)

# Cache never expires
cache = zarr.storage.CacheStore(
    store=source_store,
    cache_store=cache_store,
    max_age_seconds="infinity"
)
```

**cache_set_data**: Controls whether written data is cached

```python
# Cache data when writing (default)
cache = zarr.storage.CacheStore(
    store=source_store,
    cache_store=cache_store,
    cache_set_data=True
)

# Don't cache written data (read-only cache)
cache = zarr.storage.CacheStore(
    store=source_store,
    cache_store=cache_store,
    cache_set_data=False
)
```

## Cache Statistics

The CacheStore provides statistics to monitor cache performance and state:

```python
# Access some data to generate cache activity
data = zarr_array[0:50, 0:50]  # First access - cache miss
data = zarr_array[0:50, 0:50]  # Second access - cache hit

# Get comprehensive cache information
info = cached_store.cache_info()
print(info['cache_store_type'])  # e.g., 'MemoryStore'
print(info['max_age_seconds'])
print(info['max_size'])
print(info['current_size'])
print(info['tracked_keys'])
print(info['cached_keys'])
print(info['cache_set_data'])
```

The `cache_info()` method returns a dictionary with detailed information about the cache state.
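
As a small illustration, the reported fields can be combined to estimate how full the cache is.
The helper below is not part of the CacheStore API; it is a sketch that relies only on the
documented `cache_info()` keys:

```python
def cache_utilization(store) -> float | None:
    """Fraction of max_size currently used, or None when the cache is unbounded."""
    info = store.cache_info()
    if info['max_size'] is None:
        return None
    return info['current_size'] / info['max_size']

usage = cache_utilization(cached_store)
if usage is not None:
    print(f"cache is {usage:.1%} full")
```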

## Cache Management

The CacheStore provides methods for manual cache management:

```python
# Clear all cached data and tracking information
import asyncio
asyncio.run(cached_store.clear_cache())

# Check cache info after clearing
info = cached_store.cache_info()
assert info['tracked_keys'] == 0
assert info['current_size'] == 0
```

The `clear_cache()` method is an async method that clears both the cache store
(if it supports the `clear` method) and all internal tracking data.
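
If you are already running inside an event loop (async application code, or a notebook with
top-level `await`), call `clear_cache()` with `await` instead of `asyncio.run`. A minimal
sketch, assuming the `cached_store` from the examples above:

```python
import asyncio

async def refresh_cache(store) -> None:
    # Drop all cached entries, then confirm the tracking state was reset
    await store.clear_cache()
    info = store.cache_info()
    print(f"tracked keys: {info['tracked_keys']}, cached bytes: {info['current_size']}")

# From synchronous code, run the coroutine explicitly:
asyncio.run(refresh_cache(cached_store))
```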

## Best Practices

1. **Choose appropriate cache store**: Use MemoryStore for fast temporary caching or LocalStore for persistent caching
2. **Size the cache appropriately**: Set `max_size` based on available storage and expected data access patterns
3. **Use with remote stores**: The cache provides the most benefit when wrapping slow remote stores
4. **Monitor cache statistics**: Use `cache_info()` to tune cache size and access patterns
5. **Consider data locality**: Group related data accesses together to improve cache efficiency
6. **Set appropriate expiration**: Use `max_age_seconds` for time-sensitive data or "infinity" for static data
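
Putting several of these recommendations together, the sketch below pairs a remote source with
a bounded, persistent local cache; the bucket URL, cache path, and limits are illustrative
placeholders rather than suggested defaults:

```python
import zarr
import zarr.storage
from zarr.storage import FsspecStore, LocalStore

# Slow remote source, fast local cache that survives restarts
source = FsspecStore.from_url('s3://bucket/data.zarr', storage_options={'anon': True})
local_cache = LocalStore('zarr_cache')

cached_store = zarr.storage.CacheStore(
    store=source,
    cache_store=local_cache,
    max_size=1024*1024*1024,   # cap the cache at ~1GB on disk
    max_age_seconds=24*3600,   # re-fetch chunks older than a day
    cache_set_data=False       # read-mostly workload: don't cache writes
)

z = zarr.open(cached_store)

# Revisit max_size and max_age_seconds based on what cache_info() reports
print(cached_store.cache_info())
```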

## Working with Different Store Types

The CacheStore can wrap any store that implements the `zarr.abc.store.Store` interface
and use any store type for the cache backend:

### Local Store with Memory Cache

```python
from zarr.storage import LocalStore, MemoryStore

source_store = LocalStore('data.zarr')
cache_store = MemoryStore()
cached_store = zarr.storage.CacheStore(
    store=source_store,
    cache_store=cache_store,
    max_size=128*1024*1024
)
```

### Remote Store with Local Cache

```python
from zarr.storage import FsspecStore, LocalStore

remote_store = FsspecStore.from_url('s3://bucket/data.zarr', storage_options={'anon': True})
local_cache = LocalStore('local_cache')
cached_store = zarr.storage.CacheStore(
    store=remote_store,
    cache_store=local_cache,
    max_size=1024*1024*1024,
    max_age_seconds=3600
)
```

### Memory Store with Persistent Cache

```python
from zarr.storage import MemoryStore, LocalStore

memory_store = MemoryStore()
persistent_cache = LocalStore('persistent_cache')
cached_store = zarr.storage.CacheStore(
    store=memory_store,
    cache_store=persistent_cache,
    max_size=256*1024*1024
)
```

The dual-store architecture provides flexibility in choosing the best combination
of source and cache stores for your specific use case.

## Examples from Real Usage

Here's a complete example demonstrating cache effectiveness:

```python
import zarr
import zarr.storage
import time
import numpy as np

# Create test data with dual-store cache
source_store = zarr.storage.LocalStore('benchmark.zarr')
cache_store = zarr.storage.MemoryStore()
cached_store = zarr.storage.CacheStore(
    store=source_store,
    cache_store=cache_store,
    max_size=256*1024*1024
)
zarr_array = zarr.zeros((100, 100), chunks=(10, 10), dtype='f8', store=cached_store, mode='w')
zarr_array[:] = np.random.random((100, 100))

# Demonstrate cache effectiveness with repeated access
start = time.time()
data = zarr_array[20:30, 20:30]  # First access (cache miss)
first_access = time.time() - start

start = time.time()
data = zarr_array[20:30, 20:30]  # Second access (cache hit)
second_access = time.time() - start

# Check cache statistics
info = cached_store.cache_info()
assert info['cached_keys'] > 0  # Should have cached keys
assert info['current_size'] > 0  # Should have cached data
```

This example shows how the CacheStore can significantly reduce access times for repeated
data reads, which is particularly important when working with remote data sources. The
dual-store architecture allows for flexible cache persistence and management.

src/zarr/storage/_caching_store.py renamed to src/zarr/experimental/cache_store.py

Lines changed: 18 additions & 11 deletions
@@ -14,9 +14,6 @@
 if TYPE_CHECKING:
     from zarr.core.buffer.core import Buffer, BufferPrototype
 
-if TYPE_CHECKING:
-    from zarr.core.buffer.core import Buffer, BufferPrototype
-
 
 def buffer_size(v: Any) -> int:
     """Calculate the size in bytes of a value, handling Buffer objects properly."""
@@ -123,7 +120,7 @@ def _get_cache_size(self, key: str) -> int:
         # For now, we'll estimate by getting the data when we cache it
         return 0  # Will be properly set when caching
 
-    def _accommodate_value(self, value_size: int) -> None:
+    async def _accommodate_value(self, value_size: int) -> None:
         """Ensure there is enough space in the cache for a new value."""
         if self.max_size is None:
             return
@@ -132,9 +129,9 @@ def _accommodate_value(self, value_size: int) -> None:
         while self._current_size + value_size > self.max_size and self._cache_order:
             # Get the least recently used key (first in OrderedDict)
             lru_key = next(iter(self._cache_order))
-            self._evict_key(lru_key)
+            await self._evict_key(lru_key)
 
-    def _evict_key(self, key: str) -> None:
+    async def _evict_key(self, key: str) -> None:
         """Remove a key from cache and update size tracking."""
         try:
             # Get the size of the key being evicted
@@ -150,11 +147,15 @@ def _evict_key(self, key: str) -> None:
 
             # Update current size
             self._current_size = max(0, self._current_size - key_size)
+
+            # Actually delete from cache store
+            await self._cache.delete(key)
+
             logger.info("_evict_key: evicted key %s from cache, size %d", key, key_size)
         except Exception as e:
             logger.warning("_evict_key: failed to evict key %s: %s", key, e)
 
-    def _cache_value(self, key: str, value: Any) -> None:
+    async def _cache_value(self, key: str, value: Any) -> None:
         """Cache a value with size tracking."""
         value_size = buffer_size(value)
 
@@ -167,8 +168,14 @@ def _cache_value(self, key: str, value: Any) -> None:
             )
             return
 
+        # If key already exists, subtract old size first (Bug fix #3)
+        if key in self._key_sizes:
+            old_size = self._key_sizes[key]
+            self._current_size -= old_size
+            logger.info("_cache_value: updating existing key %s, old size %d", key, old_size)
+
         # Make room for the new value
-        self._accommodate_value(value_size)
+        await self._accommodate_value(value_size)
 
         # Update tracking
         self._cache_order[key] = None  # OrderedDict to track access order
@@ -221,7 +228,7 @@ async def _get_try_cache(
             self._remove_from_tracking(key)
         else:
             await self._cache.set(key, maybe_fresh_result)
-            self._cache_value(key, maybe_fresh_result)
+            await self._cache_value(key, maybe_fresh_result)
         return maybe_fresh_result
 
     async def _get_no_cache(
@@ -236,7 +243,7 @@
         else:
             logger.info("_get_no_cache: key %s found in store, setting in cache", key)
             await self._cache.set(key, maybe_fresh_result)
-            self._cache_value(key, maybe_fresh_result)
+            await self._cache_value(key, maybe_fresh_result)
         return maybe_fresh_result
 
     async def get(
@@ -285,7 +292,7 @@ async def set(self, key: str, value: Buffer) -> None:
         if self.cache_set_data:
             logger.info("set: setting key %s in cache", key)
             await self._cache.set(key, value)
-            self._cache_value(key, value)
+            await self._cache_value(key, value)
         else:
             logger.info("set: deleting key %s from cache", key)
             await self._cache.delete(key)
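
In summary, the hunks above make three behavioural changes: eviction now deletes the entry from
the cache store rather than only the bookkeeping, re-caching an existing key first subtracts its
old size so `current_size` stays accurate, and the eviction path is async end to end. The
standalone toy below mirrors that size-accounting pattern in isolation; it is a simplified
illustration, not the module's actual code.

```python
import asyncio
from collections import OrderedDict


class ToyLRUCache:
    """Minimal async LRU byte cache mirroring the size accounting in the diff above."""

    def __init__(self, max_size: int) -> None:
        self.max_size = max_size
        self._data: dict[str, bytes] = {}           # stands in for the cache store
        self._key_sizes: dict[str, int] = {}
        self._order: OrderedDict[str, None] = OrderedDict()
        self._current_size = 0

    async def _evict_key(self, key: str) -> None:
        size = self._key_sizes.pop(key, 0)
        self._order.pop(key, None)
        self._current_size = max(0, self._current_size - size)
        self._data.pop(key, None)                   # actually delete from the backing store

    async def _accommodate(self, incoming: int) -> None:
        while self._current_size + incoming > self.max_size and self._order:
            await self._evict_key(next(iter(self._order)))  # least recently used first

    async def set(self, key: str, value: bytes) -> None:
        if key in self._key_sizes:                  # re-set: subtract the old size first
            self._current_size -= self._key_sizes[key]
        await self._accommodate(len(value))
        self._data[key] = value
        self._key_sizes[key] = len(value)
        self._order[key] = None
        self._order.move_to_end(key)
        self._current_size += len(value)


async def demo() -> None:
    cache = ToyLRUCache(max_size=4)
    await cache.set("a", b"12")   # current_size = 2
    await cache.set("b", b"34")   # current_size = 4
    await cache.set("c", b"56")   # over budget: "a" (least recently used) is evicted
    assert "a" not in cache._data and cache._current_size == 4
    await cache.set("b", b"7")    # re-set: old size subtracted, no double counting
    assert cache._current_size == 3

asyncio.run(demo())
```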
