Skip to content

Commit 12ea814

Browse files
feat: Add mmap_mode parameter to NpyRef.load()
Support memory-mapped loading for large arrays:
- Local filesystem stores: mmap directly, no download
- Remote stores: download to cache, then mmap

Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent 6b951d4 commit 12ea814

File tree

2 files changed

+126
-9
lines changed

2 files changed

+126
-9
lines changed

src/datajoint/builtin_codecs.py

Lines changed: 58 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -940,28 +940,77 @@ def is_loaded(self) -> bool:
940940
"""True if array data has been downloaded and cached."""
941941
return self._cached is not None
942942

943-
def load(self, mmap_mode=None):
    """
    Download and return the array.

    Parameters
    ----------
    mmap_mode : str, optional
        Memory-map mode for lazy, random-access loading of large arrays:

        - ``'r'``: Read-only
        - ``'r+'``: Read-write
        - ``'c'``: Copy-on-write (changes not saved to disk)

        If None (default), loads entire array into memory.

    Returns
    -------
    numpy.ndarray or numpy.memmap
        The array data. Returns ``numpy.memmap`` if mmap_mode is specified.

    Raises
    ------
    ValueError
        If ``mmap_mode`` is not one of None, ``'r'``, ``'r+'`` or ``'c'``.

    Notes
    -----
    When ``mmap_mode`` is None, the array is cached after first load.
    Memory-mapped results are not stored in that in-memory cache.

    For local filesystem stores, memory mapping accesses the file directly
    with no download. For remote stores (S3, etc.), the file is downloaded
    to a local cache (``{tempdir}/datajoint_mmap/``) before memory mapping.
    An existing cache file is reused as-is; it is not compared against the
    remote object. With ``'r+'`` on a remote store, writes go to the local
    cache file only; this method does not upload them.

    Examples
    --------
    Standard loading::

        arr = ref.load()  # Loads entire array into memory

    Memory-mapped for random access to large arrays::

        arr = ref.load(mmap_mode='r')
        chunk = arr[1000:2000]  # Only reads the needed portion from disk
    """
    import hashlib
    import io
    import os
    import tempfile
    from pathlib import Path

    import numpy as np

    if mmap_mode is None:
        # Standard loading with in-memory caching.
        if self._cached is None:
            buffer = self._backend.get_buffer(self.path)
            self._cached = np.load(io.BytesIO(buffer), allow_pickle=False)
        return self._cached

    # numpy also accepts 'w+', which opens the target in create mode and
    # truncates it. Reject it (and any typo) before touching the file.
    if mmap_mode not in ("r", "r+", "c"):
        raise ValueError(
            f"Unsupported mmap_mode {mmap_mode!r}; expected None, 'r', 'r+' or 'c'."
        )

    if self._backend.protocol == "file":
        # Local filesystem - mmap directly, no download needed.
        local_path = self._backend._full_path(self.path)
        return np.load(local_path, mmap_mode=mmap_mode, allow_pickle=False)

    # Remote storage - download to the local cache first.
    path_hash = hashlib.md5(self.path.encode()).hexdigest()[:12]
    cache_dir = Path(tempfile.gettempdir()) / "datajoint_mmap"
    cache_dir.mkdir(parents=True, exist_ok=True)
    cache_path = cache_dir / f"{path_hash}.npy"

    if not cache_path.exists():
        buffer = self._backend.get_buffer(self.path)
        # Write to a sibling temp file and rename into place so that an
        # interrupted or concurrent download never leaves a partial file
        # under the final cache name.
        fd, tmp_name = tempfile.mkstemp(dir=str(cache_dir), suffix=".tmp")
        try:
            with os.fdopen(fd, "wb") as tmp_file:
                tmp_file.write(buffer)
            os.replace(tmp_name, str(cache_path))
        finally:
            # Only still present if the write or rename failed.
            if os.path.exists(tmp_name):
                os.unlink(tmp_name)

    return np.load(str(cache_path), mmap_mode=mmap_mode, allow_pickle=False)
9651014

9661015
def __array__(self, dtype=None):
9671016
"""

tests/integration/test_npy_codec.py

Lines changed: 68 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -139,6 +139,74 @@ class MockBackend:
139139
with pytest.raises(TypeError, match="0-dimensional"):
140140
len(ref)
141141

142+
def test_npy_ref_mmap_local_filesystem(self, tmp_path):
    """Memory-mapped loading should read a local ``.npy`` file in place."""
    # Write a real .npy file for the backend to point at.
    expected = np.arange(100, dtype=np.float64)
    np.save(tmp_path / "test.npy", expected)

    class MockFileBackend:
        """Stand-in backend whose protocol is ``"file"``, rooted at tmp_path."""

        protocol = "file"

        def _full_path(self, path):
            return str(tmp_path / path)

        def get_buffer(self, path):
            return (tmp_path / path).read_bytes()

    ref = NpyRef(
        {
            "path": "test.npy",
            "store": None,
            "dtype": "float64",
            "shape": [100],
        },
        MockFileBackend(),
    )

    # Memory-mapped load must hand back a memmap with the saved contents.
    mapped = ref.load(mmap_mode="r")
    assert isinstance(mapped, np.memmap)
    np.testing.assert_array_equal(mapped, expected)

    # A plain load afterwards must still give an ordinary in-memory array.
    in_memory = ref.load()
    assert isinstance(in_memory, np.ndarray)
    assert not isinstance(in_memory, np.memmap)
    np.testing.assert_array_equal(in_memory, expected)
180+
181+
def test_npy_ref_mmap_remote_storage(self, tmp_path):
    """NpyRef mmap_mode should download to cache for remote storage."""
    from unittest import mock

    # Create test data. np.save returns None, so the serialized bytes are
    # read back from the file it wrote.
    test_array = np.array([1, 2, 3, 4, 5], dtype=np.int32)
    np.save(tmp_path / "temp.npy", test_array)
    npy_bytes = (tmp_path / "temp.npy").read_bytes()

    metadata = {
        "path": "remote/path/data.npy",
        "store": "s3-store",
        "dtype": "int32",
        "shape": [5],
    }

    downloads = []

    # Mock backend that simulates remote storage and records each download.
    class MockS3Backend:
        protocol = "s3"

        def get_buffer(self, path):
            downloads.append(path)
            return npy_bytes

    ref = NpyRef(metadata, MockS3Backend())

    # Point the mmap cache at tmp_path so the test neither reuses a cache
    # file left in the system temp dir by an earlier run nor leaves one there.
    with mock.patch("tempfile.gettempdir", return_value=str(tmp_path)):
        mmap_arr = ref.load(mmap_mode="r")

    # With an empty cache the array must have been fetched exactly once.
    assert len(downloads) == 1
    assert isinstance(mmap_arr, np.memmap)
    np.testing.assert_array_equal(mmap_arr, test_array)
209+
142210

143211
class TestNpyCodecUnit:
144212
"""Unit tests for NpyCodec without database."""

0 commit comments

Comments
 (0)