Skip to content

Commit bf62620

Browse files
Merge pull request #1333 from datajoint/feature/unified-stores-config
Unified stores configuration with configurable prefixes and filepath_default
2 parents 3f1d072 + 63ecba9 commit bf62620

File tree

13 files changed

+1556
-321
lines changed

13 files changed

+1556
-321
lines changed

integration_test_summary.txt

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
## Integration Test Results - FINAL
2+
3+
**Test Summary:**
4+
- ✅ 520 tests PASSED
5+
- ⏭️ 7 tests SKIPPED
6+
- ❌ 0 tests FAILED
7+
8+
**All tests passing!**
9+
10+
### Initial Issues Found and Fixed
11+
12+
Initial run had 24 failures in object storage tests due to a test fixture bug:
13+
- `conftest.py`: object_storage_config wasn't creating the `test_project` subdirectory
14+
- `test_update1.py`: mock_stores_update wasn't creating `djtest` subdirectories
15+
16+
**Root cause:** Test fixtures were configuring storage locations but not creating
17+
the directories. StorageBackend validates that file protocol locations exist
18+
during initialization.
19+
20+
**Fix:** Added `Path(location).mkdir(parents=True, exist_ok=True)` to both fixtures so the configured storage locations exist before `StorageBackend` validates them.
21+
22+
### Test Coverage Verified
23+
24+
All unified stores configuration functionality tested:
25+
- ✅ Configuration system with stores.default and stores.filepath_default
26+
- ✅ Prefix validation and separation (hash_prefix, schema_prefix, filepath_prefix)
27+
- ✅ Filepath codec validation with dynamic prefix checking
28+
- ✅ Store backend initialization and validation
29+
- ✅ Object storage (file, stream, folder operations)
30+
- ✅ Hash-addressed storage (blob, attach)
31+
- ✅ Schema-addressed storage (object, npy)
32+
- ✅ All relational operators and queries
33+
- ✅ Schema management and dependencies

src/datajoint/builtin_codecs.py

Lines changed: 72 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -323,13 +323,16 @@ def _build_path(
323323
field: str,
324324
primary_key: dict,
325325
ext: str | None = None,
326+
store_name: str | None = None,
326327
) -> tuple[str, str]:
327328
"""
328329
Build schema-addressed storage path.
329330
330331
Constructs a path that mirrors the database schema structure:
331332
``{schema}/{table}/{pk_values}/{field}{ext}``
332333
334+
Supports partitioning if configured in the store.
335+
333336
Parameters
334337
----------
335338
schema : str
@@ -342,6 +345,8 @@ def _build_path(
342345
Primary key values.
343346
ext : str, optional
344347
File extension (e.g., ".npy", ".zarr").
348+
store_name : str, optional
349+
Store name for retrieving partition configuration.
345350
346351
Returns
347352
-------
@@ -350,13 +355,21 @@ def _build_path(
350355
is a unique identifier.
351356
"""
352357
from .storage import build_object_path
358+
from . import config
359+
360+
# Get store configuration for partition_pattern and token_length
361+
spec = config.get_store_spec(store_name)
362+
partition_pattern = spec.get("partition_pattern")
363+
token_length = spec.get("token_length", 8)
353364

354365
return build_object_path(
355366
schema=schema,
356367
table=table,
357368
field=field,
358369
primary_key=primary_key,
359370
ext=ext,
371+
partition_pattern=partition_pattern,
372+
token_length=token_length,
360373
)
361374

362375
def _get_backend(self, store_name: str | None = None):
@@ -518,7 +531,7 @@ def encode(
518531
raise TypeError(f"<object> expects bytes or path, got {type(value).__name__}")
519532

520533
# Build storage path using inherited helper
521-
path, token = self._build_path(schema, table, field, primary_key, ext=ext)
534+
path, token = self._build_path(schema, table, field, primary_key, ext=ext, store_name=store_name)
522535

523536
# Get storage backend using inherited helper
524537
backend = self._get_backend(store_name)
@@ -733,10 +746,16 @@ class FilepathCodec(Codec):
733746
734747
External only - requires @store.
735748
749+
This codec gives users maximum freedom in organizing their files while
750+
reusing DataJoint's store configuration. Files can be placed anywhere
751+
in the store EXCEPT the reserved ``_hash/`` and ``_schema/`` sections
752+
which are managed by DataJoint.
753+
736754
This is useful when:
737755
- Files are managed externally (e.g., by acquisition software)
738756
- Files are too large to copy
739757
- You want to reference shared datasets
758+
- You need custom directory structures
740759
741760
Example::
742761
@@ -749,6 +768,7 @@ class Recordings(dj.Manual):
749768
'''
750769
751770
# Reference an existing file (no copy)
771+
# Path is relative to store location
752772
table.insert1({'recording_id': 1, 'raw_data': 'subject01/session001/data.bin'})
753773
754774
# Fetch returns ObjectRef for lazy access
@@ -757,7 +777,10 @@ class Recordings(dj.Manual):
757777
ref.download() # Download to local path
758778
759779
Storage Format:
760-
JSON metadata: ``{path, store}``
780+
JSON metadata: ``{path, store, size, timestamp}``
781+
782+
Reserved Sections:
783+
Paths cannot start with ``_hash/`` or ``_schema/`` - these are managed by DataJoint.
761784
762785
Warning:
763786
The file must exist in the store at the specified path.
@@ -769,7 +792,9 @@ class Recordings(dj.Manual):
769792
def get_dtype(self, is_store: bool) -> str:
770793
"""Filepath is external only."""
771794
if not is_store:
772-
raise DataJointError("<filepath> requires @store")
795+
raise DataJointError(
796+
"<filepath> requires @ symbol. Use <filepath@> for default store " "or <filepath@store> to specify store."
797+
)
773798
return "json"
774799

775800
def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> dict:
@@ -779,7 +804,7 @@ def encode(self, value: Any, *, key: dict | None = None, store_name: str | None
779804
Parameters
780805
----------
781806
value : str
782-
Relative path within the store.
807+
Relative path within the store. Cannot use reserved sections (_hash/, _schema/).
783808
key : dict, optional
784809
Primary key values (unused).
785810
store_name : str, optional
@@ -789,14 +814,55 @@ def encode(self, value: Any, *, key: dict | None = None, store_name: str | None
789814
-------
790815
dict
791816
Metadata dict: ``{path, store}``.
817+
818+
Raises
819+
------
820+
ValueError
821+
If path uses reserved sections (_hash/ or _schema/).
822+
FileNotFoundError
823+
If file does not exist in the store.
792824
"""
793825
from datetime import datetime, timezone
794826

827+
from . import config
795828
from .hash_registry import get_store_backend
796829

797830
path = str(value)
798831

799-
# Optionally verify file exists
832+
# Get store spec to check prefix configuration
833+
# Use filepath_default if no store specified (filepath is not part of OAS)
834+
spec = config.get_store_spec(store_name, use_filepath_default=True)
835+
836+
# Validate path doesn't use reserved sections (hash and schema)
837+
path_normalized = path.lstrip("/")
838+
reserved_prefixes = []
839+
840+
hash_prefix = spec.get("hash_prefix")
841+
if hash_prefix:
842+
reserved_prefixes.append(("hash_prefix", hash_prefix))
843+
844+
schema_prefix = spec.get("schema_prefix")
845+
if schema_prefix:
846+
reserved_prefixes.append(("schema_prefix", schema_prefix))
847+
848+
# Check if path starts with any reserved prefix
849+
for prefix_name, prefix_value in reserved_prefixes:
850+
prefix_normalized = prefix_value.strip("/") + "/"
851+
if path_normalized.startswith(prefix_normalized):
852+
raise ValueError(
853+
f"<filepath@> cannot use reserved section '{prefix_value}' ({prefix_name}). "
854+
f"This section is managed by DataJoint. "
855+
f"Got path: {path}"
856+
)
857+
858+
# If filepath_prefix is configured, enforce it
859+
filepath_prefix = spec.get("filepath_prefix")
860+
if filepath_prefix:
861+
filepath_prefix_normalized = filepath_prefix.strip("/") + "/"
862+
if not path_normalized.startswith(filepath_prefix_normalized):
863+
raise ValueError(f"<filepath@> must use prefix '{filepath_prefix}' (filepath_prefix). " f"Got path: {path}")
864+
865+
# Verify file exists
800866
backend = get_store_backend(store_name)
801867
if not backend.exists(path):
802868
raise FileNotFoundError(f"File not found in store '{store_name or 'default'}': {path}")
@@ -1179,7 +1245,7 @@ def encode(
11791245
schema, table, field, primary_key = self._extract_context(key)
11801246

11811247
# Build schema-addressed storage path
1182-
path, _ = self._build_path(schema, table, field, primary_key, ext=".npy")
1248+
path, _ = self._build_path(schema, table, field, primary_key, ext=".npy", store_name=store_name)
11831249

11841250
# Serialize to .npy format
11851251
buffer = io.BytesIO()

src/datajoint/hash_registry.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -138,20 +138,15 @@ def get_store_backend(store_name: str | None = None) -> StorageBackend:
138138
Parameters
139139
----------
140140
store_name : str, optional
141-
Name of the store to use. If None, uses the default object storage
142-
configuration or the configured default_store.
141+
Name of the store to use. If None, uses stores.default.
143142
144143
Returns
145144
-------
146145
StorageBackend
147146
StorageBackend instance.
148147
"""
149-
# If store_name is None, check for configured default_store
150-
if store_name is None and config.object_storage.default_store:
151-
store_name = config.object_storage.default_store
152-
153-
# get_object_store_spec handles None by returning default object_storage config
154-
spec = config.get_object_store_spec(store_name)
148+
# get_store_spec handles None by using stores.default
149+
spec = config.get_store_spec(store_name)
155150
return StorageBackend(spec)
156151

157152

@@ -162,14 +157,14 @@ def get_store_subfolding(store_name: str | None = None) -> tuple[int, ...] | Non
162157
Parameters
163158
----------
164159
store_name : str, optional
165-
Name of the store. If None, uses default store.
160+
Name of the store. If None, uses stores.default.
166161
167162
Returns
168163
-------
169164
tuple[int, ...] | None
170165
Subfolding pattern (e.g., (2, 2)) or None for flat storage.
171166
"""
172-
spec = config.get_object_store_spec(store_name)
167+
spec = config.get_store_spec(store_name)
173168
subfolding = spec.get("subfolding")
174169
if subfolding is not None:
175170
return tuple(subfolding)

0 commit comments

Comments
 (0)