
Commit 79f60b2

feat: Add configurable prefixes for storage sections
Allow users to configure custom prefixes for the hash-addressed, schema-addressed, and filepath storage sections of each store. This makes it possible to map DataJoint onto existing storage layouts without restructuring them.

Configuration:
- hash_prefix (default: "_hash") - hash-addressed storage section
- schema_prefix (default: "_schema") - schema-addressed storage section
- filepath_prefix (default: None) - optional restriction on filepath storage

Features:
- Validates that prefixes don't overlap (mutual exclusion)
- FilepathCodec enforces the dynamically configured reserved prefixes
- Optional filepath_prefix restricts filepath paths to a subtree
- Backwards-compatible defaults

Example:
{
  "stores": {
    "legacy": {
      "protocol": "file",
      "location": "/data/existing",
      "hash_prefix": "content_addressed",
      "schema_prefix": "structured_data",
      "filepath_prefix": "raw_files"
    }
  }
}

Changes:
- settings.py: add prefix fields and validation logic
- builtin_codecs.py: dynamic prefix checking in FilepathCodec
- test_settings.py: 7 new tests for prefix validation
- test_codecs.py: 2 new tests for custom prefixes
1 parent fe29274 commit 79f60b2
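As a usage sketch only (assuming the dict-style `dj.config` interface and the `get_store_spec` accessor touched by this commit; the store name and paths are hypothetical), the prefixes from the commit message's example could be set from Python roughly like this:

```python
import datajoint as dj

# Hypothetical store mapping DataJoint onto a pre-existing directory layout.
dj.config["stores"] = {
    "legacy": {
        "protocol": "file",
        "location": "/data/existing",
        "hash_prefix": "content_addressed",   # hash-addressed section
        "schema_prefix": "structured_data",   # schema-addressed section
        "filepath_prefix": "raw_files",       # restrict <filepath@> paths to this subtree
    }
}

# get_store_spec() fills in defaults and rejects overlapping prefixes
# (see the settings.py changes below).
spec = dj.config.get_store_spec("legacy")
print(spec["hash_prefix"], spec["schema_prefix"], spec["filepath_prefix"])
```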

4 files changed: +441 / -73 lines changed

src/datajoint/builtin_codecs.py

Lines changed: 35 additions & 8 deletions
@@ -809,18 +809,45 @@ def encode(self, value: Any, *, key: dict | None = None, store_name: str | None
         """
         from datetime import datetime, timezone

+        from . import config
         from .hash_registry import get_store_backend

         path = str(value)

-        # Validate path doesn't use reserved sections
-        path_normalized = path.lstrip('/')
-        if path_normalized.startswith('_hash/') or path_normalized.startswith('_schema/'):
-            raise ValueError(
-                f"<filepath@> cannot use reserved sections '_hash/' or '_schema/'. "
-                f"These sections are managed by DataJoint. "
-                f"Got path: {path}"
-            )
+        # Get store spec to check prefix configuration
+        spec = config.get_store_spec(store_name)
+
+        # Validate path doesn't use reserved sections (hash and schema)
+        path_normalized = path.lstrip("/")
+        reserved_prefixes = []
+
+        hash_prefix = spec.get("hash_prefix")
+        if hash_prefix:
+            reserved_prefixes.append(("hash_prefix", hash_prefix))
+
+        schema_prefix = spec.get("schema_prefix")
+        if schema_prefix:
+            reserved_prefixes.append(("schema_prefix", schema_prefix))
+
+        # Check if path starts with any reserved prefix
+        for prefix_name, prefix_value in reserved_prefixes:
+            prefix_normalized = prefix_value.strip("/") + "/"
+            if path_normalized.startswith(prefix_normalized):
+                raise ValueError(
+                    f"<filepath@> cannot use reserved section '{prefix_value}' ({prefix_name}). "
+                    f"This section is managed by DataJoint. "
+                    f"Got path: {path}"
+                )
+
+        # If filepath_prefix is configured, enforce it
+        filepath_prefix = spec.get("filepath_prefix")
+        if filepath_prefix:
+            filepath_prefix_normalized = filepath_prefix.strip("/") + "/"
+            if not path_normalized.startswith(filepath_prefix_normalized):
+                raise ValueError(
+                    f"<filepath@> must use prefix '{filepath_prefix}' (filepath_prefix). "
+                    f"Got path: {path}"
+                )

         # Verify file exists
         backend = get_store_backend(store_name)
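To make the new behavior concrete, here is a standalone distillation of the checks added above, with a hard-coded spec standing in for `config.get_store_spec(store_name)`. It is a sketch of the rule, not the codec itself, and the example paths are hypothetical:

```python
# Sketch of the path rules enforced by FilepathCodec.encode under a custom spec.
spec = {
    "hash_prefix": "_hash",
    "schema_prefix": "_schema",
    "filepath_prefix": "raw_files",  # None would leave filepath paths unrestricted
}


def validate_filepath(path: str) -> None:
    path_normalized = path.lstrip("/")
    # Reject paths that reach into the DataJoint-managed sections.
    for name in ("hash_prefix", "schema_prefix"):
        prefix = spec.get(name)
        if prefix and path_normalized.startswith(prefix.strip("/") + "/"):
            raise ValueError(f"path {path!r} uses reserved section {prefix!r} ({name})")
    # If a filepath prefix is configured, require paths to live under it.
    filepath_prefix = spec.get("filepath_prefix")
    if filepath_prefix and not path_normalized.startswith(filepath_prefix.strip("/") + "/"):
        raise ValueError(f"path {path!r} must start with {filepath_prefix!r}")


validate_filepath("raw_files/session1/scan.tif")  # accepted
# validate_filepath("_hash/ab/cd/0123")           # raises: reserved hash section
# validate_filepath("other/scan.tif")             # raises: outside filepath_prefix
```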

src/datajoint/settings.py

Lines changed: 84 additions & 1 deletion
@@ -373,6 +373,11 @@ def get_store_spec(self, store: str | None = None) -> dict[str, Any]:
         spec.setdefault("partition_pattern", None)  # No partitioning by default
         spec.setdefault("token_length", 8)  # Default token length

+        # Set defaults for storage section prefixes
+        spec.setdefault("hash_prefix", "_hash")  # Hash-addressed storage section
+        spec.setdefault("schema_prefix", "_schema")  # Schema-addressed storage section
+        spec.setdefault("filepath_prefix", None)  # Filepath storage (unrestricted by default)
+
         # Validate protocol
         protocol = spec.get("protocol", "").lower()
         supported_protocols = ("file", "s3", "gcs", "azure")
@@ -394,7 +399,17 @@ def get_store_spec(self, store: str | None = None) -> dict[str, Any]:
             "azure": ("protocol", "container", "location"),
         }
         allowed_keys: dict[str, tuple[str, ...]] = {
-            "file": ("protocol", "location", "subfolding", "partition_pattern", "token_length", "stage"),
+            "file": (
+                "protocol",
+                "location",
+                "subfolding",
+                "partition_pattern",
+                "token_length",
+                "hash_prefix",
+                "schema_prefix",
+                "filepath_prefix",
+                "stage",
+            ),
             "s3": (
                 "protocol",
                 "endpoint",
@@ -406,6 +421,9 @@ def get_store_spec(self, store: str | None = None) -> dict[str, Any]:
                 "subfolding",
                 "partition_pattern",
                 "token_length",
+                "hash_prefix",
+                "schema_prefix",
+                "filepath_prefix",
                 "stage",
                 "proxy_server",
             ),
@@ -418,6 +436,9 @@ def get_store_spec(self, store: str | None = None) -> dict[str, Any]:
                 "subfolding",
                 "partition_pattern",
                 "token_length",
+                "hash_prefix",
+                "schema_prefix",
+                "filepath_prefix",
                 "stage",
             ),
             "azure": (
@@ -430,6 +451,9 @@ def get_store_spec(self, store: str | None = None) -> dict[str, Any]:
                 "subfolding",
                 "partition_pattern",
                 "token_length",
+                "hash_prefix",
+                "schema_prefix",
+                "filepath_prefix",
                 "stage",
             ),
         }
@@ -444,8 +468,67 @@ def get_store_spec(self, store: str | None = None) -> dict[str, Any]:
         if invalid:
             raise DataJointError(f'Invalid key(s) in config.stores["{store}"]: {", ".join(invalid)}')

+        # Validate prefix separation to prevent overlap
+        self._validate_prefix_separation(
+            store_name=store,
+            hash_prefix=spec.get("hash_prefix"),
+            schema_prefix=spec.get("schema_prefix"),
+            filepath_prefix=spec.get("filepath_prefix"),
+        )
+
         return spec

+    def _validate_prefix_separation(
+        self,
+        store_name: str,
+        hash_prefix: str | None,
+        schema_prefix: str | None,
+        filepath_prefix: str | None,
+    ) -> None:
+        """
+        Validate that storage section prefixes don't overlap.
+
+        Parameters
+        ----------
+        store_name : str
+            Name of the store being validated (for error messages).
+        hash_prefix : str or None
+            Prefix for hash-addressed storage.
+        schema_prefix : str or None
+            Prefix for schema-addressed storage.
+        filepath_prefix : str or None
+            Prefix for filepath storage (None means unrestricted).
+
+        Raises
+        ------
+        DataJointError
+            If any prefixes overlap (one is a parent/child of another).
+        """
+        # Collect non-null prefixes with their names
+        prefixes = []
+        if hash_prefix:
+            prefixes.append(("hash_prefix", hash_prefix))
+        if schema_prefix:
+            prefixes.append(("schema_prefix", schema_prefix))
+        if filepath_prefix:
+            prefixes.append(("filepath_prefix", filepath_prefix))
+
+        # Normalize prefixes: remove leading/trailing slashes, ensure trailing slash for comparison
+        def normalize(p: str) -> str:
+            return p.strip("/") + "/"
+
+        normalized = [(name, normalize(prefix)) for name, prefix in prefixes]
+
+        # Check each pair for overlap
+        for i, (name1, p1) in enumerate(normalized):
+            for j, (name2, p2) in enumerate(normalized[i + 1 :], start=i + 1):
+                # Check if one prefix is a parent of another
+                if p1.startswith(p2) or p2.startswith(p1):
+                    raise DataJointError(
+                        f'config.stores["{store_name}"]: {name1}="{prefixes[i][1]}" and '
+                        f'{name2}="{prefixes[j][1]}" overlap. '
+                        f"Storage section prefixes must be mutually exclusive."
+                    )

     def load(self, filename: str | Path) -> None:
         """
