77
88Built-in Codecs:
99 - ``<blob>``: Serialize Python objects (internal) or external with dedup
10- - ``<hash>``: Hash-addressed storage with MD5 deduplication
11- - ``<object>``: Path-addressed storage for files/folders (Zarr, HDF5)
10+ - ``<hash>``: Hash-addressed storage with SHA256 deduplication
11+ - ``<object>``: Schema-addressed storage for files/folders (Zarr, HDF5)
1212 - ``<attach>``: File attachment (internal) or external with dedup
1313 - ``<filepath@store>``: Reference to existing file in store
1414 - ``<npy@>``: Store numpy arrays as portable .npy files (external only)
@@ -127,14 +127,16 @@ def decode(self, stored: bytes, *, key: dict | None = None) -> Any:
127127
128128class HashCodec (Codec ):
129129 """
130- Hash-addressed storage with MD5 deduplication.
130+ Hash-addressed storage with SHA256 deduplication.
131131
132- The ``<hash@>`` codec stores raw bytes using content-addressed storage.
133- Data is identified by its MD5 hash and stored in a hierarchical directory:
132+ The ``<hash@>`` codec stores raw bytes using hash-addressed storage.
133+ Data is identified by its SHA256 hash and stored in a hierarchical directory:
134134 ``_hash/{hash[:2]}/{hash[2:4]}/{hash}``
135135
136136 The database column stores JSON metadata: ``{hash, store, size}``.
137- Duplicate content is automatically deduplicated.
137+ Duplicate content is automatically deduplicated across all tables.
138+
139+ Deletion: Requires garbage collection via ``dj.gc.collect()``.
138140
139141 External only - requires @ modifier.
140142
@@ -154,6 +156,10 @@ class RawContent(dj.Manual):
154156 Note:
155157 This codec accepts only ``bytes``. For Python objects, use ``<blob@>``.
156158 Typically used indirectly via ``<blob@>`` or ``<attach@>`` rather than directly.
159+
160+ See Also
161+ --------
162+ datajoint.gc : Garbage collection for orphaned storage.
157163 """
158164
159165 name = "hash"
@@ -173,38 +179,39 @@ def encode(self, value: bytes, *, key: dict | None = None, store_name: str | Non
173179 value : bytes
174180 Raw bytes to store.
175181 key : dict, optional
176- Primary key values (unused).
182+ Context dict with ``_schema`` for path isolation.
177183 store_name : str, optional
178184 Store to use. If None, uses default store.
179185
180186 Returns
181187 -------
182188 dict
183- Metadata dict: ``{hash, store, size}``.
189+ Metadata dict: ``{hash, path, schema, store, size}``.
184190 """
185- from .content_registry import put_content
191+ from .hash_registry import put_hash
186192
187- return put_content (value , store_name = store_name )
193+ schema_name = (key or {}).get ("_schema" , "unknown" )
194+ return put_hash (value , schema_name = schema_name , store_name = store_name )
188195
189196 def decode (self , stored : dict , * , key : dict | None = None ) -> bytes :
190197 """
191- Retrieve content by hash .
198+ Retrieve content using stored metadata .
192199
193200 Parameters
194201 ----------
195202 stored : dict
196- Metadata dict with ``'hash'`` and optionally ``'store'``.
203+ Metadata dict with ``'path'``, ``'hash'``, and optionally ``'store'``.
197204 key : dict, optional
198- Primary key values (unused).
205+ Context dict (unused - path is in metadata).
199206
200207 Returns
201208 -------
202209 bytes
203210 Original bytes.
204211 """
205- from .content_registry import get_content
212+ from .hash_registry import get_hash
206213
207- return get_content (stored [ "hash" ], store_name = stored . get ( "store" ) )
214+ return get_hash (stored )
208215
209216 def validate (self , value : Any ) -> None :
210217 """Validate that value is bytes."""
@@ -366,7 +373,7 @@ def _get_backend(self, store_name: str | None = None):
366373 StorageBackend
367374 Storage backend instance.
368375 """
369- from .content_registry import get_store_backend
376+ from .hash_registry import get_store_backend
370377
371378 return get_store_backend (store_name )
372379
@@ -384,8 +391,8 @@ class ObjectCodec(SchemaCodec):
384391 schema-addressed paths: ``{schema}/{table}/{pk}/{field}/``. This creates
385392 a browsable organization in object storage that mirrors the database schema.
386393
387- Unlike hash-addressed storage (``<hash@>``), each row has its own path
388- and content is deleted when the row is deleted. Ideal for:
394+ Unlike hash-addressed storage (``<hash@>``), each row has its own unique path
395+ (no deduplication). Ideal for:
389396
390397 - Zarr arrays (hierarchical chunked data)
391398 - HDF5 files
@@ -419,17 +426,20 @@ def make(self, key):
419426
420427 {store_root}/{schema}/{table}/{pk}/{field}/
421428
429+ Deletion: Requires garbage collection via ``dj.gc.collect()``.
430+
422431 Comparison with hash-addressed::
423432
424433 | Aspect | <object@> | <hash@> |
425434 |----------------|---------------------|---------------------|
426435 | Addressing | Schema-addressed | Hash-addressed |
427436 | Deduplication | No | Yes |
428- | Deletion | With row | GC when unreferenced |
437+ | Deletion | GC required | GC required |
429438 | Use case | Zarr, HDF5 | Blobs, attachments |
430439
431440 See Also
432441 --------
442+ datajoint.gc : Garbage collection for orphaned storage.
433443 SchemaCodec : Base class for schema-addressed codecs.
434444 NpyCodec : Schema-addressed storage for numpy arrays.
435445 HashCodec : Hash-addressed storage with deduplication.
@@ -782,7 +792,7 @@ def encode(self, value: Any, *, key: dict | None = None, store_name: str | None
782792 """
783793 from datetime import datetime , timezone
784794
785- from .content_registry import get_store_backend
795+ from .hash_registry import get_store_backend
786796
787797 path = str (value )
788798
@@ -822,7 +832,7 @@ def decode(self, stored: dict, *, key: dict | None = None) -> Any:
822832 Handle for accessing the file.
823833 """
824834 from .objectref import ObjectRef
825- from .content_registry import get_store_backend
835+ from .hash_registry import get_store_backend
826836
827837 store_name = stored .get ("store" )
828838 backend = get_store_backend (store_name )
@@ -1103,8 +1113,11 @@ class Recording(dj.Manual):
11031113 - Path: ``{schema}/{table}/{pk}/{attribute}.npy``
11041114 - Database column: JSON with ``{path, store, dtype, shape}``
11051115
1116+ Deletion: Requires garbage collection via ``dj.gc.collect()``.
1117+
11061118 See Also
11071119 --------
1120+ datajoint.gc : Garbage collection for orphaned storage.
11081121 NpyRef : The lazy array reference returned on fetch.
11091122 SchemaCodec : Base class for schema-addressed codecs.
11101123 ObjectCodec : Schema-addressed storage for files/folders.
0 commit comments