Commit 78e0d1d

Remove serializes flag; longblob is now raw bytes
Simplified design:

- Plain longblob columns store/return raw bytes (no serialization)
- <djblob> type handles serialization via encode/decode
- Legacy AttributeAdapter handles blob pack/unpack internally for backward compatibility

This eliminates the need for the serializes flag by making blob serialization the responsibility of the adapter/type, not the framework. Migration to <djblob> is now required for existing schemas that rely on implicit serialization.
1 parent 61db015 commit 78e0d1d
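For schemas that previously relied on implicit longblob serialization, the practical change is swapping the column type in the table definition. A minimal sketch follows; the schema name, `SessionData` table, and `metrics` attribute are hypothetical illustrations, not part of this commit:

```python
import datajoint as dj

schema = dj.schema("my_pipeline")  # hypothetical schema name


@schema
class SessionData(dj.Manual):
    definition = """
    session_id : int
    ---
    # before this commit: metrics : longblob  (objects were serialized implicitly)
    metrics : <djblob>    # after: serialization is explicit via the <djblob> type
    """
```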

File tree

6 files changed: +85 -38 lines changed


docs/src/design/tables/customtype.md

Lines changed: 29 additions & 9 deletions
@@ -498,22 +498,42 @@ class ProcessedData(dj.Manual):
     definition = """
     data_id : int
     ---
-    results : <djblob>        # Explicit serialization
-    raw_bytes : longblob      # Backward-compatible (auto-serialized)
+    results : <djblob>        # Serialized Python objects
+    raw_bytes : longblob      # Raw bytes (no serialization)
     """
 ```
 
 #### When to Use `<djblob>`
 
-- **New tables**: Prefer `<djblob>` for clarity and future-proofing
-- **Custom types**: Use `<djblob>` when your type chains to blob storage
-- **Migration**: Existing `longblob` columns can be migrated to `<djblob>`
+- **Serialized data**: When storing Python objects (dicts, arrays, etc.)
+- **New tables**: Prefer `<djblob>` for automatic serialization
+- **Migration**: Existing schemas with implicit serialization must migrate
 
-#### Backward Compatibility
+#### Raw Blob Behavior
 
-For backward compatibility, `longblob` columns without an explicit type
-still receive automatic serialization. The behavior is identical to `<djblob>`,
-but using `<djblob>` makes the serialization explicit in your code.
+Plain `longblob` (and other blob variants) columns now store and return
+**raw bytes** without automatic serialization:
+
+```python
+@schema
+class RawData(dj.Manual):
+    definition = """
+    id : int
+    ---
+    raw_bytes : longblob      # Stores/returns raw bytes
+    serialized : <djblob>     # Stores Python objects with serialization
+    """
+
+# Raw bytes - no serialization
+RawData.insert1({"id": 1, "raw_bytes": b"raw binary data", "serialized": {"key": "value"}})
+
+row = (RawData & "id=1").fetch1()
+row["raw_bytes"]   # Returns: b"raw binary data"
+row["serialized"]  # Returns: {"key": "value"}
+```
+
+**Important**: Existing schemas that relied on implicit blob serialization
+must be migrated to `<djblob>` to preserve their behavior.
 
 ## Schema Migration

src/datajoint/attribute_adapter.py

Lines changed: 31 additions & 3 deletions
@@ -15,6 +15,9 @@
 from .attribute_type import AttributeType, get_type, is_type_registered
 from .errors import DataJointError
 
+# Pattern to detect blob types for internal pack/unpack
+_BLOB_PATTERN = re.compile(r"^(tiny|small|medium|long|)blob", re.I)
+
 
 class AttributeAdapter(AttributeType):
     """
@@ -87,12 +90,37 @@ def dtype(self) -> str:
             )
         return attr_type
 
+    def _is_blob_dtype(self) -> bool:
+        """Check if dtype is a blob type requiring pack/unpack."""
+        return bool(_BLOB_PATTERN.match(self.dtype))
+
     def encode(self, value: Any, *, key: dict | None = None) -> Any:
-        """Delegate to legacy put() method."""
-        return self.put(value)
+        """
+        Delegate to legacy put() method, with blob packing if needed.
+
+        Legacy adapters expect blob.pack to be called after put() when
+        the dtype is a blob type. This wrapper handles that automatically.
+        """
+        result = self.put(value)
+        # Legacy adapters expect blob.pack after put() for blob dtypes
+        if self._is_blob_dtype():
+            from . import blob
+
+            result = blob.pack(result)
+        return result
 
     def decode(self, stored: Any, *, key: dict | None = None) -> Any:
-        """Delegate to legacy get() method."""
+        """
+        Delegate to legacy get() method, with blob unpacking if needed.
+
+        Legacy adapters expect blob.unpack to be called before get() when
+        the dtype is a blob type. This wrapper handles that automatically.
+        """
+        # Legacy adapters expect blob.unpack before get() for blob dtypes
+        if self._is_blob_dtype():
+            from . import blob
+
+            stored = blob.unpack(stored)
         return self.get(stored)
 
     def put(self, obj: Any) -> Any:
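To make the backward-compatibility path above concrete, here is a rough sketch of how a legacy-style adapter behaves under the new wrapper: its put()/get() still exchange plain Python objects, and encode()/decode() add blob.pack/blob.unpack around them because the declared dtype is a blob type. `PointAdapter` is hypothetical, and this assumes the legacy `attribute_type` class attribute still declares the stored dtype, as in classic `dj.AttributeAdapter` subclasses:

```python
import datajoint as dj


class PointAdapter(dj.AttributeAdapter):
    """Legacy-style adapter: put()/get() exchange plain Python objects."""

    attribute_type = "longblob"  # blob dtype, so encode()/decode() add pack/unpack

    def put(self, obj):
        # obj is an (x, y) pair; store it as a dict
        return {"x": obj[0], "y": obj[1]}

    def get(self, value):
        # value is the dict produced by put(); rebuild the pair
        return (value["x"], value["y"])


adapter = PointAdapter()
packed = adapter.encode((3, 4))    # put() -> dict, then blob.pack() -> bytes
restored = adapter.decode(packed)  # blob.unpack() -> dict, then get() -> (3, 4)
```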

src/datajoint/attribute_type.py

Lines changed: 5 additions & 10 deletions
@@ -153,10 +153,6 @@ def decode(self, stored: Any, *, key: dict | None = None) -> Any:
         """
         ...
 
-    # Class attribute: If True, encode() produces final binary data (no blob.pack needed)
-    # Override in subclasses that handle their own serialization
-    serializes: bool = False
-
     def validate(self, value: Any) -> None:
         """
         Validate a value before encoding.
@@ -440,19 +436,19 @@ class ProcessedData(dj.Manual):
             definition = '''
             data_id : int
             ---
-            results : <djblob>      # Explicit DataJoint serialization
+            results : <djblob>      # Serialized Python objects
             raw_bytes : longblob    # Raw bytes (no serialization)
             '''
 
     Note:
-        For backward compatibility, ``longblob`` columns without an explicit type
-        still use automatic serialization. Use ``<djblob>`` to be explicit about
-        serialization behavior.
+        Plain ``longblob`` columns store and return raw bytes without serialization.
+        Use ``<djblob>`` when you need automatic serialization of Python objects.
+        Existing schemas using implicit blob serialization should migrate to ``<djblob>``
+        using ``dj.migrate.migrate_blob_columns()``.
     """
 
     type_name = "djblob"
     dtype = "longblob"
-    serializes = True  # This type handles its own serialization
 
     def encode(self, value: Any, *, key: dict | None = None) -> bytes:
         """
@@ -508,7 +504,6 @@ class LargeData(dj.Manual):
     # It's used internally when blob@store syntax is detected
     type_name = "djblob_external"
    dtype = "blob@store"  # Placeholder - actual store is determined at declaration time
-    serializes = True  # This type handles its own serialization
 
     def encode(self, value: Any, *, key: dict | None = None) -> bytes:
         """Serialize a Python object to DataJoint's blob format."""

src/datajoint/fetch.py

Lines changed: 4 additions & 3 deletions
@@ -92,10 +92,11 @@ def adapt(x):
         return adapt(uuid.UUID(bytes=data))
     elif attr.is_blob:
         blob_data = extern.get(uuid.UUID(bytes=data)) if attr.is_external else data
-        # Skip unpack if adapter handles its own deserialization
-        if attr.adapter and getattr(attr.adapter, "serializes", False):
+        # Adapters (like <djblob>) handle deserialization in decode()
+        # Without adapter, blob columns return raw bytes (no deserialization)
+        if attr.adapter:
             return attr.adapter.decode(blob_data, key=None)
-        return adapt(blob.unpack(blob_data, squeeze=squeeze))
+        return blob_data  # raw bytes
     else:
         return adapt(data)

src/datajoint/table.py

Lines changed: 2 additions & 3 deletions
@@ -742,9 +742,8 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False):
                 raise DataJointError("badly formed UUID value {v} for attribute `{n}`".format(v=value, n=name))
             value = value.bytes
         elif attr.is_blob:
-            # Skip blob.pack if adapter already handles serialization
-            if not (attr.adapter and getattr(attr.adapter, "serializes", False)):
-                value = blob.pack(value)
+            # Adapters (like <djblob>) handle serialization in encode()
+            # Without adapter, blob columns store raw bytes (no serialization)
             if attr.is_external:
                 value = self.external[attr.store].put(value).bytes
         elif attr.is_attachment:
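A consequence of the insert path above is that a value bound to a plain `longblob` column is sent to the database as-is, so any packing is now the caller's job. A rough sketch, reusing the `RawData` table from the docs example and DataJoint's `blob.pack`/`blob.unpack` helpers referenced in the diffs:

```python
from datajoint import blob

# Plain longblob: supply bytes directly...
RawData.insert1({"id": 2, "raw_bytes": b"\x00\x01\x02", "serialized": {"k": 1}})

# ...or pack a Python object explicitly to keep the old on-disk format.
packed = blob.pack({"trace": [1, 2, 3]})
RawData.insert1({"id": 3, "raw_bytes": packed, "serialized": {"k": 2}})

# Fetch returns exactly what was stored; unpack manually when needed.
stored = (RawData & "id=3").fetch1("raw_bytes")
obj = blob.unpack(stored)  # recovers the {"trace": [...]} object
```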

tests/test_attribute_type.py

Lines changed: 14 additions & 10 deletions
@@ -359,7 +359,6 @@ def test_djblob_properties(self):
         blob_type = get_type("djblob")
         assert blob_type.type_name == "djblob"
         assert blob_type.dtype == "longblob"
-        assert blob_type.serializes is True
 
     def test_djblob_encode_decode_roundtrip(self):
         """Test that encode/decode is a proper roundtrip."""
@@ -400,16 +399,21 @@ def test_djblob_in_list_types(self):
         types = list_types()
         assert "djblob" in types
 
-    def test_serializes_flag_prevents_double_pack(self):
-        """Test that serializes=True prevents blob.pack being called twice.
+    def test_djblob_handles_serialization(self):
+        """Test that DJBlobType handles serialization internally.
 
-        This is a unit test for the flag itself. Integration test with tables
-        is in test_blob.py or test_adapted_attributes.py.
+        With the new design:
+        - Plain longblob columns store/return raw bytes (no serialization)
+        - <djblob> handles pack/unpack in encode/decode
+        - Legacy AttributeAdapter handles pack/unpack internally for backward compat
         """
         blob_type = get_type("djblob")
-        assert blob_type.serializes is True
 
-        # Legacy adapters should not have serializes=True
-        # (they rely on blob.pack being called after encode)
-        # AttributeType base class defaults to False
-        assert AttributeType.serializes is False
+        # DJBlobType.encode() should produce packed bytes
+        data = {"key": "value"}
+        encoded = blob_type.encode(data)
+        assert isinstance(encoded, bytes)
+
+        # DJBlobType.decode() should unpack back to original
+        decoded = blob_type.decode(encoded)
+        assert decoded == data