Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 3 additions & 21 deletions paimon-python/pypaimon/read/reader/data_file_batch_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from pypaimon.read.reader.format_blob_reader import FormatBlobReader
from pypaimon.read.reader.iface.record_batch_reader import RecordBatchReader
from pypaimon.schema.data_types import DataField, PyarrowFieldParser
from pypaimon.table.row.blob import Blob, BlobDescriptor
from pypaimon.table.row.blob import Blob
from pypaimon.table.special_fields import SpecialFields


Expand Down Expand Up @@ -178,28 +178,10 @@ def _blob_cell_to_data(self, value):
value = self._normalize_blob_cell(value)
if value is None:
return None

if not isinstance(value, bytes):
return value

descriptor = self._deserialize_descriptor_or_none(value)
if descriptor is None:
return value

try:
uri_reader = self.file_io.uri_reader_factory.create(descriptor.uri)
blob = Blob.from_descriptor(uri_reader, descriptor)
return blob.to_data()
except Exception as e:
raise RuntimeError(
"Failed to read blob bytes from descriptor URI while converting blob value."
) from e

@staticmethod
def _deserialize_descriptor_or_none(raw: bytes):
    """Decode ``raw`` into a BlobDescriptor when it is recognised as one.

    Returns the result of ``BlobDescriptor.deserialize(raw)`` if
    ``BlobDescriptor.is_blob_descriptor(raw)`` accepts the bytes, and
    ``None`` otherwise.
    """
    if BlobDescriptor.is_blob_descriptor(raw):
        return BlobDescriptor.deserialize(raw)
    return None
blob = Blob.from_bytes(value, self.file_io)
return blob.to_data() if blob is not None else None

def _assign_row_tracking(self, record_batch: RecordBatch) -> RecordBatch:
"""Assign row tracking meta fields (_ROW_ID and _SEQUENCE_NUMBER)."""
Expand Down
15 changes: 15 additions & 0 deletions paimon-python/pypaimon/table/row/blob.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,21 @@ def from_file(file_io, file_path: str, offset: int, length: int) -> 'Blob':
def from_descriptor(uri_reader: UriReader, descriptor: BlobDescriptor) -> 'Blob':
return BlobRef(uri_reader, descriptor)

@staticmethod
def from_bytes(data: Optional[bytes], file_io=None, allow_blob_data: bool = True) -> Optional['Blob']:
    """Build a Blob from a raw byte payload.

    Args:
        data: ``bytes``/``bytearray`` payload, or ``None``.
        file_io: Object exposing ``uri_reader_factory``; only needed when
            the payload is handled as a serialized BlobDescriptor.
        allow_blob_data: When true, bytes that are not recognised as a
            descriptor are wrapped as inline ``BlobData``. When false,
            the payload is always handled as a descriptor.

    Returns:
        ``None`` if ``data`` is ``None``; a ``BlobRef`` for descriptor
        payloads; otherwise a ``BlobData`` holding the bytes.

    Raises:
        TypeError: If ``data`` is neither ``None`` nor bytes-like
            (``bytes`` / ``bytearray``).
        ValueError: If the payload must be resolved as a descriptor but
            ``file_io`` is ``None``.
    """
    if data is None:
        return None
    if not isinstance(data, (bytes, bytearray)):
        raise TypeError(f"Blob.from_bytes expects bytes, got {type(data)}")

    payload = bytes(data)
    is_descriptor = BlobDescriptor.is_blob_descriptor(payload)
    if allow_blob_data and not is_descriptor:
        # Plain inline content: no descriptor resolution required.
        return BlobData(payload)

    if file_io is None:
        raise ValueError("file_io is required to resolve BlobDescriptor bytes")
    parsed = BlobDescriptor.deserialize(payload)
    return BlobRef(file_io.uri_reader_factory.create(parsed.uri), parsed)


class BlobData(Blob):

Expand Down
37 changes: 37 additions & 0 deletions paimon-python/pypaimon/tests/blob_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,43 @@ def test_from_http(self):
self.assertEqual(descriptor.offset, 0)
self.assertEqual(descriptor.length, -1)

def test_from_bytes_with_raw_data(self):
    """Non-descriptor bytes come back as a BlobData holding the same bytes."""
    payload = b"hello blob"
    result = Blob.from_bytes(payload)
    self.assertIsInstance(result, BlobData)
    self.assertEqual(result.to_data(), payload)

def test_from_bytes_with_none(self):
    """A None payload yields None instead of a Blob."""
    result = Blob.from_bytes(None)
    self.assertIsNone(result)

def test_from_bytes_with_descriptor(self):
    """Serialized descriptor bytes resolve to a BlobRef that reads the file.

    The backing temp file is removed in a ``finally`` block so a failing
    assertion (or any earlier error) no longer leaves it behind on disk.
    """
    import os
    import tempfile

    from pypaimon.common.file_io import FileIO

    data = b"actual blob content"
    # delete=False keeps the file on disk after the handle is closed so
    # it can be reopened by path; removal is handled explicitly below.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(data)
    path = tmp.name

    try:
        descriptor = BlobDescriptor(path, 0, len(data))
        serialized = descriptor.serialize()

        file_io = FileIO.get(f"file://{os.path.dirname(path)}", {})
        blob = Blob.from_bytes(serialized, file_io)
        self.assertIsInstance(blob, BlobRef)
        self.assertEqual(blob.to_data(), data)
    finally:
        os.unlink(path)

def test_from_bytes_descriptor_without_file_io_raises(self):
    """Descriptor bytes passed without a file_io raise ValueError."""
    serialized = BlobDescriptor("/tmp/fake", 0, 10).serialize()
    self.assertRaises(ValueError, Blob.from_bytes, serialized)

def test_from_bytes_invalid_type_raises(self):
    """A non-bytes, non-None argument (here an int) raises TypeError."""
    self.assertRaises(TypeError, Blob.from_bytes, 12345)

def test_blob_data_interface_compliance(self):
"""Test that BlobData properly implements Blob interface."""
test_data = b"interface test data"
Expand Down
Loading