Skip to content

Commit 891d0f2

Browse files
committed
Update metadata serialization to use base64 encoding and increment version to 0.21.2
1 parent 3d02d40 commit 891d0f2

File tree

3 files changed

+41
-10
lines changed

3 files changed

+41
-10
lines changed

pydala/metadata.py

Lines changed: 39 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import posixpath
66
import re
77
import tempfile
8+
import base64
89
from collections import defaultdict
910
from typing import Any
1011

@@ -26,12 +27,22 @@
2627

2728

2829
def serialize_metadata(metadata: dict[str, pq.FileMetaData]) -> dict[str, Any]:
29-
"""Safely serialize metadata to JSON-compatible format."""
30-
result = {}
30+
"""Safely serialize metadata to a JSON-compatible format.
31+
32+
We write each ``FileMetaData`` to an in-memory buffer using
33+
``write_metadata_file`` and base64-encode the resulting bytes for JSON.
34+
"""
35+
result: dict[str, Any] = {}
3136
for path, meta in metadata.items():
32-
# Convert FileMetaData to serializable format
37+
# Write FileMetaData to an in-memory buffer
38+
sink = pa.BufferOutputStream()
39+
meta.write_metadata_file(sink)
40+
buf = sink.getvalue() # pyarrow.Buffer
41+
raw: bytes = buf.to_pybytes()
42+
# Base64 encode to make it JSON serializable
43+
b64 = base64.b64encode(raw).decode("ascii")
3344
result[path] = {
34-
"serialized_metadata": meta.serialize(),
45+
"serialized_metadata_b64": b64,
3546
"num_rows": meta.num_rows,
3647
"num_columns": len(meta.schema),
3748
"created_by": meta.created_by,
@@ -41,12 +52,32 @@ def serialize_metadata(metadata: dict[str, pq.FileMetaData]) -> dict[str, Any]:
4152

4253

4354
def deserialize_metadata(data: dict[str, Any]) -> dict[str, pq.FileMetaData]:
44-
"""Safely deserialize metadata from JSON-compatible format."""
45-
result = {}
55+
"""Safely deserialize metadata from JSON-compatible format.
56+
57+
Expects base64-encoded bytes under the key ``serialized_metadata_b64``.
58+
Falls back to ``serialized_metadata`` if present, either as a base64 string or as raw bytes; raises ``ValueError`` otherwise.
59+
"""
60+
result: dict[str, pq.FileMetaData] = {}
4661
for path, meta_data in data.items():
4762
# Reconstruct FileMetaData from serialized data
48-
buf = pa.py_buffer(meta_data["serialized_metadata"])
49-
result[path] = pq.read_metadata(buf)
63+
if "serialized_metadata_b64" in meta_data:
64+
raw_bytes = base64.b64decode(meta_data["serialized_metadata_b64"]) # type: ignore[arg-type]
65+
else:
66+
# Backward-compatibility: if an older key is present and contains a
67+
# base64 string, try decoding it; if it's bytes-like, use directly.
68+
sm = meta_data.get("serialized_metadata")
69+
if isinstance(sm, str):
70+
raw_bytes = base64.b64decode(sm)
71+
elif isinstance(sm, (bytes, bytearray)):
72+
raw_bytes = bytes(sm)
73+
else:
74+
raise ValueError(
75+
"Serialized metadata missing or in unknown format for path: "
76+
+ str(path)
77+
)
78+
79+
reader = pa.BufferReader(raw_bytes)
80+
result[path] = pq.read_metadata(reader)
5081
return result
5182

5283

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ name = "pydala2"
2424
readme = "README.md"
2525
#repository = "https://github.com/legout/pydala2"
2626
requires-python = ">= 3.11"
27-
version = "0.21.1"
27+
version = "0.21.2"
2828

2929
[project.optional-dependencies]
3030
legacy = ["polars-lts-cpu>=0.20.4"]

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)