55import posixpath
66import re
77import tempfile
8+ import base64
89from collections import defaultdict
910from typing import Any
1011
2627
2728
2829def serialize_metadata (metadata : dict [str , pq .FileMetaData ]) -> dict [str , Any ]:
29- """Safely serialize metadata to JSON-compatible format."""
30- result = {}
30+ """Safely serialize metadata to a JSON-compatible format.
31+
32+ We write each ``FileMetaData`` to an in-memory buffer using
33+ ``write_metadata_file`` and base64-encode the resulting bytes for JSON.
34+ """
35+ result : dict [str , Any ] = {}
3136 for path , meta in metadata .items ():
32- # Convert FileMetaData to serializable format
37+ # Write FileMetaData to an in-memory buffer
38+ sink = pa .BufferOutputStream ()
39+ meta .write_metadata_file (sink )
40+ buf = sink .getvalue () # pyarrow.Buffer
41+ raw : bytes = buf .to_pybytes ()
42+ # Base64 encode to make it JSON serializable
43+ b64 = base64 .b64encode (raw ).decode ("ascii" )
3344 result [path ] = {
34- "serialized_metadata " : meta . serialize () ,
45+ "serialized_metadata_b64 " : b64 ,
3546 "num_rows" : meta .num_rows ,
3647 "num_columns" : len (meta .schema ),
3748 "created_by" : meta .created_by ,
@@ -41,12 +52,32 @@ def serialize_metadata(metadata: dict[str, pq.FileMetaData]) -> dict[str, Any]:
4152
4253
def _decode_metadata_payload(path: str, entry: dict[str, Any]) -> bytes:
    """Return the raw metadata bytes stored in one serialized entry.

    Args:
        path: Key of the entry in the serialized mapping; used only to make
            error messages actionable.
        entry: One value of the serialized mapping. The payload is read from
            ``serialized_metadata_b64`` or, when that key is absent, from the
            older ``serialized_metadata`` key.

    Returns:
        The decoded bytes, ready to be wrapped in a reader.

    Raises:
        ValueError: If no payload is present, if it has an unsupported type,
            or if it is not valid base64.
    """
    if "serialized_metadata_b64" in entry:
        encoded = entry["serialized_metadata_b64"]
    else:
        # Backward compatibility: the older key may hold either a base64
        # string or the raw bytes themselves.
        encoded = entry.get("serialized_metadata")
        if isinstance(encoded, (bytes, bytearray)):
            return bytes(encoded)
        if not isinstance(encoded, str):
            raise ValueError(
                "Serialized metadata missing or in unknown format for path: "
                + str(path)
            )

    try:
        # validate=True rejects characters outside the base64 alphabet instead
        # of silently dropping them, so a non-base64 legacy string fails here
        # rather than producing garbage bytes for the reader below.
        return base64.b64decode(encoded, validate=True)
    except (ValueError, TypeError) as exc:
        # binascii.Error is a ValueError subclass, so it is covered here.
        raise ValueError(
            f"Serialized metadata is not valid base64 for path: {path}"
        ) from exc


def deserialize_metadata(data: dict[str, Any]) -> dict[str, pq.FileMetaData]:
    """Rebuild ``FileMetaData`` objects from their JSON-compatible form.

    Each value of *data* is expected to carry base64-encoded bytes under the
    key ``serialized_metadata_b64``. When that key is absent, the older
    ``serialized_metadata`` key is accepted if it holds a base64 string or a
    bytes-like object.

    Args:
        data: Mapping from path to the serialized entry for that path.

    Returns:
        Mapping from the same paths to the metadata read back from the
        decoded bytes.

    Raises:
        ValueError: If an entry has no usable payload or the payload is not
            valid base64; the message names the offending path.
    """
    result: dict[str, pq.FileMetaData] = {}
    for path, meta_data in data.items():
        raw_bytes = _decode_metadata_payload(path, meta_data)
        # Read the metadata back from an in-memory reader over the bytes.
        result[path] = pq.read_metadata(pa.BufferReader(raw_bytes))
    return result
5182
5283
0 commit comments