Skip to content

Commit 0d72f7e

Browse files
authored
GH-45382: [Python] Add support for pandas DataFrame.attrs (#47147)
### Rationale for this change Please see Issue #45382 ### What changes are included in this PR? Add support for pandas' attributes in metadata when writing to or reading from .parquet ### Are these changes tested? Yes, though the current implementation depends on pandas, which has similar functionality ### Are there any user-facing changes? Pandas will no longer need to work around the metadata handling on their side * GitHub Issue: #45382 Authored-by: Bogdan Romenskii <[email protected]> Signed-off-by: Rok Mihevc <[email protected]>
1 parent 1e240c2 commit 0d72f7e

File tree

2 files changed

+35
-0
lines changed

2 files changed

+35
-0
lines changed

python/pyarrow/pandas_compat.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,11 +275,14 @@ def construct_metadata(columns_to_convert, df, column_names, index_levels,
275275
else:
276276
index_descriptors = index_column_metadata = column_indexes = []
277277

278+
attributes = df.attrs if hasattr(df, "attrs") else {}
279+
278280
return {
279281
b'pandas': json.dumps({
280282
'index_columns': index_descriptors,
281283
'column_indexes': column_indexes,
282284
'columns': column_metadata + index_column_metadata,
285+
'attributes': attributes,
283286
'creator': {
284287
'library': 'pyarrow',
285288
'version': pa.__version__
@@ -782,11 +785,13 @@ def table_to_dataframe(
782785
):
783786
all_columns = []
784787
column_indexes = []
788+
attributes = {}
785789
pandas_metadata = table.schema.pandas_metadata
786790

787791
if not ignore_metadata and pandas_metadata is not None:
788792
all_columns = pandas_metadata['columns']
789793
column_indexes = pandas_metadata.get('column_indexes', [])
794+
attributes = pandas_metadata.get('attributes', {})
790795
index_descriptors = pandas_metadata['index_columns']
791796
table = _add_any_metadata(table, pandas_metadata)
792797
table, index = _reconstruct_index(table, index_descriptors,
@@ -814,6 +819,8 @@ def table_to_dataframe(
814819
for item in result
815820
]
816821
df = create_dataframe_from_blocks(blocks, index=index, columns=columns)
822+
df.attrs = attributes
823+
817824
return df
818825
else:
819826
from pandas.core.internals import BlockManager
@@ -829,6 +836,9 @@ def table_to_dataframe(
829836
df = DataFrame._from_mgr(mgr, mgr.axes)
830837
else:
831838
df = DataFrame(mgr)
839+
840+
df.attrs = attributes
841+
832842
return df
833843

834844

python/pyarrow/tests/parquet/test_pandas.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,31 @@ def test_merging_parquet_tables_with_different_pandas_metadata(tempdir):
101101
writer.write_table(table2)
102102

103103

104+
@pytest.mark.pandas
105+
def test_attributes_metadata_persistence(tempdir):
106+
# GH-45382: Add support for pandas DataFrame.attrs
107+
# During the .parquet file writing, the attrs are serialised into json
108+
# along with the rest of the pandas.DataFrame metadata.
109+
110+
filename = tempdir / "metadata_persistence.parquet"
111+
df = alltypes_sample(size=10000)
112+
df.attrs = {
113+
'float16': 'half-precision',
114+
'float32': 'single precision',
115+
'float64': 'double precision',
116+
'description': 'Attributes Persistence Test DataFrame',
117+
}
118+
119+
table = pa.Table.from_pandas(df)
120+
assert b'attributes' in table.schema.metadata[b'pandas']
121+
122+
_write_table(table, filename)
123+
metadata = pq.read_metadata(filename).metadata
124+
js = json.loads(metadata[b'pandas'].decode('utf8'))
125+
assert 'attributes' in js
126+
assert js['attributes'] == df.attrs
127+
128+
104129
@pytest.mark.pandas
105130
def test_pandas_parquet_column_multiindex(tempdir):
106131
df = alltypes_sample(size=10)

0 commit comments

Comments
 (0)