Skip to content

Commit 0d72f7e

Browse files
authored
GH-45382: [Python] Add support for pandas DataFrame.attrs (#47147)
### Rationale for this change Please see Issue #45382 ### What changes are included in this PR? Add support for pandas' attributes in metadata when writing to or reading from .parquet ### Are these changes tested? Yes, though the current implementation depends on pandas, which has similar functionality ### Are there any user-facing changes? Pandas will no longer need to work around the metadata handling on their side * GitHub Issue: #45382 Authored-by: Bogdan Romenskii <[email protected]> Signed-off-by: Rok Mihevc <[email protected]>
1 parent 1e240c2 commit 0d72f7e

File tree

2 files changed

+35
-0
lines changed

2 files changed

+35
-0
lines changed

python/pyarrow/pandas_compat.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,11 +275,14 @@ def construct_metadata(columns_to_convert, df, column_names, index_levels,
275275
else:
276276
index_descriptors = index_column_metadata = column_indexes = []
277277

278+
attributes = df.attrs if hasattr(df, "attrs") else {}
279+
278280
return {
279281
b'pandas': json.dumps({
280282
'index_columns': index_descriptors,
281283
'column_indexes': column_indexes,
282284
'columns': column_metadata + index_column_metadata,
285+
'attributes': attributes,
283286
'creator': {
284287
'library': 'pyarrow',
285288
'version': pa.__version__
@@ -782,11 +785,13 @@ def table_to_dataframe(
782785
):
783786
all_columns = []
784787
column_indexes = []
788+
attributes = {}
785789
pandas_metadata = table.schema.pandas_metadata
786790

787791
if not ignore_metadata and pandas_metadata is not None:
788792
all_columns = pandas_metadata['columns']
789793
column_indexes = pandas_metadata.get('column_indexes', [])
794+
attributes = pandas_metadata.get('attributes', {})
790795
index_descriptors = pandas_metadata['index_columns']
791796
table = _add_any_metadata(table, pandas_metadata)
792797
table, index = _reconstruct_index(table, index_descriptors,
@@ -814,6 +819,8 @@ def table_to_dataframe(
814819
for item in result
815820
]
816821
df = create_dataframe_from_blocks(blocks, index=index, columns=columns)
822+
df.attrs = attributes
823+
817824
return df
818825
else:
819826
from pandas.core.internals import BlockManager
@@ -829,6 +836,9 @@ def table_to_dataframe(
829836
df = DataFrame._from_mgr(mgr, mgr.axes)
830837
else:
831838
df = DataFrame(mgr)
839+
840+
df.attrs = attributes
841+
832842
return df
833843

834844

python/pyarrow/tests/parquet/test_pandas.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,31 @@ def test_merging_parquet_tables_with_different_pandas_metadata(tempdir):
101101
writer.write_table(table2)
102102

103103

104+
@pytest.mark.pandas
105+
def test_attributes_metadata_persistence(tempdir):
106+
# GH-45382: Add support for pandas DataFrame.attrs
107+
# During the .parquet file writing, the attrs are serialised into json
108+
# along with the rest of the pandas.DataFrame metadata.
109+
110+
filename = tempdir / "metadata_persistence.parquet"
111+
df = alltypes_sample(size=10000)
112+
df.attrs = {
113+
'float16': 'half-precision',
114+
'float32': 'single precision',
115+
'float64': 'double precision',
116+
'description': 'Attributes Persistence Test DataFrame',
117+
}
118+
119+
table = pa.Table.from_pandas(df)
120+
assert b'attributes' in table.schema.metadata[b'pandas']
121+
122+
_write_table(table, filename)
123+
metadata = pq.read_metadata(filename).metadata
124+
js = json.loads(metadata[b'pandas'].decode('utf8'))
125+
assert 'attributes' in js
126+
assert js['attributes'] == df.attrs
127+
128+
104129
@pytest.mark.pandas
105130
def test_pandas_parquet_column_multiindex(tempdir):
106131
df = alltypes_sample(size=10)

0 commit comments

Comments
 (0)