From f0afc77f42f9fca7a423fa0eeb205b220ad3dd94 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Wed, 3 Dec 2025 14:10:28 +0100
Subject: [PATCH] [Python] Compat with pandas 3.0 changed default datetime unit

---
 python/pyarrow/pandas_compat.py     |  4 ++--
 python/pyarrow/tests/test_pandas.py | 12 +++++++-----
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index dfed76d3711..dfca59cbf5f 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -140,7 +140,7 @@ def get_extension_dtype_info(column):
         physical_dtype = str(cats.codes.dtype)
     elif hasattr(dtype, 'tz'):
         metadata = {'timezone': pa.lib.tzinfo_to_string(dtype.tz)}
-        physical_dtype = 'datetime64[ns]'
+        physical_dtype = f'datetime64[{dtype.unit}]'
     else:
         metadata = None
         physical_dtype = str(dtype)
@@ -1188,7 +1188,7 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
             if _pandas_api.is_ge_v3():
                 # with pandas 3+, to_datetime returns a unit depending on the string
                 # data, so we restore it to the original unit from the metadata
-                level = level.as_unit(np.datetime_data(dtype)[0])
+                level = level.as_unit(np.datetime_data(numpy_dtype)[0])
         # GH-41503: if the column index was decimal, restore to decimal
         elif pandas_dtype == "decimal":
             level = _pandas_api.pd.Index([decimal.Decimal(i) for i in level])
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 7f9b04eaabd..e41eb3d3801 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -212,13 +212,14 @@ def test_column_index_names_are_preserved(self):
         df.columns.names = ['a']
         _check_pandas_roundtrip(df, preserve_index=True)
 
-    def test_column_index_names_with_tz(self):
+    @pytest.mark.parametrize("tz", [None, "Europe/Brussels"])
+    def test_column_index_names_datetime(self, tz):
         # ARROW-13756
         # Bug if index is timezone aware DataTimeIndex
 
         df = pd.DataFrame(
             np.random.randn(5, 3),
-            columns=pd.date_range("2021-01-01", periods=3, freq="50D", tz="CET")
+            columns=pd.date_range("2021-01-01", periods=3, freq="50D", tz=tz)
         )
         _check_pandas_roundtrip(df, preserve_index=True)
 
@@ -451,7 +452,7 @@ def test_datetimetz_column_index(self):
         df = pd.DataFrame(
             [(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)],
             columns=pd.date_range(
-                start='2017-01-01', periods=3, tz='America/New_York'
+                start='2017-01-01', periods=3, tz='America/New_York', unit='us'
             )
         )
         t = pa.Table.from_pandas(df, preserve_index=True)
@@ -460,7 +461,7 @@ def test_datetimetz_column_index(self):
         column_indexes, = js['column_indexes']
         assert column_indexes['name'] is None
         assert column_indexes['pandas_type'] == 'datetimetz'
-        assert column_indexes['numpy_type'] == 'datetime64[ns]'
+        assert column_indexes['numpy_type'] == 'datetime64[us]'
         md = column_indexes['metadata']
         assert md['timezone'] == 'America/New_York'
 
@@ -709,7 +710,8 @@ def test_mismatch_metadata_schema(self):
         # It is possible that the metadata and actual schema is not fully
         # matching (eg no timezone information for tz-aware column)
         # -> to_pandas() conversion should not fail on that
-        df = pd.DataFrame({"datetime": pd.date_range("2020-01-01", periods=3)})
+        df = pd.DataFrame({"datetime": pd.date_range(
+            "2020-01-01", periods=3, unit='ns')})
 
         # OPTION 1: casting after conversion
         table = pa.Table.from_pandas(df)
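
Note (editor's illustration, not part of the patch): the change adapts to pandas 3.0 no
longer forcing every datetime to nanosecond resolution. The sketch below, assuming
pandas >= 2.0 for the `unit=` argument and `as_unit()`, shows why the pandas metadata
must record the actual unit instead of hard-coding 'datetime64[ns]', and how the
original unit can be restored from the stored numpy dtype via np.datetime_data; the
name `numpy_dtype` here merely stands in for the value read back from the metadata.

    import numpy as np
    import pandas as pd

    # A tz-aware index can carry a non-nanosecond unit.
    idx = pd.date_range("2021-01-01", periods=3, tz="Europe/Brussels", unit="us")

    # Record the index's real unit rather than assuming nanoseconds.
    physical_dtype = f"datetime64[{idx.dtype.unit}]"   # -> 'datetime64[us]'

    # When reconstructing a column index, to_datetime may return a different
    # unit under pandas 3+, so restore it from the stored numpy dtype.
    numpy_dtype = np.dtype("datetime64[us]")           # stand-in for metadata value
    level = pd.to_datetime(idx.tz_localize(None).astype(str))
    level = level.as_unit(np.datetime_data(numpy_dtype)[0])
    assert str(level.dtype) == "datetime64[us]"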