diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 2013f81d4da18..336ad8eee3be3 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -614,6 +614,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ - Bug in :attr:`is_year_start` where a DateTimeIndex constructed via a date_range with frequency 'MS' wouldn't have the correct year or quarter start attributes (:issue:`57377`) +- Bug in :class:`ArrowDtype` where ``.convert_dtypes(dtype_backend="pyarrow")`` stripped timezone information from timezone-aware PyArrow timestamps; timezone information is now preserved during conversion (:issue:`60237`) - Bug in :class:`DataFrame` raising ``ValueError`` when ``dtype`` is ``timedelta64`` and ``data`` is a list containing ``None`` (:issue:`60064`) - Bug in :class:`Timestamp` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``tzinfo`` or data (:issue:`48688`) - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`) @@ -638,7 +639,7 @@ Timedelta Timezones ^^^^^^^^^ -- +- Fixed an issue where ``.convert_dtypes(dtype_backend="pyarrow")`` stripped timezone information from timezone-aware PyArrow timestamps. Timezone data is now correctly preserved during conversions. 
(:issue:`60237`) - Numeric diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index a2c3a706ae29c..6ec077cecd63b 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -108,7 +108,7 @@ def _evaluate_numexpr(op, op_str, left_op, right_op): try: result = ne.evaluate( f"left_value {op_str} right_value", - local_dict={"left_value": left_value, "right_value": right_op}, + local_dict={"left_value": left_value, "right_value": right_value}, casting="safe", ) except TypeError: @@ -257,7 +257,11 @@ def where(cond, left_op, right_op, use_numexpr: bool = True): Whether to try to use numexpr. """ assert _where is not None - return _where(cond, left_op, right_op) if use_numexpr else _where_standard(cond, left_op, right_op) + return ( + _where(cond, left_op, right_op) + if use_numexpr + else _where_standard(cond, left_op, right_op) + ) def set_test_mode(v: bool = True) -> None: diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 4a75acce46632..166c9d47294cd 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -274,7 +274,9 @@ def stringify(value): # string quoting return TermValue(conv_val, stringify(conv_val), "string") else: - raise TypeError(f"Cannot compare {conv_val} of type {type(conv_val)} to {kind} column") + raise TypeError( + f"Cannot compare {conv_val} of type {type(conv_val)} to {kind} column" + ) def convert_values(self) -> None: pass diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 02b9291da9b31..b63145f6b8ad5 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1113,7 +1113,7 @@ def convert_dtypes( else: inferred_dtype = input_array.dtype - if dtype_backend == "pyarrow": + if dtype_backend == "pyarrow" and not isinstance(inferred_dtype, ArrowDtype): from pandas.core.arrays.arrow.array import to_pyarrow_type from pandas.core.arrays.string_ import 
StringDtype
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 1dd1b12d6ae95..788e1cf2bb3ca 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -2274,26 +2274,36 @@ def name(self) -> str:  # type: ignore[override]
 
     @cache_readonly
     def numpy_dtype(self) -> np.dtype:
-        """Return an instance of the related numpy dtype"""
-        if pa.types.is_timestamp(self.pyarrow_dtype):
-            # pa.timestamp(unit).to_pandas_dtype() returns ns units
-            # regardless of the pyarrow timestamp units.
-            # This can be removed if/when pyarrow addresses it:
-            # https://github.com/apache/arrow/issues/34462
-            return np.dtype(f"datetime64[{self.pyarrow_dtype.unit}]")
-        if pa.types.is_duration(self.pyarrow_dtype):
-            # pa.duration(unit).to_pandas_dtype() returns ns units
-            # regardless of the pyarrow duration units
-            # This can be removed if/when pyarrow addresses it:
-            # https://github.com/apache/arrow/issues/34462
-            return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]")
-        if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string(
-            self.pyarrow_dtype
-        ):
-            # pa.string().to_pandas_dtype() = object which we don't want
+        """
+        Return an instance of the related numpy dtype.
+
+        numpy has no timezone-aware dtype, so a tz-aware pyarrow timestamp
+        maps to the same ``datetime64[unit]`` as its tz-naive counterpart;
+        the timezone stays on ``pyarrow_dtype``.
+        """
+        pa_type = self.pyarrow_dtype
+
+        if pa.types.is_timestamp(pa_type):
+            # pa.timestamp(unit).to_pandas_dtype() returns ns units
+            # regardless of the pyarrow timestamp units:
+            # https://github.com/apache/arrow/issues/34462
+            # Build the dtype from the unit instead; tz-aware timestamps
+            # take the same path so their unit is kept as well.
+            return np.dtype(f"datetime64[{pa_type.unit}]")
+
+        if pa.types.is_duration(pa_type):
+            # pa.duration(unit).to_pandas_dtype() returns ns units
+            # regardless of the pyarrow duration units
+            # This can be removed if/when pyarrow addresses it:
+            # https://github.com/apache/arrow/issues/34462
+            return np.dtype(f"timedelta64[{pa_type.unit}]")
+
+        if pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type):
+            # pa.string().to_pandas_dtype() = object which we don't want
             return np.dtype(str)
+
         try:
-            return np.dtype(self.pyarrow_dtype.to_pandas_dtype())
+            return np.dtype(pa_type.to_pandas_dtype())
         except (NotImplementedError, TypeError):
             return np.dtype(object)
 
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index c6ac6368f2770..1942f481684cb 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -3505,3 +3505,27 @@ def test_map_numeric_na_action():
     result = ser.map(lambda x: 42, na_action="ignore")
     expected = pd.Series([42.0, 42.0, np.nan], dtype="float64")
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "tz", ["UTC", "America/New_York", "Europe/London", "Asia/Tokyo"]
+)
+def test_pyarrow_timestamp_tz_preserved(tz):
+    # GH 60237
+    s = pd.Series(
+        pd.to_datetime(range(5), unit="h", utc=True).tz_convert(tz),
+        dtype=f"timestamp[ns, tz={tz}][pyarrow]",
+    )
+
+    result = s.convert_dtypes(dtype_backend="pyarrow")
+    assert result.dtype == s.dtype, f"Expected {s.dtype}, got {result.dtype}"
+
+    assert str(result.iloc[0].tzinfo) == str(s.iloc[0].tzinfo)
+    tm.assert_series_equal(result, s)
+
+
+@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
+def test_pyarrow_timestamp_tz_numpy_dtype_keeps_unit(unit):
+    # GH 60237: numpy has no tz-aware dtype, but the unit must be kept
+    dtype = pd.Series([], dtype=f"timestamp[{unit}, tz=UTC][pyarrow]").dtype
+    assert dtype.numpy_dtype == np.dtype(f"datetime64[{unit}]")