From 6c99f92bdff39daf4ee8c24f2a2f27a2302480ca Mon Sep 17 00:00:00 2001 From: road Date: Wed, 13 Nov 2024 12:05:11 -0500 Subject: [PATCH 1/9] removed if blocks, added test, and added to whats new --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/dtypes/dtypes.py | 12 ------------ pandas/tests/frame/methods/test_convert_dtypes.py | 8 ++++++++ 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index de69166b8c196..03e64319ff791 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -649,6 +649,7 @@ Conversion - Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`) - Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`) - Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`) +- Bug in :meth:`Series.convert_dtypes` stripping the timezone from an already timezone-aware pyarrow timestamp dtype (:issue:`60237`) Strings ^^^^^^^ diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 96b0aa16940a6..67c09884d03dd 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2277,18 +2277,6 @@ def name(self) -> str: # type: ignore[override] @cache_readonly def numpy_dtype(self) -> np.dtype: """Return an instance of the related numpy dtype""" - if pa.types.is_timestamp(self.pyarrow_dtype): - # pa.timestamp(unit).to_pandas_dtype() returns ns units - # regardless of the pyarrow timestamp units. 
- # This can be removed if/when pyarrow addresses it: - # https://github.com/apache/arrow/issues/34462 - return np.dtype(f"datetime64[{self.pyarrow_dtype.unit}]") - if pa.types.is_duration(self.pyarrow_dtype): - # pa.duration(unit).to_pandas_dtype() returns ns units - # regardless of the pyarrow duration units - # This can be removed if/when pyarrow addresses it: - # https://github.com/apache/arrow/issues/34462 - return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( self.pyarrow_dtype ): diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index e7f6e5d625d3e..bf5a7da44ceb7 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -196,3 +196,11 @@ def test_convert_dtypes_from_arrow(self): result = df.convert_dtypes() expected = df.astype({"a": "string[python]"}) tm.assert_frame_equal(result, expected) + + def test_convert_dtypes_timezone_series(self): + # GH#60237 + ser = pd.Series(pd.date_range(start='2020-01-01', periods=5, freq='h', tz='UTC')) + ser = ser.astype("timestamp[ns, tz=UTC][pyarrow]") + expected = ser + result = ser.convert_dtypes(dtype_backend="pyarrow") + tm.assert_series_equal(result, expected) \ No newline at end of file From b578b6f577c4c1cb5f2b69d7015507455bec3f29 Mon Sep 17 00:00:00 2001 From: road-1 <165108013+road-1@users.noreply.github.com> Date: Thu, 14 Nov 2024 20:39:45 -0500 Subject: [PATCH 2/9] added duration if statement --- pandas/core/dtypes/dtypes.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 67c09884d03dd..ba30f929333a5 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2277,6 +2277,12 @@ def name(self) -> str: # type: ignore[override] @cache_readonly def numpy_dtype(self) -> np.dtype: """Return an instance of the related 
numpy dtype""" + if pa.types.is_duration(self.pyarrow_dtype): + # pa.duration(unit).to_pandas_dtype() returns ns units + # regardless of the pyarrow duration units + # This can be removed if/when pyarrow addresses it: + # https://github.com/apache/arrow/issues/34462 + return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( self.pyarrow_dtype ): From 7d079ace8af673f236f1d41f7519608864b83289 Mon Sep 17 00:00:00 2001 From: road-1 <165108013+road-1@users.noreply.github.com> Date: Thu, 14 Nov 2024 20:45:41 -0500 Subject: [PATCH 3/9] fixing line length in test file --- pandas/tests/frame/methods/test_convert_dtypes.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index bf5a7da44ceb7..165abe089b479 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -199,8 +199,11 @@ def test_convert_dtypes_from_arrow(self): def test_convert_dtypes_timezone_series(self): # GH#60237 - ser = pd.Series(pd.date_range(start='2020-01-01', periods=5, freq='h', tz='UTC')) + ser = pd.Series(pd.date_range(start='2020-01-01', + periods=5, + freq='h', + tz='UTC')) ser = ser.astype("timestamp[ns, tz=UTC][pyarrow]") expected = ser result = ser.convert_dtypes(dtype_backend="pyarrow") - tm.assert_series_equal(result, expected) \ No newline at end of file + tm.assert_series_equal(result, expected) From cab7b557c3634ff0a9d25d689c7fcc221e121967 Mon Sep 17 00:00:00 2001 From: road-1 <165108013+road-1@users.noreply.github.com> Date: Tue, 19 Nov 2024 12:47:46 -0500 Subject: [PATCH 4/9] re-remove if --- pandas/core/dtypes/dtypes.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index ba30f929333a5..67c09884d03dd 100644 --- a/pandas/core/dtypes/dtypes.py +++ 
b/pandas/core/dtypes/dtypes.py @@ -2277,12 +2277,6 @@ def name(self) -> str: # type: ignore[override] @cache_readonly def numpy_dtype(self) -> np.dtype: """Return an instance of the related numpy dtype""" - if pa.types.is_duration(self.pyarrow_dtype): - # pa.duration(unit).to_pandas_dtype() returns ns units - # regardless of the pyarrow duration units - # This can be removed if/when pyarrow addresses it: - # https://github.com/apache/arrow/issues/34462 - return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( self.pyarrow_dtype ): From d1bfc5363ccffff0bd2b728f28a7658968098fef Mon Sep 17 00:00:00 2001 From: road-1 <165108013+road-1@users.noreply.github.com> Date: Tue, 19 Nov 2024 12:48:27 -0500 Subject: [PATCH 5/9] changing test location --- pandas/tests/frame/methods/test_convert_dtypes.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 165abe089b479..e7f6e5d625d3e 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -196,14 +196,3 @@ def test_convert_dtypes_from_arrow(self): result = df.convert_dtypes() expected = df.astype({"a": "string[python]"}) tm.assert_frame_equal(result, expected) - - def test_convert_dtypes_timezone_series(self): - # GH#60237 - ser = pd.Series(pd.date_range(start='2020-01-01', - periods=5, - freq='h', - tz='UTC')) - ser = ser.astype("timestamp[ns, tz=UTC][pyarrow]") - expected = ser - result = ser.convert_dtypes(dtype_backend="pyarrow") - tm.assert_series_equal(result, expected) From 5ae19ae7852431ec814d77f7448154827c8bb8f6 Mon Sep 17 00:00:00 2001 From: road-1 <165108013+road-1@users.noreply.github.com> Date: Tue, 19 Nov 2024 12:49:24 -0500 Subject: [PATCH 6/9] changing test location to test_arrow.py --- pandas/tests/extension/test_arrow.py | 10 ++++++++++ 1 file changed, 10 
insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index f0ff11e5fa3f7..90d50dc086bc7 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3507,3 +3507,13 @@ def test_map_numeric_na_action(): result = ser.map(lambda x: 42, na_action="ignore") expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") tm.assert_series_equal(result, expected) + +def test_convert_dtypes_timezone_series(): + # GH#60237 + ser = pd.Series(pd.date_range(start='2020-01-01', + periods=5, freq='h', + tz='UTC'), + dtype="timestamp[ns, tz=UTC][pyarrow]") + expected = ser + result = ser.convert_dtypes(dtype_backend="pyarrow") + tm.assert_series_equal(result, expected) From ae52904a18d299699be9adbb1fe081a1c3c693ca Mon Sep 17 00:00:00 2001 From: road-1 <165108013+road-1@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:02:53 -0500 Subject: [PATCH 7/9] added if statement to pre-existing if statements to check for tz --- pandas/core/dtypes/dtypes.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 67c09884d03dd..de47d0c1b3cd0 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2277,6 +2277,21 @@ def name(self) -> str: # type: ignore[override] @cache_readonly def numpy_dtype(self) -> np.dtype: """Return an instance of the related numpy dtype""" + if pa.types.is_timestamp(self.pyarrow_dtype): + # pa.timestamp(unit).to_pandas_dtype() returns ns units + # regardless of the pyarrow timestamp units. 
+ # This can be removed if/when pyarrow addresses it: + # https://github.com/apache/arrow/issues/34462 + if self.pyarrow_dtype.tz is not None: + np.dtype(self.pyarrow_dtype.to_pandas_dtype()) + else: + return np.dtype(f"datetime64[{self.pyarrow_dtype.unit}]") + if pa.types.is_duration(self.pyarrow_dtype): + # pa.duration(unit).to_pandas_dtype() returns ns units + # regardless of the pyarrow duration units + # This can be removed if/when pyarrow addresses it: + # https://github.com/apache/arrow/issues/34462 + return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( self.pyarrow_dtype ): From 67f55f33ee09ce799c5513a5130b5c1e431d2780 Mon Sep 17 00:00:00 2001 From: road-1 <165108013+road-1@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:24:59 -0500 Subject: [PATCH 8/9] simplified if statements --- pandas/core/dtypes/dtypes.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index de47d0c1b3cd0..16c52b0b1d674 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2282,9 +2282,7 @@ def numpy_dtype(self) -> np.dtype: # regardless of the pyarrow timestamp units. 
# This can be removed if/when pyarrow addresses it: # https://github.com/apache/arrow/issues/34462 - if self.pyarrow_dtype.tz is not None: - np.dtype(self.pyarrow_dtype.to_pandas_dtype()) - else: + if self.pyarrow_dtype.tz is None: return np.dtype(f"datetime64[{self.pyarrow_dtype.unit}]") if pa.types.is_duration(self.pyarrow_dtype): # pa.duration(unit).to_pandas_dtype() returns ns units From adfcf405b143637aa59ad1a02173563765b5674a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 20 Nov 2024 17:46:58 +0000 Subject: [PATCH 9/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/tests/extension/test_arrow.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 426350f45f232..1033dabef75b6 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3508,12 +3508,13 @@ def test_map_numeric_na_action(): expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") tm.assert_series_equal(result, expected) + def test_convert_dtypes_timezone_series(): # GH#60237 - ser = pd.Series(pd.date_range(start='2020-01-01', - periods=5, freq='h', - tz='UTC'), - dtype="timestamp[ns, tz=UTC][pyarrow]") + ser = pd.Series( + pd.date_range(start="2020-01-01", periods=5, freq="h", tz="UTC"), + dtype="timestamp[ns, tz=UTC][pyarrow]", + ) expected = ser result = ser.convert_dtypes(dtype_backend="pyarrow") tm.assert_series_equal(result, expected)