From 68b6c7c1cf3cef86d48a5fc52f4d56d31f8b52b2 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 21 Apr 2024 22:33:00 +0200 Subject: [PATCH 001/163] Remove cast to numpy for series supporting NA as na_value in map function --- pandas/core/arrays/masked.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 190888d281ea9..51c3d8d326d38 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1317,7 +1317,10 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) def map(self, mapper, na_action=None): - return map_array(self.to_numpy(), mapper, na_action=na_action) + if self.dtype.na_value is libmissing.NA: + return map_array(self, mapper, na_action=na_action) + else: + return map_array(self.to_numpy(), mapper, na_action=na_action) @overload def any( From dcc8dab2e3c780393a612e1edcd0ef161e37d7cf Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 21 Apr 2024 22:41:30 +0200 Subject: [PATCH 002/163] Add test for map operation applied on series supporting NA as na_value --- pandas/tests/series/methods/test_map.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index f4f72854e50d3..d14b967853580 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -261,6 +261,13 @@ def test_map_int(): assert not isna(merged["c"]) +def test_map_int_with_pd_na(): + s = Series([pd.NA, 42], dtype="Int64") + result = s.map(lambda x: 1 if x is pd.NA else 2) + expected = Series([1, 2]) + tm.assert_series_equal(result, expected) + + def test_map_type_inference(): s = Series(range(3)) s2 = s.map(lambda x: np.where(x == 0, 0, 1)) From 19215b79b52deb64f4840ed9d4c7a3b8204aeef5 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Tue, 23 Apr 2024 22:35:38 +0200 Subject: [PATCH 003/163] Adapt test_map test to take into account series containing pd.NA --- pandas/tests/extension/test_masked.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 69ce42203d510..431c9b30c1575 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -17,6 +17,7 @@ import numpy as np import pytest +from pandas._libs import missing as libmissing from pandas.compat import ( IS64, is_platform_windows, @@ -171,17 +172,13 @@ class TestMaskedArrays(base.ExtensionTests): @pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map(self, data_missing, na_action): result = data_missing.map(lambda x: x, na_action=na_action) - if data_missing.dtype == Float32Dtype(): - # map roundtrips through objects, which converts to float64 - expected = data_missing.to_numpy(dtype="float64", na_value=np.nan) - else: - expected = data_missing.to_numpy() + expected = data_missing.astype(object, copy=True) tm.assert_numpy_array_equal(result, expected) def test_map_na_action_ignore(self, data_missing_for_sorting): zero = data_missing_for_sorting[2] result = data_missing_for_sorting.map(lambda x: zero, na_action="ignore") - if data_missing_for_sorting.dtype.kind == "b": + if data_missing_for_sorting.dtype.na_value is libmissing.NA: 
expected = np.array([False, pd.NA, False], dtype=object) else: expected = np.array([zero, np.nan, zero]) From 616620c837329334c81a115bdf2ae8190f470ae9 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 26 Apr 2024 20:34:34 +0200 Subject: [PATCH 004/163] Add an entry in Conversion section (issue 57390) --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c77348b365370..a78607ad2bdb0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -388,6 +388,7 @@ Conversion - Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`) - Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`) - Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`) +- Bug in :meth:`BaseMaskedArray.map` was casting ``pd.NA`` to ``np.nan``. (:issue:`57390`) Strings ^^^^^^^ From 8473e73b7bb1ebb52853bd419c3779ad4659b7c3 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 26 Apr 2024 20:40:55 +0200 Subject: [PATCH 005/163] Correct whatsnew order with pre commit --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a78607ad2bdb0..b1c2febcad0de 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -385,10 +385,10 @@ Numeric Conversion ^^^^^^^^^^ +- Bug in :meth:`BaseMaskedArray.map` was casting ``pd.NA`` to ``np.nan``. (:issue:`57390`) - Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`) - Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`) - Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`) -- Bug in :meth:`BaseMaskedArray.map` was casting ``pd.NA`` to ``np.nan``. (:issue:`57390`) Strings ^^^^^^^ From a17d8b53d4748227cc0cec455bb8f87aacf0acd5 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 27 May 2024 13:18:31 +0200 Subject: [PATCH 006/163] Add the possibility to process pd.NA values --- pandas/_libs/lib.pyi | 4 ++++ pandas/_libs/lib.pyx | 17 +++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index daaaacee3487d..ec679f7bfd9c9 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -74,6 +74,8 @@ def map_infer( *, convert: Literal[False], ignore_na: bool = ..., + mask: np.ndarray = ..., + na_value: Any = ..., ) -> np.ndarray: ... @overload def map_infer( @@ -82,6 +84,8 @@ def map_infer( *, convert: bool = ..., ignore_na: bool = ..., + mask: np.ndarray = ..., + na_value: Any = ..., ) -> ArrayLike: ... 
@overload def maybe_convert_objects( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 6a31ce84ed418..a4b9c1df5bc6a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2915,7 +2915,13 @@ def map_infer_mask( @cython.boundscheck(False) @cython.wraparound(False) def map_infer( - ndarray arr, object f, *, bint convert=True, bint ignore_na=False + ndarray arr, + object f, + *, + bint convert=True, + bint ignore_na=False, + const uint8_t[:] mask=None, + object na_value=None, ) -> "ArrayLike": """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2927,6 +2933,10 @@ def map_infer( convert : bint ignore_na : bint If True, NA values will not have f applied + mask : ndarray, optional + uint8 dtype ndarray indicating na_value to apply `f` to. + na_value : Any, optional + The input value to use for masked values. Returns ------- @@ -2943,7 +2953,10 @@ def map_infer( if ignore_na and checknull(arr[i]): result[i] = arr[i] continue - val = f(arr[i]) + elif mask is not None and na_value is not None and mask[i]: + val = f(na_value) + else: + val = f(arr[i]) if cnp.PyArray_IsZeroDim(val): # unbox 0-dim arrays, GH#690 From 1f2965cb368780736b195c7681ea18e7588567b5 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 27 May 2024 13:19:37 +0200 Subject: [PATCH 007/163] Add the possibility to process pd.NA values --- pandas/core/algorithms.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 33beef23197bd..0010c1548a107 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -22,6 +22,7 @@ hashtable as htable, iNaT, lib, + missing as libmissing, ) from pandas._typing import ( AnyArrayLike, @@ -1679,8 +1680,25 @@ def map_array( return arr.copy() # we must convert to python types - values = arr.astype(object, copy=False) + values_as_object = arr.astype(object, copy=False) if na_action is None: - return lib.map_infer(values, mapper) + if ( + isinstance(arr, ABCExtensionArray) + and arr._hasna + and arr.dtype.na_value is libmissing.NA + ): + return lib.map_infer( + arr._data, + mapper, + mask=isna(values_as_object).view(np.uint8), + na_value=arr._na_value, + ) + else: + return lib.map_infer(arr.astype(object, copy=False), mapper) else: - return lib.map_infer_mask(values, mapper, mask=isna(values).view(np.uint8)) + return lib.map_infer_mask( + arr.astype(object, copy=False), + mapper, + mask=isna(values_as_object).view(np.uint8), + convert=True, + ) From 70c2b8ac5078bc590c4d2199dde5ca3e99190aed Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 27 May 2024 13:20:21 +0200 Subject: [PATCH 008/163] Remove test ambiguity with pd.NA processing --- pandas/tests/series/methods/test_map.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index d14b967853580..5e214735086b3 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -262,9 +262,9 @@ def test_map_int(): def test_map_int_with_pd_na(): - s = Series([pd.NA, 42], dtype="Int64") - result = s.map(lambda x: 1 if x is pd.NA else 2) - expected = Series([1, 2]) + s = Series([pd.NA, 11, 22, pd.NA], dtype="Int64") + result = s.map(lambda x: 5 if x is pd.NA else 2 * x) + expected = Series([5, 22, 44, 5]) tm.assert_series_equal(result, expected) From 
c7fe27bd9e6a4580c644fb7294ea2302aca16e48 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 27 May 2024 11:42:06 +0000 Subject: [PATCH 009/163] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/core/arrays/masked.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index c1d9009493a5b..de797a3ed179d 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1323,7 +1323,7 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): return map_array(self, mapper, na_action=na_action) else: return map_array(self.to_numpy(), mapper, na_action=na_action) - + @overload def any( self, *, skipna: Literal[True] = ..., axis: AxisInt | None = ..., **kwargs From 492d167bb86e7026fc801d8922f26e33d1db7dc9 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 27 May 2024 13:53:05 +0200 Subject: [PATCH 010/163] Code clean up --- pandas/core/algorithms.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0010c1548a107..82a3a1120d88e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1697,8 +1697,5 @@ def map_array( return lib.map_infer(arr.astype(object, copy=False), mapper) else: return lib.map_infer_mask( - arr.astype(object, copy=False), - mapper, - mask=isna(values_as_object).view(np.uint8), - convert=True, + values_as_object, mapper, mask=isna(values_as_object).view(np.uint8) ) From 49596c90f3cec0376d2585082f10b19dea7a4b7e Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 29 May 2024 10:20:19 +0200 Subject: [PATCH 011/163] Limit NA management to BooleanArray, FloatingArray and IntegerArray types --- pandas/core/algorithms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 82a3a1120d88e..73af8c79bacc6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1683,7 +1683,7 @@ def map_array( values_as_object = arr.astype(object, copy=False) if na_action is None: if ( - isinstance(arr, ABCExtensionArray) + isinstance(arr.dtype, BaseMaskedDtype) and arr._hasna and arr.dtype.na_value is libmissing.NA ): @@ -1691,10 +1691,10 @@ def map_array( arr._data, mapper, mask=isna(values_as_object).view(np.uint8), - na_value=arr._na_value, + na_value=arr.dtype.na_value, ) else: - return lib.map_infer(arr.astype(object, copy=False), mapper) + return lib.map_infer(values_as_object, mapper) else: return lib.map_infer_mask( values_as_object, mapper, mask=isna(values_as_object).view(np.uint8) From 5eb85ff20c4e6acd3757392bc35753a7be610896 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 29 May 2024 23:03:41 +0200 Subject: [PATCH 012/163] Try to correct BaseMaskedArray cast error detected by mypy --- pandas/core/algorithms.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 73af8c79bacc6..b5a7ede24f8f9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1682,17 +1682,15 @@ def map_array( # we must convert to python types values_as_object = arr.astype(object, copy=False) if na_action is None: - if ( - 
isinstance(arr.dtype, BaseMaskedDtype) - and arr._hasna - and arr.dtype.na_value is libmissing.NA - ): - return lib.map_infer( - arr._data, - mapper, - mask=isna(values_as_object).view(np.uint8), - na_value=arr.dtype.na_value, - ) + if isinstance(arr.dtype, BaseMaskedDtype): + arr = cast("BaseMaskedArray", arr) + if arr._hasna and arr.dtype.na_value is libmissing.NA: + return lib.map_infer( + arr._data, + mapper, + mask=isna(values_as_object).view(np.uint8), + na_value=arr.dtype.na_value, + ) else: return lib.map_infer(values_as_object, mapper) else: From 4f5cfe54a4242e654d8facb6edd3135f91777839 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 29 May 2024 23:32:44 +0200 Subject: [PATCH 013/163] Correct typo error: missing else condition --- pandas/core/algorithms.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b5a7ede24f8f9..501519f30fbac 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1691,6 +1691,8 @@ def map_array( mask=isna(values_as_object).view(np.uint8), na_value=arr.dtype.na_value, ) + else: + return lib.map_infer(arr._data, mapper) else: return lib.map_infer(values_as_object, mapper) else: From 32ceaa3ad324315a6d2f34f0101c9d74f63aeaa0 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 30 May 2024 16:52:04 +0200 Subject: [PATCH 014/163] Try to correct mypy error with mask parameter in map_infer --- pandas/_libs/lib.pyi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index ec679f7bfd9c9..a4b2045abfb98 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -74,7 +74,7 @@ def map_infer( *, convert: Literal[False], ignore_na: bool = ..., - mask: np.ndarray = ..., + mask: npt.NDArray[np.bool_] | None = ..., na_value: Any = ..., ) -> np.ndarray: ... @overload @@ -84,7 +84,7 @@ def map_infer( *, convert: bool = ..., ignore_na: bool = ..., - mask: np.ndarray = ..., + mask: npt.NDArray[np.bool_] | None = ..., na_value: Any = ..., ) -> ArrayLike: ... 
@overload From 816a0d23a6bbc5012dce372de08ec6bee16c9efa Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 31 May 2024 11:01:13 +0200 Subject: [PATCH 015/163] Code clean up: simplify map_infer calls --- pandas/core/algorithms.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 501519f30fbac..62ae5080590ef 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -22,7 +22,6 @@ hashtable as htable, iNaT, lib, - missing as libmissing, ) from pandas._typing import ( AnyArrayLike, @@ -1679,23 +1678,22 @@ def map_array( if not len(arr): return arr.copy() - # we must convert to python types - values_as_object = arr.astype(object, copy=False) - if na_action is None: - if isinstance(arr.dtype, BaseMaskedDtype): - arr = cast("BaseMaskedArray", arr) - if arr._hasna and arr.dtype.na_value is libmissing.NA: - return lib.map_infer( - arr._data, - mapper, - mask=isna(values_as_object).view(np.uint8), - na_value=arr.dtype.na_value, - ) - else: - return lib.map_infer(arr._data, mapper) - else: - return lib.map_infer(values_as_object, mapper) + na_value = None + if isinstance(arr.dtype, BaseMaskedDtype): + arr = cast("BaseMaskedArray", arr) + values = arr._data + if arr._hasna: + na_value = arr.dtype.na_value else: - return lib.map_infer_mask( - values_as_object, mapper, mask=isna(values_as_object).view(np.uint8) + # we must convert to python types + values = arr.astype(object, copy=False) + mask = isna(arr) + if na_action is None: + return lib.map_infer( + values, + mapper, + mask=mask, + na_value=na_value, ) + else: + return lib.map_infer_mask(values, mapper, mask=mask) From 7cb37df7926ff61bae0cdbc78fffc3f68a1dd65a Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 31 May 2024 12:48:44 +0200 Subject: [PATCH 016/163] Correct values input type for map_infer_mask --- pandas/core/algorithms.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 62ae5080590ef..666ef1092e5d7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1679,7 +1679,8 @@ def map_array( return arr.copy() na_value = None - if isinstance(arr.dtype, BaseMaskedDtype): + mask = isna(arr) + if isinstance(arr.dtype, BaseMaskedDtype) and na_action is None: arr = cast("BaseMaskedArray", arr) values = arr._data if arr._hasna: @@ -1687,7 +1688,7 @@ def map_array( else: # we must convert to python types values = arr.astype(object, copy=False) - mask = isna(arr) + if na_action is None: return lib.map_infer( values, From 5d7ad8bec99b2143f52aa43792e9564b09a11da8 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 31 May 2024 17:58:57 +0200 Subject: [PATCH 017/163] Remove unnecessary cast with to_numpy before map_array call --- pandas/core/arrays/masked.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index de797a3ed179d..483ddcdccf274 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1319,10 +1319,7 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) def map(self, mapper, na_action: Literal["ignore"] | None = None): - if self.dtype.na_value is 
libmissing.NA: - return map_array(self, mapper, na_action=na_action) - else: - return map_array(self.to_numpy(), mapper, na_action=na_action) + return map_array(self, mapper, na_action=na_action) @overload def any( From b9631a3a38859fc1fbdf15797d0003dc0da5e629 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Tue, 11 Jun 2024 18:29:53 +0200 Subject: [PATCH 018/163] Manage ExtensionArray and convert to nullable dtype --- pandas/core/algorithms.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 666ef1092e5d7..579c3e80f4d70 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1685,6 +1685,11 @@ def map_array( values = arr._data if arr._hasna: na_value = arr.dtype.na_value + elif isinstance(arr.dtype, ExtensionDtype) and na_action is None: + arr = cast("ExtensionArray", arr) + values = np.asarray(arr) + if arr._hasna: + na_value = arr.dtype.na_value else: # we must convert to python types values = arr.astype(object, copy=False) @@ -1695,6 +1700,7 @@ def map_array( mapper, mask=mask, na_value=na_value, + convert_to_nullable_dtype=True, ) else: return lib.map_infer_mask(values, mapper, mask=mask) From 6c85b64cc81b7038f67b542091481507e4466445 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Tue, 11 Jun 2024 18:36:57 +0200 Subject: [PATCH 019/163] Add convert_to_nullable_dtype to map_infer (used in maybe_convert_objects) --- pandas/_libs/lib.pyi | 2 ++ pandas/_libs/lib.pyx | 9 ++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index a4b2045abfb98..8f4c5ec2b4e36 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -76,6 +76,7 @@ def map_infer( ignore_na: bool = ..., mask: npt.NDArray[np.bool_] | None = ..., na_value: Any = ..., + convert_to_nullable_dtype: Literal[False] = ..., ) -> np.ndarray: ... @overload def map_infer( @@ -86,6 +87,7 @@ def map_infer( ignore_na: bool = ..., mask: npt.NDArray[np.bool_] | None = ..., na_value: Any = ..., + convert_to_nullable_dtype: Literal[False] = ..., ) -> ArrayLike: ... @overload def maybe_convert_objects( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e82858f28ef59..b2adc729466e2 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2938,6 +2938,7 @@ def map_infer( bint ignore_na=False, const uint8_t[:] mask=None, object na_value=None, + bint convert_to_nullable_dtype=False, ) -> "ArrayLike": """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2953,6 +2954,9 @@ def map_infer( uint8 dtype ndarray indicating na_value to apply `f` to. na_value : Any, optional The input value to use for masked values. + convert_to_nullable_dtype : bool, default False + If an array-like object contains only integer or boolean values (and NaN) is + encountered, whether to convert and return an Boolean/IntegerArray. 
Returns ------- @@ -2981,7 +2985,10 @@ def map_infer( result[i] = val if convert: - return maybe_convert_objects(result) + return maybe_convert_objects( + result, + convert_to_nullable_dtype=convert_to_nullable_dtype + ) else: return result From c84932f567ef56c6a7bb784e02a954bdc55d95f2 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 12 Jun 2024 19:28:23 +0200 Subject: [PATCH 020/163] Add convert_to_nullable_dtype to map_infer_mask (used in maybe_convert_objects) --- pandas/_libs/lib.pyi | 2 ++ pandas/_libs/lib.pyx | 6 +++++- pandas/core/algorithms.py | 4 +++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 8f4c5ec2b4e36..909ac8100b75f 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -184,6 +184,7 @@ def map_infer_mask( convert: Literal[False], na_value: Any = ..., dtype: np.dtype = ..., + convert_to_nullable_dtype: bool = ..., ) -> np.ndarray: ... @overload def map_infer_mask( @@ -194,6 +195,7 @@ def map_infer_mask( convert: bool = ..., na_value: Any = ..., dtype: np.dtype = ..., + convert_to_nullable_dtype: bool = ..., ) -> ArrayLike: ... def indices_fast( index: npt.NDArray[np.intp], diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index b2adc729466e2..b01dea6c3157c 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2870,6 +2870,7 @@ def map_infer_mask( *, bint convert=True, object na_value=no_default, + bint convert_to_nullable_dtype=False, cnp.dtype dtype=np.dtype(object) ) -> "ArrayLike": """ @@ -2923,7 +2924,10 @@ def map_infer_mask( PyArray_ITER_NEXT(result_it) if convert: - return maybe_convert_objects(result) + return maybe_convert_objects( + result, + convert_to_nullable_dtype=convert_to_nullable_dtype + ) else: return result diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 579c3e80f4d70..d1bde1c1dfbd2 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1703,4 +1703,6 @@ def map_array( convert_to_nullable_dtype=True, ) else: - return lib.map_infer_mask(values, mapper, mask=mask) + return lib.map_infer_mask( + values, mapper, mask=mask, convert_to_nullable_dtype=True + ) From 421e7799662a006c17ed1d1a77072d9a9d7bb30a Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 12 Jun 2024 19:31:52 +0200 Subject: [PATCH 021/163] Conversion to numpy object is not necessary anymore --- pandas/core/arrays/arrow/array.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3d55513ab914c..cf731fdad8a07 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -53,7 +53,6 @@ ops, roperator, ) -from pandas.core.algorithms import map_array from pandas.core.arraylike import OpsMixin from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays._utils import to_numpy_dtype_inference @@ -1424,10 +1423,8 @@ def to_numpy( return result def map(self, mapper, na_action: Literal["ignore"] | None = None): - if is_numeric_dtype(self.dtype): - return map_array(self.to_numpy(), mapper, na_action=na_action) - else: - return super().map(mapper, na_action) + result = super().map(mapper, na_action) + return ArrowExtensionArray._from_sequence(result, dtype=result.dtype.type) @doc(ExtensionArray.duplicated) def duplicated( From 25c2b90ef26d9f5e7de1aed38ec1996365dc2c18 Mon Sep 17 00:00:00 2001 From: 
droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 12 Jun 2024 19:37:55 +0200 Subject: [PATCH 022/163] Tests results are verified as ExtensionArray --- pandas/tests/extension/test_arrow.py | 12 ++++-------- pandas/tests/extension/test_masked.py | 15 +++++++-------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 7d31fe6085c3a..e7615ad7abc1b 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -278,16 +278,12 @@ def test_compare_scalar(self, data, comparison_op): def test_map(self, data_missing, na_action): if data_missing.dtype.kind in "mM": result = data_missing.map(lambda x: x, na_action=na_action) - expected = data_missing.to_numpy(dtype=object) - tm.assert_numpy_array_equal(result, expected) + expected = data_missing + tm.assert_extension_array_equal(result, expected, check_dtype=False) else: result = data_missing.map(lambda x: x, na_action=na_action) - if data_missing.dtype == "float32[pyarrow]": - # map roundtrips through objects, which converts to float64 - expected = data_missing.to_numpy(dtype="float64", na_value=np.nan) - else: - expected = data_missing.to_numpy() - tm.assert_numpy_array_equal(result, expected) + expected = data_missing + tm.assert_extension_array_equal(result, expected, check_dtype=False) def test_astype_str(self, data, request): pa_dtype = data.dtype.pyarrow_dtype diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 431c9b30c1575..dc09fc9832631 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -17,7 +17,6 @@ import numpy as np import pytest -from pandas._libs import missing as libmissing from pandas.compat import ( IS64, is_platform_windows, @@ -172,17 +171,17 @@ class TestMaskedArrays(base.ExtensionTests): @pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map(self, data_missing, na_action): result = data_missing.map(lambda x: x, na_action=na_action) - expected = data_missing.astype(object, copy=True) - tm.assert_numpy_array_equal(result, expected) + expected = data_missing + tm.assert_extension_array_equal(result, expected, check_dtype=False) def test_map_na_action_ignore(self, data_missing_for_sorting): zero = data_missing_for_sorting[2] + na_value = data_missing_for_sorting.dtype.na_value result = data_missing_for_sorting.map(lambda x: zero, na_action="ignore") - if data_missing_for_sorting.dtype.na_value is libmissing.NA: - expected = np.array([False, pd.NA, False], dtype=object) - else: - expected = np.array([zero, np.nan, zero]) - tm.assert_numpy_array_equal(result, expected) + expected = type(data_missing_for_sorting)._from_sequence( + [zero, na_value, zero], dtype=data_missing_for_sorting.dtype + ) + tm.assert_extension_array_equal(result, expected, check_dtype=False) def _get_expected_exception(self, op_name, obj, other): try: From 23c48d24dcde7875587318ae65aae616cf122eef Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 12 Jun 2024 19:40:19 +0200 Subject: [PATCH 023/163] Tests was extended to Int64, Float64 and boolean --- pandas/tests/series/methods/test_map.py | 39 ++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 5e214735086b3..3f7db9d8bd9bb 100644 --- a/pandas/tests/series/methods/test_map.py +++ 
b/pandas/tests/series/methods/test_map.py @@ -261,10 +261,41 @@ def test_map_int(): assert not isna(merged["c"]) -def test_map_int_with_pd_na(): - s = Series([pd.NA, 11, 22, pd.NA], dtype="Int64") - result = s.map(lambda x: 5 if x is pd.NA else 2 * x) - expected = Series([5, 22, 44, 5]) +@pytest.mark.parametrize( + "ser", + [ + Series([pd.NA, 11], dtype="Int64"), + Series([pd.NA, 11.0], dtype="Float64"), + Series([pd.NA, True], dtype="boolean"), + ], +) +def test_map_with_pd_na_input(ser): + func_return_values_only = ( + lambda x: ser.dtype.type(1) if x is pd.NA else ser.dtype.type(2 * x) + ) + result = ser.map(func_return_values_only) + expected = Series( + [func_return_values_only(ser[0]), func_return_values_only(ser[1])], + dtype=ser.dtype, + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "ser", + [ + Series([pd.NA, 11], dtype="Int64"), + Series([pd.NA, 11.0], dtype="Float64"), + Series([pd.NA, True], dtype="boolean"), + ], +) +def test_map_with_pd_na_output(ser): + func_return_value_and_na = lambda x: x if x is pd.NA else ser.dtype.type(2 * x) + result = ser.map(func_return_value_and_na) + expected = Series( + [func_return_value_and_na(ser[0]), func_return_value_and_na(ser[1])], + dtype=ser.dtype, + ) tm.assert_series_equal(result, expected) From a206c9486f4f4e4527829f9b3f0c41033b7eaeac Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 12 Jun 2024 20:30:49 +0200 Subject: [PATCH 024/163] convert to nullable dtype only if there are nullable value --- pandas/core/algorithms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6e56c0a254825..0dd847a6521ba 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1704,9 +1704,9 @@ def map_array( mapper, mask=mask, na_value=na_value, - convert_to_nullable_dtype=True, + convert_to_nullable_dtype=na_value is not None, ) else: return lib.map_infer_mask( - values, mapper, mask=mask, convert_to_nullable_dtype=True + values, mapper, mask=mask, convert_to_nullable_dtype=na_value is not None ) From b36b5813bb725df63b7785a14493c1b740cbabc3 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 19 Jun 2024 21:05:55 +0200 Subject: [PATCH 025/163] Manage date and time dtype pyarrow as object --- pandas/core/algorithms.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0dd847a6521ba..8ec9f8cb0b3f4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1684,6 +1684,7 @@ def map_array( na_value = None mask = isna(arr) + storage = None if isinstance(arr.dtype, BaseMaskedDtype) and na_action is None: arr = cast("BaseMaskedArray", arr) values = arr._data @@ -1691,7 +1692,15 @@ def map_array( na_value = arr.dtype.na_value elif isinstance(arr.dtype, ExtensionDtype) and na_action is None: arr = cast("ExtensionArray", arr) - values = np.asarray(arr) + arr_dtype = arr.dtype.__repr__() + if "pyarrow" in arr_dtype: + if "date" in arr_dtype or "time" in arr_dtype: + values = arr.astype(object, copy=False) + else: + values = arr._pa_array.to_numpy() + storage = "pyarrow" + else: + values = np.asarray(arr) if arr._hasna: na_value = arr.dtype.na_value else: @@ -1705,6 +1714,7 @@ def map_array( mask=mask, na_value=na_value, convert_to_nullable_dtype=na_value is not None, + storage=storage, ) else: return lib.map_infer_mask( From 
637570132acaa98292d2644834c745726906cab9 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 19 Jun 2024 21:35:59 +0200 Subject: [PATCH 026/163] Manage pyarrow string --- pandas/core/arrays/arrow/array.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index cf731fdad8a07..c01c6622252e4 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -338,7 +338,12 @@ def _from_sequence_of_strings( elif pa.types.is_date(pa_type): from pandas.core.tools.datetimes import to_datetime - scalars = to_datetime(strings, errors="raise").date + if isinstance(strings, ExtensionArray) and isinstance( + strings.dtype, ArrowDtype + ): + scalars = to_datetime(strings._pa_array, errors="raise").date + else: + scalars = to_datetime(strings, errors="raise").date elif pa.types.is_duration(pa_type): from pandas.core.tools.timedeltas import to_timedelta @@ -1424,7 +1429,10 @@ def to_numpy( def map(self, mapper, na_action: Literal["ignore"] | None = None): result = super().map(mapper, na_action) - return ArrowExtensionArray._from_sequence(result, dtype=result.dtype.type) + if isinstance(result.dtype, StringDtype): + return result + else: + return ArrowExtensionArray._from_sequence(result, dtype=result.dtype.type) @doc(ExtensionArray.duplicated) def duplicated( From eda77021294693e0ada4be6a12ad612f2109566d Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 19 Jun 2024 22:06:59 +0200 Subject: [PATCH 027/163] Manage pyarrow string --- pandas/_libs/lib.pyi | 5 +++++ pandas/_libs/lib.pyx | 22 ++++++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 909ac8100b75f..874b565a69b1a 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -77,6 +77,7 @@ def map_infer( mask: npt.NDArray[np.bool_] | None = ..., na_value: Any = ..., convert_to_nullable_dtype: Literal[False] = ..., + storage: str | None = ..., ) -> np.ndarray: ... @overload def map_infer( @@ -88,6 +89,7 @@ def map_infer( mask: npt.NDArray[np.bool_] | None = ..., na_value: Any = ..., convert_to_nullable_dtype: Literal[False] = ..., + storage: str | None = ..., ) -> ArrayLike: ... @overload def maybe_convert_objects( @@ -99,6 +101,7 @@ def maybe_convert_objects( convert_non_numeric: Literal[False] = ..., convert_to_nullable_dtype: Literal[False] = ..., dtype_if_all_nat: DtypeObj | None = ..., + storage: str | None = ..., ) -> npt.NDArray[np.object_ | np.number]: ... @overload def maybe_convert_objects( @@ -110,6 +113,7 @@ def maybe_convert_objects( convert_non_numeric: bool = ..., convert_to_nullable_dtype: Literal[True] = ..., dtype_if_all_nat: DtypeObj | None = ..., + storage: str | None = ..., ) -> ArrayLike: ... @overload def maybe_convert_objects( @@ -121,6 +125,7 @@ def maybe_convert_objects( convert_non_numeric: bool = ..., convert_to_nullable_dtype: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., + storage: str | None = ..., ) -> ArrayLike: ... @overload def maybe_convert_numeric( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index cd4d14dac20c4..36f40587b9a52 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2454,7 +2454,8 @@ def maybe_convert_objects(ndarray[object] objects, bint convert_numeric=True, # NB: different default! 
bint convert_to_nullable_dtype=False, bint convert_non_numeric=False, - object dtype_if_all_nat=None) -> "ArrayLike": + object dtype_if_all_nat=None, + str storage=None) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2477,6 +2478,8 @@ def maybe_convert_objects(ndarray[object] objects, Whether to convert datetime, timedelta, period, interval types. dtype_if_all_nat : np.dtype, ExtensionDtype, or None, default None Dtype to cast to if we have all-NaT. + storage : {"pyarrow", "pyarrow_numpy"}, default "pyarrow_numpy" + Backend storage Returns ------- @@ -2495,6 +2498,9 @@ def maybe_convert_objects(ndarray[object] objects, object val float64_t fnan = NaN + if storage is None: + storage="pyarrow_numpy" + if dtype_if_all_nat is not None: # in practice we don't expect to ever pass dtype_if_all_nat # without both convert_non_numeric, so disallow @@ -2699,10 +2705,13 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True): + if ( + (using_pyarrow_string_dtype() or storage == "pyarrow") + and is_string_array(objects, skipna=True) + ): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage=storage) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) elif convert_to_nullable_dtype and is_string_array(objects, skipna=True): @@ -2937,6 +2946,7 @@ def map_infer( const uint8_t[:] mask=None, object na_value=None, bint convert_to_nullable_dtype=False, + str storage=None, ) -> "ArrayLike": """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2955,6 +2965,8 @@ def map_infer( convert_to_nullable_dtype : bool, default False If an array-like object contains only integer or boolean values (and NaN) is encountered, whether to convert and return an Boolean/IntegerArray. 
+ storage : {"pyarrow", "pyarrow_numpy"}, default "pyarrow_numpy" + Backend storage Returns ------- @@ -2985,7 +2997,9 @@ def map_infer( if convert: return maybe_convert_objects( result, - convert_to_nullable_dtype=convert_to_nullable_dtype + convert_to_nullable_dtype=convert_to_nullable_dtype, + convert_non_numeric=True, + storage=storage, ) else: return result From 9ab66021b21404706144fd1fc1c113e08d84cf19 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 19 Jun 2024 22:09:02 +0200 Subject: [PATCH 028/163] Manage BasedMaskedArray --- pandas/core/arrays/masked.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 483ddcdccf274..2fcb5db07327f 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1319,7 +1319,8 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) def map(self, mapper, na_action: Literal["ignore"] | None = None): - return map_array(self, mapper, na_action=na_action) + result = map_array(self, mapper, na_action=na_action) + return type(self)._from_sequence(result) @overload def any( From 0da8920bc0068f4f4369e4acc3a57afc2393ce1e Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 19 Jun 2024 22:10:10 +0200 Subject: [PATCH 029/163] Test directly ExtensionArray --- pandas/tests/extension/base/methods.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index b951d4c35d208..829d16a99ac42 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -100,8 +100,8 @@ def test_apply_simple_series(self, data): @pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map(self, data_missing, na_action): result = data_missing.map(lambda x: x, na_action=na_action) - expected = data_missing.to_numpy() - tm.assert_numpy_array_equal(result, expected) + expected = data_missing + tm.assert_extension_array_equal(result, expected) def test_argsort(self, data_for_sorting): result = pd.Series(data_for_sorting).argsort() From e8bce2960e6a8a4fbe9dcd927aa98e90ade65012 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 19 Jun 2024 22:12:07 +0200 Subject: [PATCH 030/163] pyarrow data keep their original type if possible --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 4b37b65d5404c..5f8c833308d5b 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3505,5 +3505,5 @@ def test_cast_dictionary_different_value_dtype(arrow_type): def test_map_numeric_na_action(): ser = pd.Series([32, 40, None], dtype="int64[pyarrow]") result = ser.map(lambda x: 42, na_action="ignore") - expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") + expected = pd.Series([42.0, 42.0, np.nan], dtype=ser.dtype) tm.assert_series_equal(result, expected) From 14e89737bdd5a42ab4d1dbd24e1b9baff12423bc Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 19 Jun 2024 22:14:21 +0200 Subject: [PATCH 031/163] if map return only pd.NA values their type is double pyarrow --- 
pandas/tests/series/methods/test_map.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 3f7db9d8bd9bb..caf5e180a6cda 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -236,7 +236,15 @@ def test_map_empty(request, index): s = Series(index) result = s.map({}) - expected = Series(np.nan, index=s.index) + na_value = np.nan + dtype = "float64" + + # In pyarrow double is the equivalent of float64 + # Cf: https://arrow.apache.org/docs/python/pandas.html#pandas-arrow-conversion + if "pyarrow" in s.dtype.__repr__(): + dtype = "double[pyarrow]" + na_value = pd.NA + expected = Series(na_value, index=s.index, dtype=dtype) tm.assert_series_equal(result, expected) @@ -287,6 +295,8 @@ def test_map_with_pd_na_input(ser): Series([pd.NA, 11], dtype="Int64"), Series([pd.NA, 11.0], dtype="Float64"), Series([pd.NA, True], dtype="boolean"), + Series([pd.NA, "AAA"], dtype="string"), + Series([pd.NA, "AAA"], dtype="string[pyarrow]"), ], ) def test_map_with_pd_na_output(ser): From 5e3ad280df04f452132d7382e31d03777f5c287e Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 23 Jun 2024 20:34:48 +0200 Subject: [PATCH 032/163] Add storage to map_infer_mask --- pandas/_libs/lib.pyi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 874b565a69b1a..27a40c62cdecb 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -190,6 +190,7 @@ def map_infer_mask( na_value: Any = ..., dtype: np.dtype = ..., convert_to_nullable_dtype: bool = ..., + storage: str | None = ..., ) -> np.ndarray: ... @overload def map_infer_mask( @@ -201,6 +202,7 @@ def map_infer_mask( na_value: Any = ..., dtype: np.dtype = ..., convert_to_nullable_dtype: bool = ..., + storage: str | None = ..., ) -> ArrayLike: ... def indices_fast( index: npt.NDArray[np.intp], From c9dd068641b6c85c1d85911c24516924e005a1e0 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 23 Jun 2024 20:35:28 +0200 Subject: [PATCH 033/163] Add storage to map_infer_mask --- pandas/_libs/lib.pyx | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 36f40587b9a52..11ebd01e3740e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2874,7 +2874,8 @@ def map_infer_mask( bint convert=True, object na_value=no_default, bint convert_to_nullable_dtype=False, - cnp.dtype dtype=np.dtype(object) + cnp.dtype dtype=np.dtype(object), + str storage=None, ) -> "ArrayLike": """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2892,6 +2893,8 @@ def map_infer_mask( input value is used. dtype : numpy.dtype The numpy dtype to use for the result ndarray. 
+ storage : {"pyarrow", "pyarrow_numpy"}, default "pyarrow_numpy" + Backend storage Returns ------- @@ -2929,7 +2932,8 @@ def map_infer_mask( if convert: return maybe_convert_objects( result, - convert_to_nullable_dtype=convert_to_nullable_dtype + convert_to_nullable_dtype=convert_to_nullable_dtype, + storage=storage, ) else: return result From d1b6a281d703bc2698efb61a40880ad273159406 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 23 Jun 2024 20:36:37 +0200 Subject: [PATCH 034/163] Add empty dict as NA value for JSONArray extension --- pandas/_libs/missing.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 2f44128cda822..58843356f753e 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -148,6 +148,7 @@ cpdef bint checknull(object val): - np.timedelta64 representation of NaT - NA - Decimal("NaN") + - {} empty dict Parameters ---------- @@ -157,7 +158,7 @@ cpdef bint checknull(object val): ------- bool """ - if val is None or val is NaT or val is C_NA: + if val is None or val is NaT or val is C_NA or val == {}: return True elif util.is_float_object(val) or util.is_complex_object(val): if val != val: @@ -191,6 +192,7 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr): - np.timedelta64 representation of NaT - NA - Decimal("NaN") + - {} empty dict Parameters ---------- From 3ccc4fd7662626b5d58ae89ebb68d5030e0105bd Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 23 Jun 2024 20:50:41 +0200 Subject: [PATCH 035/163] Add storage parameter to map_infer_mask --- pandas/core/algorithms.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8ec9f8cb0b3f4..e0ca2a38c3fc8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -23,6 +23,7 @@ iNaT, lib, ) +from pandas._libs.missing import NA from pandas._typing import ( AnyArrayLike, ArrayLike, @@ -1685,16 +1686,18 @@ def map_array( na_value = None mask = isna(arr) storage = None - if isinstance(arr.dtype, BaseMaskedDtype) and na_action is None: + if isinstance(arr.dtype, BaseMaskedDtype): arr = cast("BaseMaskedArray", arr) values = arr._data if arr._hasna: na_value = arr.dtype.na_value - elif isinstance(arr.dtype, ExtensionDtype) and na_action is None: + elif isinstance(arr.dtype, ExtensionDtype): arr = cast("ExtensionArray", arr) arr_dtype = arr.dtype.__repr__() if "pyarrow" in arr_dtype: - if "date" in arr_dtype or "time" in arr_dtype: + if any( + time_type in arr_dtype for time_type in ["date", "time", "duration"] + ): values = arr.astype(object, copy=False) else: values = arr._pa_array.to_numpy() @@ -1713,10 +1716,14 @@ def map_array( mapper, mask=mask, na_value=na_value, - convert_to_nullable_dtype=na_value is not None, + convert_to_nullable_dtype=na_value is NA, storage=storage, ) else: return lib.map_infer_mask( - values, mapper, mask=mask, convert_to_nullable_dtype=na_value is not None + values, + mapper, + mask=mask, + convert_to_nullable_dtype=na_value is NA, + storage=storage, ) From 996d99a8588aad5769ca2de8db6c3d51ef6ccf1b Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 23 Jun 2024 20:52:34 +0200 Subject: [PATCH 036/163] Cast result to an extension array --- pandas/core/arrays/arrow/array.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 8e7a0dfafaf06..97b7d164802c9 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -62,6 +62,7 @@ from pandas.core.arrays.masked import BaseMaskedArray from pandas.core.arrays.string_ import StringDtype import pandas.core.common as com +from pandas.core.construction import array as pd_array from pandas.core.indexers import ( check_array_indexer, unpack_tuple_and_ellipses, @@ -1431,7 +1432,7 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): if isinstance(result.dtype, StringDtype): return result else: - return ArrowExtensionArray._from_sequence(result, dtype=result.dtype.type) + return pd_array(result, dtype=result.dtype) @doc(ExtensionArray.duplicated) def duplicated( From a60b23ab96e32b9d734acdcb4791879e9633f8de Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 23 Jun 2024 20:54:21 +0200 Subject: [PATCH 037/163] Cast result to a NumpyExtensionArray an extension array --- pandas/core/arrays/base.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f83fdcd46b371..3a767ff1c1b64 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -47,7 +47,10 @@ is_scalar, pandas_dtype, ) -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + ExtensionDtype, + NumpyEADtype, +) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCIndex, @@ -70,6 +73,7 @@ unique, ) from pandas.core.array_algos.quantile import quantile_with_mask +from pandas.core.construction import array as pd_array from pandas.core.missing import _fill_limit_area_1d from pandas.core.sorting import ( nargminmax, @@ -2336,7 +2340,13 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): If the function returns a tuple with more than one element a MultiIndex will be returned. 
""" - return map_array(self, mapper, na_action=na_action) + result = map_array(self, mapper, na_action=na_action) + if isinstance(self.dtype, NumpyEADtype): + return pd_array(result, dtype=NumpyEADtype(result.dtype)) + elif isinstance(result, np.ndarray): + return pd_array(result) + else: + return result # ------------------------------------------------------------------------ # GroupBy Methods From 505bdecc30a1dd4078c8bc27b4edc9d24c1fa795 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 23 Jun 2024 20:55:07 +0200 Subject: [PATCH 038/163] Cast result to an extension array --- pandas/core/arrays/masked.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 2fcb5db07327f..d3ee05c6b4611 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1320,7 +1320,7 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): def map(self, mapper, na_action: Literal["ignore"] | None = None): result = map_array(self, mapper, na_action=na_action) - return type(self)._from_sequence(result) + return pd_array(result, dtype=result.dtype) @overload def any( From 17f46c2670efcad3fe88f3bf3b107bf4c5a9c92c Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 23 Jun 2024 20:55:42 +0200 Subject: [PATCH 039/163] Remove dtype test --- pandas/tests/extension/base/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 829d16a99ac42..c4f641b13913a 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -101,7 +101,7 @@ def test_apply_simple_series(self, data): def test_map(self, data_missing, na_action): result = data_missing.map(lambda x: x, na_action=na_action) expected = data_missing - tm.assert_extension_array_equal(result, expected) + tm.assert_extension_array_equal(result, expected, check_dtype=False) def test_argsort(self, data_for_sorting): result = pd.Series(data_for_sorting).argsort() From fa9a2f2cf45fc18c8c70395fb310dd20120254ae Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 24 Jun 2024 18:23:57 +0200 Subject: [PATCH 040/163] Take into account UserDict in checknull --- pandas/_libs/missing.pyx | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 58843356f753e..7369037387b9c 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -1,3 +1,4 @@ +from collections import UserDict from decimal import Decimal import numbers from sys import maxsize @@ -148,7 +149,7 @@ cpdef bint checknull(object val): - np.timedelta64 representation of NaT - NA - Decimal("NaN") - - {} empty dict + - {} empty dict or UserDict Parameters ---------- @@ -158,7 +159,12 @@ cpdef bint checknull(object val): ------- bool """ - if val is None or val is NaT or val is C_NA or val == {}: + if ( + val is None + or val is NaT + or val is C_NA + or (isinstance(val, (dict, UserDict)) and not val) + ): return True elif util.is_float_object(val) or util.is_complex_object(val): if val != val: @@ -192,7 +198,7 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr): - np.timedelta64 representation of NaT - NA - Decimal("NaN") - - {} empty dict + - {} empty dict or UserDict Parameters ---------- From 
92ed4efeaa12d72987e83f14fa92280e177ca11a Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 24 Jun 2024 18:24:29 +0200 Subject: [PATCH 041/163] Take na_value into account in map_infer_mask --- pandas/core/algorithms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e0ca2a38c3fc8..d96ef0384eeaa 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1724,6 +1724,7 @@ def map_array( values, mapper, mask=mask, + na_value=na_value, convert_to_nullable_dtype=na_value is NA, storage=storage, ) From ff28d74c32729a8b0c0565f5ee46b5f09e329d24 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 27 Jun 2024 15:01:57 +0200 Subject: [PATCH 042/163] Manage IntervalDtype --- pandas/core/arrays/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 57d5523b83be8..6159f11688244 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -48,6 +48,7 @@ ) from pandas.core.dtypes.dtypes import ( ExtensionDtype, + IntervalDtype, NumpyEADtype, ) from pandas.core.dtypes.generic import ( @@ -2343,6 +2344,8 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): result = map_array(self, mapper, na_action=na_action) if isinstance(self.dtype, NumpyEADtype): return pd_array(result, dtype=NumpyEADtype(result.dtype)) + if isinstance(self.dtype, IntervalDtype): + return result elif isinstance(result, np.ndarray): return pd_array(result) else: From ee088d4b4626ea2eea4ab8b0561bf1b337f268a9 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 27 Jun 2024 16:05:33 +0200 Subject: [PATCH 043/163] Manage ArrowDtype int64 --- pandas/_libs/lib.pyx | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 11ebd01e3740e..c33d8376d50ac 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2824,12 +2824,19 @@ def maybe_convert_objects(ndarray[object] objects, # TODO: do these after the itemsize check?
if (result is ints or result is uints) and convert_to_nullable_dtype: - from pandas.core.arrays import IntegerArray + if storage == "pyarrow": + from pandas.core.dtypes.dtypes import ArrowDtype + + dtype = ArrowDtype(pa.int64()) + return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + else: + from pandas.core.arrays import IntegerArray + + # Set these values to 1 to be deterministic, match + # IntegerDtype._internal_fill_value + result[mask] = 1 + result = IntegerArray(result, mask) - # Set these values to 1 to be deterministic, match - # IntegerDtype._internal_fill_value - result[mask] = 1 - result = IntegerArray(result, mask) elif result is floats and convert_to_nullable_dtype: from pandas.core.arrays import FloatingArray From 0067edfad23a83477bcc8e0b657df2f5253dd173 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 27 Jun 2024 23:55:05 +0200 Subject: [PATCH 044/163] Correct error in empty mapper management --- pandas/core/algorithms.py | 7 +++++-- pandas/tests/series/methods/test_map.py | 15 ++++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d96ef0384eeaa..8dae1b2166697 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1665,7 +1665,10 @@ def map_array( from pandas import Series if len(mapper) == 0: - mapper = Series(mapper, dtype=np.float64) + if is_extension_array_dtype(arr.dtype) and arr.dtype.na_value is NA: + mapper = Series(mapper, dtype=arr.dtype) + else: + mapper = Series(mapper, dtype=np.float64) else: mapper = Series(mapper) @@ -1683,7 +1686,7 @@ def map_array( if not len(arr): return arr.copy() - na_value = None + na_value = np.nan mask = isna(arr) storage = None if isinstance(arr.dtype, BaseMaskedDtype): diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index caf5e180a6cda..978b7cfadf341 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_extension_array_dtype + import pandas as pd from pandas import ( DataFrame, @@ -236,14 +238,13 @@ def test_map_empty(request, index): s = Series(index) result = s.map({}) - na_value = np.nan - dtype = "float64" + if is_extension_array_dtype(s.dtype) and s.dtype.na_value is pd.NA: + na_value = s.dtype.na_value + dtype = s.dtype + else: + na_value = np.nan + dtype = "float64" - # In pyarrow double is the equivalent of float64 - # Cf: https://arrow.apache.org/docs/python/pandas.html#pandas-arrow-conversion - if "pyarrow" in s.dtype.__repr__(): - dtype = "double[pyarrow]" - na_value = pd.NA expected = Series(na_value, index=s.index, dtype=dtype) tm.assert_series_equal(result, expected) From e94997a076e7ab8a4eea47b71f46781d2a5eeb6f Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Tue, 2 Jul 2024 21:47:12 +0200 Subject: [PATCH 045/163] Manage IntervalDtype --- pandas/core/algorithms.py | 7 ++++++- pandas/core/arrays/base.py | 12 ++++++------ pandas/tests/series/methods/test_map.py | 8 ++++++-- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8dae1b2166697..0bbeb7b4771ed 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -62,6 +62,7 @@ BaseMaskedDtype, CategoricalDtype, ExtensionDtype, + IntervalDtype, NumpyEADtype, ) from 
pandas.core.dtypes.generic import ( @@ -1665,7 +1666,11 @@ def map_array( from pandas import Series if len(mapper) == 0: - if is_extension_array_dtype(arr.dtype) and arr.dtype.na_value is NA: + if ( + is_extension_array_dtype(arr.dtype) + and not isinstance(arr.dtype, IntervalDtype) + and arr.dtype.na_value is NA + ): mapper = Series(mapper, dtype=arr.dtype) else: mapper = Series(mapper, dtype=np.float64) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 6159f11688244..a6a868e8f71f1 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -48,7 +48,6 @@ ) from pandas.core.dtypes.dtypes import ( ExtensionDtype, - IntervalDtype, NumpyEADtype, ) from pandas.core.dtypes.generic import ( @@ -2342,12 +2341,13 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): a MultiIndex will be returned. """ result = map_array(self, mapper, na_action=na_action) - if isinstance(self.dtype, NumpyEADtype): - return pd_array(result, dtype=NumpyEADtype(result.dtype)) - if isinstance(self.dtype, IntervalDtype): - return result + if isinstance(result, ExtensionArray): + if isinstance(self.dtype, NumpyEADtype): + return pd_array(result, dtype=NumpyEADtype(result.dtype)) + else: + return result elif isinstance(result, np.ndarray): - return pd_array(result) + return pd_array(result, result.dtype) else: return result diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 978b7cfadf341..7909a6a4eff7f 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -9,6 +9,7 @@ import pytest from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.dtypes import IntervalDtype import pandas as pd from pandas import ( @@ -237,8 +238,11 @@ def test_map_empty(request, index): s = Series(index) result = s.map({}) - - if is_extension_array_dtype(s.dtype) and s.dtype.na_value is pd.NA: + if ( + is_extension_array_dtype(s.dtype) + and not isinstance(s.dtype, IntervalDtype) + and s.dtype.na_value is pd.NA + ): na_value = s.dtype.na_value dtype = s.dtype else: From f93dc668698057977d8938679f1827797e1392c8 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 14 Jul 2024 18:00:06 +0200 Subject: [PATCH 046/163] Try to manage date with pyarrow --- pandas/_libs/lib.pyx | 74 ++++++++++++++++++++++------ pandas/core/algorithms.py | 6 +-- pandas/tests/extension/test_arrow.py | 2 +- 3 files changed, 61 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c33d8376d50ac..eac8abe1251a9 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1300,6 +1300,7 @@ cdef class Seen: bint object_ # seen_object bint complex_ # seen_complex bint datetime_ # seen_datetime + bint date_ # seen_date bint coerce_numeric # coerce data to numeric bint timedelta_ # seen_timedelta bint datetimetz_ # seen_datetimetz @@ -1328,6 +1329,7 @@ cdef class Seen: self.object_ = False self.complex_ = False self.datetime_ = False + self.date_ = False self.timedelta_ = False self.datetimetz_ = False self.period_ = False @@ -2613,6 +2615,13 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break + elif PyDate_Check(val): + if convert_non_numeric: + seen.date_ = True + break + else: + seen.object_ = True + break elif is_period_object(val): if convert_non_numeric: seen.period_ = True @@ -2656,21 +2665,46 @@ def maybe_convert_objects(ndarray[object] objects, # we try to coerce 
datetime w/tz but must all have the same tz if seen.datetimetz_: - if is_datetime_with_singletz_array(objects): - from pandas import DatetimeIndex + if storage == "pyarrow": + from pandas.core.dtypes.dtypes import ArrowDtype - try: - dti = DatetimeIndex(objects) - except OutOfBoundsDatetime: - # e.g. test_to_datetime_cache_coerce_50_lines_outofbounds - pass + if isinstance(val, datetime): + objects[mask] = None else: - # unbox to DatetimeArray - return dti._data - seen.object_ = True + objects[mask] = np.datetime64("NaT") + datetime64_array = objects.astype(val.dtype) + pa_array = pa.array(datetime64_array) + dtype = ArrowDtype(pa_array.type) + return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) + + else: + if is_datetime_with_singletz_array(objects): + from pandas import DatetimeIndex + + try: + dti = DatetimeIndex(objects) + except OutOfBoundsDatetime: + # e.g. test_to_datetime_cache_coerce_50_lines_outofbounds + pass + else: + # unbox to DatetimeArray + return dti._data + seen.object_ = True elif seen.datetime_: - if is_datetime_or_datetime64_array(objects): + if storage == "pyarrow": + from pandas.core.dtypes.dtypes import ArrowDtype + + if isinstance(val, datetime): + objects[mask] = None + else: + objects[mask] = np.datetime64("NaT") + datetime64_array = objects.astype(val.dtype) + pa_array = pa.array(datetime64_array) + dtype = ArrowDtype(pa_array.type) + return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) + + elif is_datetime_or_datetime64_array(objects): from pandas import DatetimeIndex try: @@ -2682,6 +2716,16 @@ def maybe_convert_objects(ndarray[object] objects, return dti._data._ndarray seen.object_ = True + elif seen.date_: + if storage == "pyarrow": + + from pandas.core.dtypes.dtypes import ArrowDtype + + objects[mask] = None + pa_array = pa.array(objects) + dtype = ArrowDtype(pa_array.type) + return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) + elif seen.timedelta_: if is_timedelta_or_timedelta64_array(objects): from pandas import TimedeltaIndex @@ -2914,17 +2958,16 @@ def map_infer_mask( ndarray result = np.empty(n, dtype=dtype) - flatiter arr_it = PyArray_IterNew(arr) flatiter result_it = PyArray_IterNew(result) for i in range(n): if mask[i]: if na_value is no_default: - val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it)) + val = arr[i] else: val = na_value else: - val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it)) + val = arr[i] val = f(val) if cnp.PyArray_IsZeroDim(val): @@ -2932,14 +2975,13 @@ def map_infer_mask( val = val.item() PyArray_SETITEM(result, PyArray_ITER_DATA(result_it), val) - - PyArray_ITER_NEXT(arr_it) PyArray_ITER_NEXT(result_it) if convert: return maybe_convert_objects( result, convert_to_nullable_dtype=convert_to_nullable_dtype, + convert_non_numeric=True, storage=storage, ) else: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0bbeb7b4771ed..579f65bf6e8bd 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1703,12 +1703,10 @@ def map_array( arr = cast("ExtensionArray", arr) arr_dtype = arr.dtype.__repr__() if "pyarrow" in arr_dtype: - if any( - time_type in arr_dtype for time_type in ["date", "time", "duration"] - ): + if any(time_type in arr_dtype for time_type in ["duration"]): values = arr.astype(object, copy=False) else: - values = arr._pa_array.to_numpy() + values = np.asarray(arr) storage = "pyarrow" else: values = np.asarray(arr) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 
7b58782a56d4b..0b60a7e78b9fd 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -279,7 +279,7 @@ def test_map(self, data_missing, na_action): if data_missing.dtype.kind in "mM": result = data_missing.map(lambda x: x, na_action=na_action) expected = data_missing - tm.assert_extension_array_equal(result, expected, check_dtype=False) + tm.assert_extension_array_equal(result, expected) else: result = data_missing.map(lambda x: x, na_action=na_action) expected = data_missing From 963f99a64a18f6dc63233fe7aede0ac2e5bbd4bc Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 25 Jul 2024 20:39:37 +0000 Subject: [PATCH 047/163] Manage timedelta, datetimetz and date --- pandas/_libs/lib.pyx | 49 +++++++++++++++++++++++++++++---------- pandas/core/algorithms.py | 8 +++---- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index eac8abe1251a9..85da215349eef 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -96,6 +96,9 @@ from pandas._libs.missing cimport ( is_null_datetime64, is_null_timedelta64, ) +from pandas._libs.tslibs.timestamps import Timestamp +from pandas._libs.tslibs.timedeltas import Timedelta + from pandas._libs.tslibs.conversion cimport convert_to_tsobject from pandas._libs.tslibs.nattype cimport ( NPY_NAT, @@ -2615,7 +2618,10 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break - elif PyDate_Check(val): + elif ( + PyDate_Check(val) + or (pa is not None and isinstance(val, (pa.Date32Scalar, pa.Date64Scalar))) + ): if convert_non_numeric: seen.date_ = True break @@ -2668,12 +2674,16 @@ def maybe_convert_objects(ndarray[object] objects, if storage == "pyarrow": from pandas.core.dtypes.dtypes import ArrowDtype + datetime64_array = None if isinstance(val, datetime): objects[mask] = None + datetime64_array = objects.astype(Timestamp) else: objects[mask] = np.datetime64("NaT") - datetime64_array = objects.astype(val.dtype) - pa_array = pa.array(datetime64_array) + datetime64_array = objects.astype(val.dtype) + pa_array = pa.array(datetime64_array).cast( + pa.timestamp(val.resolution.unit, val.tzinfo) + ) dtype = ArrowDtype(pa_array.type) return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) @@ -2727,17 +2737,32 @@ def maybe_convert_objects(ndarray[object] objects, return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) elif seen.timedelta_: - if is_timedelta_or_timedelta64_array(objects): - from pandas import TimedeltaIndex + if storage == "pyarrow": + from pandas.core.dtypes.dtypes import ArrowDtype - try: - tdi = TimedeltaIndex(objects) - except OutOfBoundsTimedelta: - pass + timedelta64_array = None + if isinstance(val, timedelta): + objects[mask] = None + timedelta64_array = objects.astype(Timedelta) else: - # unbox to ndarray[timedelta64[ns]] - return tdi._data._ndarray - seen.object_ = True + objects[mask] = np.timedelta64("NaT") + timedelta64_array = objects.astype(val.dtype) + pa_array = pa.array(timedelta64_array) + + dtype = ArrowDtype(pa_array.type) + return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) + else: + if is_timedelta_or_timedelta64_array(objects): + from pandas import TimedeltaIndex + + try: + tdi = TimedeltaIndex(objects) + except OutOfBoundsTimedelta: + pass + else: + # unbox to ndarray[timedelta64[ns]] + return tdi._data._ndarray + seen.object_ = True elif seen.period_: if is_period_array(objects): diff --git 
a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 579f65bf6e8bd..824af26f7a2ad 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1703,13 +1703,11 @@ def map_array( arr = cast("ExtensionArray", arr) arr_dtype = arr.dtype.__repr__() if "pyarrow" in arr_dtype: - if any(time_type in arr_dtype for time_type in ["duration"]): - values = arr.astype(object, copy=False) + storage = "pyarrow" + if "date" in arr_dtype: + values = np.fromiter(arr._pa_array, dtype='O') else: values = np.asarray(arr) - storage = "pyarrow" - else: - values = np.asarray(arr) if arr._hasna: na_value = arr.dtype.na_value else: From f8deed67e1306594a5912ea10dfd460228874c0f Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 25 Jul 2024 20:57:46 +0000 Subject: [PATCH 048/163] pylint fix --- pandas/_libs/lib.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 85da215349eef..1bd6ae8221ecb 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -96,8 +96,9 @@ from pandas._libs.missing cimport ( is_null_datetime64, is_null_timedelta64, ) -from pandas._libs.tslibs.timestamps import Timestamp + from pandas._libs.tslibs.timedeltas import Timedelta +from pandas._libs.tslibs.timestamps import Timestamp from pandas._libs.tslibs.conversion cimport convert_to_tsobject from pandas._libs.tslibs.nattype cimport ( From b90af02986f1bf48da0eae7da627cb12f70d994d Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 25 Jul 2024 21:04:14 +0000 Subject: [PATCH 049/163] Code simplification --- pandas/tests/series/methods/test_map.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 7909a6a4eff7f..5ed2aeabc95a7 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -301,7 +301,6 @@ def test_map_with_pd_na_input(ser): Series([pd.NA, 11.0], dtype="Float64"), Series([pd.NA, True], dtype="boolean"), Series([pd.NA, "AAA"], dtype="string"), - Series([pd.NA, "AAA"], dtype="string[pyarrow]"), ], ) def test_map_with_pd_na_output(ser): From 26a6fb79fb477e5382c0be6439d547c4a730012c Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 25 Jul 2024 21:15:27 +0000 Subject: [PATCH 050/163] Correct values initialization problem --- pandas/core/algorithms.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 36ddb99bea031..4c199d8f604b6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1720,9 +1720,11 @@ def map_array( if "pyarrow" in arr_dtype: storage = "pyarrow" if "date" in arr_dtype: - values = np.fromiter(arr._pa_array, dtype='O') + values = np.fromiter(arr._pa_array, dtype="O") else: values = np.asarray(arr) + else: + values = np.asarray(arr) if arr._hasna: na_value = arr.dtype.na_value else: From fa46a96e9c7243356d58caf55372e9e7fe33d95d Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 26 Jul 2024 17:52:56 +0000 Subject: [PATCH 051/163] Manage pyarrow and python storage --- pandas/_libs/lib.pyx | 19 ++++++++----------- pandas/core/algorithms.py | 5 ++++- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1bd6ae8221ecb..0518e32f4f226 100644 --- 
a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2484,7 +2484,7 @@ def maybe_convert_objects(ndarray[object] objects, Whether to convert datetime, timedelta, period, interval types. dtype_if_all_nat : np.dtype, ExtensionDtype, or None, default None Dtype to cast to if we have all-NaT. - storage : {"pyarrow", "pyarrow_numpy"}, default "pyarrow_numpy" + storage : {"python", "pyarrow", "pyarrow_numpy"}, default "python" Backend storage Returns @@ -2505,7 +2505,7 @@ def maybe_convert_objects(ndarray[object] objects, float64_t fnan = NaN if storage is None: - storage="pyarrow_numpy" + storage="python" if dtype_if_all_nat is not None: # in practice we don't expect to ever pass dtype_if_all_nat @@ -2775,19 +2775,16 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if ( - (using_pyarrow_string_dtype() or storage == "pyarrow") - and is_string_array(objects, skipna=True) - ): + if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage=storage) + dtype = StringDtype(storage="pyarrow_numpy") return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - elif convert_to_nullable_dtype and is_string_array(objects, skipna=True): + elif storage == "pyarrow" or storage == "python": from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype() + dtype = StringDtype(storage=storage) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True @@ -2970,7 +2967,7 @@ def map_infer_mask( input value is used. dtype : numpy.dtype The numpy dtype to use for the result ndarray. - storage : {"pyarrow", "pyarrow_numpy"}, default "pyarrow_numpy" + storage : {"python", "pyarrow", "pyarrow_numpy"}, default "python" Backend storage Returns @@ -3044,7 +3041,7 @@ def map_infer( convert_to_nullable_dtype : bool, default False If an array-like object contains only integer or boolean values (and NaN) is encountered, whether to convert and return an Boolean/IntegerArray. - storage : {"pyarrow", "pyarrow_numpy"}, default "pyarrow_numpy" + storage : {"python", "pyarrow", "pyarrow_numpy"}, default "python" Backend storage Returns diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4c199d8f604b6..2642195be53d8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1717,7 +1717,10 @@ def map_array( elif isinstance(arr.dtype, ExtensionDtype): arr = cast("ExtensionArray", arr) arr_dtype = arr.dtype.__repr__() - if "pyarrow" in arr_dtype: + if "python" in arr_dtype: + storage = "python" + values = np.asarray(arr) + elif "pyarrow" in arr_dtype: storage = "pyarrow" if "date" in arr_dtype: values = np.fromiter(arr._pa_array, dtype="O") From d6264e61fbb86ec88768e32289667ffd66f91a32 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 28 Jul 2024 05:54:04 +0000 Subject: [PATCH 052/163] Manage pyarrow and python storage in map dict like --- pandas/_libs/lib.pyx | 9 ++--- pandas/core/algorithms.py | 79 +++++++++++++++++++++++++-------------- 2 files changed, 54 insertions(+), 34 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0518e32f4f226..7a5a2bccb7b6f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2484,7 +2484,7 @@ def maybe_convert_objects(ndarray[object] objects, Whether to convert datetime, timedelta, period, interval types. 
dtype_if_all_nat : np.dtype, ExtensionDtype, or None, default None Dtype to cast to if we have all-NaT. - storage : {"python", "pyarrow", "pyarrow_numpy"}, default "python" + storage : {None, "python", "pyarrow", "pyarrow_numpy"}, default None Backend storage Returns @@ -2504,9 +2504,6 @@ def maybe_convert_objects(ndarray[object] objects, object val float64_t fnan = NaN - if storage is None: - storage="python" - if dtype_if_all_nat is not None: # in practice we don't expect to ever pass dtype_if_all_nat # without both convert_non_numeric, so disallow @@ -2967,7 +2964,7 @@ def map_infer_mask( input value is used. dtype : numpy.dtype The numpy dtype to use for the result ndarray. - storage : {"python", "pyarrow", "pyarrow_numpy"}, default "python" + storage : {None, "python", "pyarrow", "pyarrow_numpy"}, default None Backend storage Returns @@ -3041,7 +3038,7 @@ def map_infer( convert_to_nullable_dtype : bool, default False If an array-like object contains only integer or boolean values (and NaN) is encountered, whether to convert and return an Boolean/IntegerArray. - storage : {"python", "pyarrow", "pyarrow_numpy"}, default "python" + storage : {None, "python", "pyarrow", "pyarrow_numpy"}, default None Backend storage Returns diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2642195be53d8..a5d77770367ce 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1690,7 +1690,10 @@ def map_array( else: mapper = Series(mapper, dtype=np.float64) else: - mapper = Series(mapper) + if arr.dtype in ("string[pyarrow]", "string[python]"): + mapper = Series(mapper, dtype=arr.dtype) + else: + mapper = Series(mapper) if isinstance(mapper, ABCSeries): if na_action == "ignore": @@ -1706,33 +1709,7 @@ def map_array( if not len(arr): return arr.copy() - na_value = np.nan - mask = isna(arr) - storage = None - if isinstance(arr.dtype, BaseMaskedDtype): - arr = cast("BaseMaskedArray", arr) - values = arr._data - if arr._hasna: - na_value = arr.dtype.na_value - elif isinstance(arr.dtype, ExtensionDtype): - arr = cast("ExtensionArray", arr) - arr_dtype = arr.dtype.__repr__() - if "python" in arr_dtype: - storage = "python" - values = np.asarray(arr) - elif "pyarrow" in arr_dtype: - storage = "pyarrow" - if "date" in arr_dtype: - values = np.fromiter(arr._pa_array, dtype="O") - else: - values = np.asarray(arr) - else: - values = np.asarray(arr) - if arr._hasna: - na_value = arr.dtype.na_value - else: - # we must convert to python types - values = arr.astype(object, copy=False) + mask, na_value, storage, values = _build_map_infer_methods_params(arr) if na_action is None: return lib.map_infer( @@ -1752,3 +1729,49 @@ def map_array( convert_to_nullable_dtype=na_value is NA, storage=storage, ) + + +def _build_map_infer_methods_params(arr: ArrayLike): + """ + Process lib.map_infer and lib.map_infer_mask parameters from an array `arr` + + Parameters + ---------- + arr + + Returns + ------- + mask : np.ndarray[bool] + na_value : object + A value in `values` to consider missing. 
+ storage : {"python", "pyarrow", "pyarrow_numpy"}, default "python" + Backend storage + values : np.ndarray + Values to be processed by lib.map_infer and lib.map_infer_mask + + """ + na_value = np.nan + mask = isna(arr) + storage = "python" + if isinstance(arr.dtype, BaseMaskedDtype): + arr = cast("BaseMaskedArray", arr) + values = arr._data + if arr._hasna: + na_value = arr.dtype.na_value + + elif isinstance(arr.dtype, ExtensionDtype): + arr = cast("ExtensionArray", arr) + arr_dtype = arr.dtype.__repr__() + if "pyarrow" in arr_dtype and "date" in arr_dtype: + values = np.fromiter(arr._pa_array, dtype="O") + else: + values = np.asarray(arr) + if "pyarrow" in arr_dtype: + storage = "pyarrow" + if arr._hasna: + na_value = arr.dtype.na_value + + else: + # we must convert to python types + values = arr.astype(object, copy=False) + return mask, na_value, storage, values From 237926da5793c4a4478c49d00ebb1068109c64b9 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 28 Jul 2024 07:19:08 +0000 Subject: [PATCH 053/163] Correct wrong default storage type --- pandas/core/algorithms.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index a5d77770367ce..1a1eb12d86a6a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1752,7 +1752,7 @@ def _build_map_infer_methods_params(arr: ArrayLike): """ na_value = np.nan mask = isna(arr) - storage = "python" + storage = None if isinstance(arr.dtype, BaseMaskedDtype): arr = cast("BaseMaskedArray", arr) values = arr._data @@ -1768,6 +1768,8 @@ def _build_map_infer_methods_params(arr: ArrayLike): values = np.asarray(arr) if "pyarrow" in arr_dtype: storage = "pyarrow" + if "python" in arr_dtype: + storage = "python" if arr._hasna: na_value = arr.dtype.na_value From 0fc4b6007a4e20dbaa8e0edf3157b62ec59cdce1 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 1 Aug 2024 07:39:37 +0000 Subject: [PATCH 054/163] Add convert_non_numeric as map_infer_mask parameter --- pandas/_libs/lib.pyi | 2 ++ pandas/_libs/lib.pyx | 5 ++++- pandas/core/algorithms.py | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 27a40c62cdecb..eaf56c5ab652d 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -190,6 +190,7 @@ def map_infer_mask( na_value: Any = ..., dtype: np.dtype = ..., convert_to_nullable_dtype: bool = ..., + convert_non_numeric: bool = ..., storage: str | None = ..., ) -> np.ndarray: ... @overload @@ -202,6 +203,7 @@ def map_infer_mask( na_value: Any = ..., dtype: np.dtype = ..., convert_to_nullable_dtype: bool = ..., + convert_non_numeric: bool = ..., storage: str | None = ..., ) -> ArrayLike: ... def indices_fast( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index cf6ce0d08cd82..566d093d54e95 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2945,6 +2945,7 @@ def map_infer_mask( bint convert=True, object na_value=no_default, bint convert_to_nullable_dtype=False, + convert_non_numeric=False, cnp.dtype dtype=np.dtype(object), str storage=None, ) -> "ArrayLike": @@ -2962,6 +2963,8 @@ def map_infer_mask( na_value : Any, optional The result value to use for masked values. By default, the input value is used. + convert_non_numeric : bool, default False + Whether to convert datetime, timedelta, period, interval types. 
dtype : numpy.dtype The numpy dtype to use for the result ndarray. storage : {None, "python", "pyarrow", "pyarrow_numpy"}, default None @@ -3001,7 +3004,7 @@ def map_infer_mask( return maybe_convert_objects( result, convert_to_nullable_dtype=convert_to_nullable_dtype, - convert_non_numeric=True, + convert_non_numeric=convert_non_numeric, storage=storage, ) else: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1a1eb12d86a6a..9c9d851830bab 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1727,6 +1727,7 @@ def map_array( mask=mask, na_value=na_value, convert_to_nullable_dtype=na_value is NA, + convert_non_numeric=True, storage=storage, ) From 6b5c8db37d4fd4daf04ed8e8b928adaecfc6aa65 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 2 Aug 2024 07:57:26 +0000 Subject: [PATCH 055/163] pyarrow data are sent to map_infer as iterator --- pandas/core/algorithms.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9c9d851830bab..3a0ddf2a0b944 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1763,12 +1763,11 @@ def _build_map_infer_methods_params(arr: ArrayLike): elif isinstance(arr.dtype, ExtensionDtype): arr = cast("ExtensionArray", arr) arr_dtype = arr.dtype.__repr__() - if "pyarrow" in arr_dtype and "date" in arr_dtype: + if "pyarrow" in arr_dtype: + storage = "pyarrow" values = np.fromiter(arr._pa_array, dtype="O") else: values = np.asarray(arr) - if "pyarrow" in arr_dtype: - storage = "pyarrow" if "python" in arr_dtype: storage = "python" if arr._hasna: From 8578b1ef376805f4f2c4466ece5f264df532ca6d Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 2 Aug 2024 08:00:19 +0000 Subject: [PATCH 056/163] Add method _maybe_convert_pyarrow_objects --- pandas/_libs/lib.pyx | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 566d093d54e95..e943f93f27c00 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2451,6 +2451,20 @@ def maybe_convert_numeric( return (ints, None) +@cython.boundscheck(False) +@cython.wraparound(False) +def _maybe_convert_pyarrow_objects( + ndarray[object] objects, + ndarray[uint8_t] mask, + Seen seen) -> "ArrayLike": + from pandas.core.dtypes.dtypes import ArrowDtype + + objects[mask] = None + pa_array = pa.array(objects) + dtype = ArrowDtype(pa_array.type) + return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) + + @cython.boundscheck(False) @cython.wraparound(False) def maybe_convert_objects(ndarray[object] objects, @@ -2666,6 +2680,8 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break + if storage == "pyarrow": + return _maybe_convert_pyarrow_objects(objects, mask, seen) # we try to coerce datetime w/tz but must all have the same tz if seen.datetimetz_: From 3fb8b0de4ed97f7c8189ffbc109fdbcd3f9bde12 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 2 Aug 2024 08:00:50 +0000 Subject: [PATCH 057/163] Remove check_dtype --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 1d0bb5c080015..5dad77b1a10f5 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -288,7 +288,7 
@@ def test_map(self, data_missing, na_action): else: result = data_missing.map(lambda x: x, na_action=na_action) expected = data_missing - tm.assert_extension_array_equal(result, expected, check_dtype=False) + tm.assert_extension_array_equal(result, expected) def test_astype_str(self, data, request): pa_dtype = data.dtype.pyarrow_dtype From b7de2924dba4bc3cb0f76a854372b1c5514f7143 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 2 Aug 2024 08:01:36 +0000 Subject: [PATCH 058/163] Code simplification --- pandas/core/arrays/arrow/array.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7afaf8c635f12..3e9c435ad5828 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -61,7 +61,6 @@ from pandas.core.arrays.masked import BaseMaskedArray from pandas.core.arrays.string_ import StringDtype import pandas.core.common as com -from pandas.core.construction import array as pd_array from pandas.core.indexers import ( check_array_indexer, unpack_tuple_and_ellipses, @@ -1428,11 +1427,7 @@ def to_numpy( return result def map(self, mapper, na_action: Literal["ignore"] | None = None): - result = super().map(mapper, na_action) - if isinstance(result.dtype, StringDtype): - return result - else: - return pd_array(result, dtype=result.dtype) + return super().map(mapper, na_action) @doc(ExtensionArray.duplicated) def duplicated( From a42048f0c499f72d6d6becdf6a90979de33f5989 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 2 Aug 2024 08:19:03 +0000 Subject: [PATCH 059/163] Manage default storage value --- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e943f93f27c00..3cf312a19d072 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2794,7 +2794,7 @@ def maybe_convert_objects(ndarray[object] objects, dtype = StringDtype(storage="pyarrow", na_value=np.nan) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - elif storage == "pyarrow" or storage == "python": + elif storage is None or storage == "python": from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(storage=storage) From 4c61857c6d65ac4dc56cb9da82aaf829a237dd24 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 2 Aug 2024 09:39:55 +0000 Subject: [PATCH 060/163] ord(x) return a TypeError if x is a pyarrow.lib.LargeStringScalar --- pandas/tests/reshape/merge/test_multi.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 7ae2fffa04205..45d36ae886d96 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -98,8 +98,14 @@ def test_left_join_multi_index(self, sort, infer_string): with option_context("future.infer_string", infer_string): icols = ["1st", "2nd", "3rd"] + def ord_func(x): + if infer_string: + # ord(x) return a TypeError if x is a pyarrow.lib.LargeStringScalar + return ord(str(x)) + return ord(x) + def bind_cols(df): - iord = lambda a: 0 if a != a else ord(a) + iord = lambda a: 0 if a != a else ord_func(a) f = lambda ts: ts.map(iord) - ord("a") return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10 From 2d92818669fe7e61241ecd3f8bd2788d0fca02e2 Mon 
Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sat, 3 Aug 2024 06:42:05 +0000 Subject: [PATCH 061/163] Manage str.encode for pyarrow.lib.LargeStringScalar --- pandas/core/strings/object_array.py | 11 +++++++++-- pandas/tests/strings/test_strings.py | 11 ++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 290a28ab60ae1..9d4aed176f0fa 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -239,8 +239,15 @@ def _str_fullmatch( return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def _str_encode(self, encoding, errors: str = "strict"): - f = lambda x: x.encode(encoding, errors=errors) - return self._str_map(f, dtype=object) + def encode_func(x): + if x is str: + return x.encode(encoding=encoding, errors=errors) + else: + # Manage AttributeError: 'pyarrow.lib.LargeStringScalar' + # object has no attribute 'encode' + return str(x).encode(encoding=encoding, errors=errors) + + return self._str_map(encode_func, dtype=object) def _str_find(self, sub, start: int = 0, end=None): return self._str_find_(sub, start, end, side="left") diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 1ce46497c3c22..c8acb936e3d2c 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -561,7 +561,16 @@ def test_encode_errors_kwarg(any_string_dtype): ser.str.encode("cp1252") result = ser.str.encode("cp1252", "ignore") - expected = ser.map(lambda x: x.encode("cp1252", "ignore")) + + def encode_func(x): + if x is str: + return x.encode("cp1252", "ignore") + else: + # Manage AttributeError: 'pyarrow.lib.LargeStringScalar' + # object has no attribute 'encode' + return str(x).encode("cp1252", "ignore") + + expected = ser.map(encode_func).astype("object") tm.assert_series_equal(result, expected) From 56f8f161162cde4abc16077ddd7ab5588793dddc Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 4 Aug 2024 07:18:29 +0000 Subject: [PATCH 062/163] Manage string convertible to nullable dtype --- pandas/_libs/lib.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 3cf312a19d072..4743fc0c30e44 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2794,7 +2794,10 @@ def maybe_convert_objects(ndarray[object] objects, dtype = StringDtype(storage="pyarrow", na_value=np.nan) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - elif storage is None or storage == "python": + elif ( + (convert_to_nullable_dtype and is_string_array(objects, skipna=True)) + or storage == "python" + ): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(storage=storage) From 88a54f78641ea17f39e0aeae22cd74dae2c01d11 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 5 Aug 2024 07:35:06 +0000 Subject: [PATCH 063/163] Manage Based masked dtype --- pandas/_libs/lib.pyx | 19 ++++++++++++++++++- pandas/tests/extension/test_masked.py | 2 +- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4743fc0c30e44..34ed78d208e74 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2465,6 +2465,20 @@ def _maybe_convert_pyarrow_objects( return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) 
+@cython.boundscheck(False) +@cython.wraparound(False) +def _maybe_convert_based_masked( + ndarray[object] objects, + ndarray[uint8_t] mask, + object type) -> "ArrayLike": + from pandas.core.dtypes.dtypes import BaseMaskedDtype + + from pandas.core.construction import array as pd_array + + dtype = BaseMaskedDtype.from_numpy_dtype(np.dtype(type)) + return pd_array(objects, dtype=dtype) + + @cython.boundscheck(False) @cython.wraparound(False) def maybe_convert_objects(ndarray[object] objects, @@ -2680,7 +2694,10 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break - if storage == "pyarrow": + + if type(val) is not object and convert_to_nullable_dtype: + return _maybe_convert_based_masked(objects, mask, type(val)) + elif storage == "pyarrow": return _maybe_convert_pyarrow_objects(objects, mask, seen) # we try to coerce datetime w/tz but must all have the same tz diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index abf73333650a5..6f76ca2cabf0b 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -172,7 +172,7 @@ class TestMaskedArrays(base.ExtensionTests): def test_map(self, data_missing, na_action): result = data_missing.map(lambda x: x, na_action=na_action) expected = data_missing - tm.assert_extension_array_equal(result, expected, check_dtype=False) + tm.assert_extension_array_equal(result, expected) def test_map_na_action_ignore(self, data_missing_for_sorting): zero = data_missing_for_sorting[2] From 6dbbf1318f6bfa967ac4c037221fe88407959bdd Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 5 Aug 2024 07:43:12 +0000 Subject: [PATCH 064/163] Code clean up --- pandas/core/arrays/masked.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 0ca2719ffd926..8659c21730795 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1319,8 +1319,7 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) def map(self, mapper, na_action: Literal["ignore"] | None = None): - result = map_array(self, mapper, na_action=na_action) - return pd_array(result, dtype=result.dtype) + return map_array(self, mapper, na_action=na_action) @overload def any( From 72eca608b2c10b480ee8ef3960533c70b9cf20d5 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 7 Aug 2024 13:33:17 +0000 Subject: [PATCH 065/163] Code simplification --- pandas/tests/extension/base/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 6ec847536647e..46914bafae599 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -101,7 +101,7 @@ def test_apply_simple_series(self, data): def test_map(self, data_missing, na_action): result = data_missing.map(lambda x: x, na_action=na_action) expected = data_missing - tm.assert_extension_array_equal(result, expected, check_dtype=False) + tm.assert_extension_array_equal(result, expected) def test_argsort(self, data_for_sorting): result = pd.Series(data_for_sorting).argsort() From d8a70b4a31009f38fa669a82be3cc1a34b393a58 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 7 Aug 2024 
13:38:19 +0000 Subject: [PATCH 066/163] Manage pyarrow string --- pandas/_libs/lib.pyx | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 34ed78d208e74..8c9faf53653f7 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2453,29 +2453,37 @@ def maybe_convert_numeric( @cython.boundscheck(False) @cython.wraparound(False) -def _maybe_convert_pyarrow_objects( - ndarray[object] objects, - ndarray[uint8_t] mask, - Seen seen) -> "ArrayLike": +def _convert_to_pyarrow( + ndarray[object] objects, + ndarray[uint8_t] mask) -> "ArrayLike": from pandas.core.dtypes.dtypes import ArrowDtype + from pandas.core.arrays.string_ import StringDtype + + na_value = None + if mask is not None and any(mask): + na_value = objects[mask][0] + objects[mask] = None pa_array = pa.array(objects) - dtype = ArrowDtype(pa_array.type) + + if pa.types.is_large_string(pa_array.type): + dtype = StringDtype(storage="pyarrow", na_value=na_value) + else: + dtype = ArrowDtype(pa_array.type) return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) @cython.boundscheck(False) @cython.wraparound(False) -def _maybe_convert_based_masked( - ndarray[object] objects, - ndarray[uint8_t] mask, - object type) -> "ArrayLike": +def _convert_to_based_masked( + ndarray[object] objects, + object numpy_dtype) -> "ArrayLike": from pandas.core.dtypes.dtypes import BaseMaskedDtype from pandas.core.construction import array as pd_array - dtype = BaseMaskedDtype.from_numpy_dtype(np.dtype(type)) + dtype = BaseMaskedDtype.from_numpy_dtype(numpy_dtype) return pd_array(objects, dtype=dtype) @@ -2549,6 +2557,7 @@ def maybe_convert_objects(ndarray[object] objects, uints = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_UINT64, 0) bools = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_UINT8, 0) mask = np.full(n, False) + val = None for i in range(n): val = objects[i] @@ -2695,10 +2704,11 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True break - if type(val) is not object and convert_to_nullable_dtype: - return _maybe_convert_based_masked(objects, mask, type(val)) + numpy_dtype = np.dtype(type(val)) + if numpy_dtype.kind in "biuf" and convert_to_nullable_dtype: + return _convert_to_based_masked(objects, numpy_dtype) elif storage == "pyarrow": - return _maybe_convert_pyarrow_objects(objects, mask, seen) + return _convert_to_pyarrow(objects, mask) # we try to coerce datetime w/tz but must all have the same tz if seen.datetimetz_: From 69269d722bdb880599729231110c1e24ec67870b Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 9 Aug 2024 05:45:36 +0000 Subject: [PATCH 067/163] Manage json and decimal extension array --- pandas/core/arrays/base.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0cbefe1b05c31..29d7de507c1fd 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2350,7 +2350,13 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): else: return result elif isinstance(result, np.ndarray): - return pd_array(result, result.dtype) + result_types = set(np.array([type(x) for x in result])) + + # if internal values types are compatible with self dtype + if all(issubclass(t, self.dtype.type) for t in result_types): + return pd_array(result, self.dtype) + else: + return pd_array(result, result.dtype) else: return result From 
d7d06145a0b4189e23db90b5d9b99675e00bb7bc Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 9 Aug 2024 05:46:16 +0000 Subject: [PATCH 068/163] Manage na_value in python string --- pandas/_libs/lib.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 9d5a1bbb746c6..d6dae59df8a06 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2827,7 +2827,11 @@ def maybe_convert_objects(ndarray[object] objects, ): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage=storage) + na_value = None + if mask is not None and any(mask): + na_value = objects[mask][0] + + dtype = StringDtype(storage=storage, na_value=na_value) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True From 0f242b233b8b4e90a6186f55160ca7bdbbf52f8b Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 9 Aug 2024 06:56:45 +0000 Subject: [PATCH 069/163] Cast to BasedMasked is limited to array containing one type --- pandas/_libs/lib.pyx | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d6dae59df8a06..939db6e6d7016 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2558,9 +2558,13 @@ def maybe_convert_objects(ndarray[object] objects, bools = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_UINT8, 0) mask = np.full(n, False) val = None + val_types = set() for i in range(n): val = objects[i] + if not checknull(val): + val_types.add(type(val)) + if itemsize_max != -1: itemsize = get_itemsize(val) if itemsize > itemsize_max or itemsize == -1: @@ -2705,7 +2709,10 @@ def maybe_convert_objects(ndarray[object] objects, break numpy_dtype = np.dtype(type(val)) - if numpy_dtype.kind in "biuf" and convert_to_nullable_dtype: + if ( + numpy_dtype.kind in "biuf" + and len(val_types) == 1 + and convert_to_nullable_dtype): return _convert_to_based_masked(objects, numpy_dtype) elif storage == "pyarrow": return _convert_to_pyarrow(objects, mask) @@ -2827,11 +2834,10 @@ def maybe_convert_objects(ndarray[object] objects, ): from pandas.core.arrays.string_ import StringDtype - na_value = None if mask is not None and any(mask): - na_value = objects[mask][0] - - dtype = StringDtype(storage=storage, na_value=na_value) + dtype = StringDtype(storage=storage, na_value=objects[mask][0]) + else: + dtype = StringDtype(storage=storage) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True From d293ce623e0c3a7e4c9c7f9c32625e7e0f5a0ce1 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 12 Aug 2024 09:39:22 +0000 Subject: [PATCH 070/163] numpy dtype is extracted from the identified types in object --- pandas/_libs/lib.pyx | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 939db6e6d7016..3a6f9102748db 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2460,7 +2460,7 @@ def _convert_to_pyarrow( from pandas.core.arrays.string_ import StringDtype - na_value = None + na_value = np.nan if mask is not None and any(mask): na_value = objects[mask][0] @@ -2708,13 +2708,15 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True break - numpy_dtype = np.dtype(type(val)) - if ( - numpy_dtype.kind in "biuf" - and len(val_types) == 1 - and 
convert_to_nullable_dtype): - return _convert_to_based_masked(objects, numpy_dtype) - elif storage == "pyarrow": + numpy_dtype = None + if len(val_types) == 1: + numpy_dtype = np.dtype(val_types.pop()) + if ( + numpy_dtype.kind in "biuf" + and len(val_types) == 1 + and convert_to_nullable_dtype): + return _convert_to_based_masked(objects, numpy_dtype) + if storage == "pyarrow": return _convert_to_pyarrow(objects, mask) # we try to coerce datetime w/tz but must all have the same tz From d19fb2cb70c477e547656f53721bf618ce5829a4 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 12 Aug 2024 09:39:48 +0000 Subject: [PATCH 071/163] Correct typo in exception --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 4fa33977b579d..670ca16087b81 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -165,7 +165,7 @@ def __init__( # a consistent NaN value (and we can use `dtype.na_value is np.nan`) na_value = np.nan elif na_value is not libmissing.NA: - raise ValueError("'na_value' must be np.nan or pd.NA, got {na_value}") + raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}") self.storage = storage self._na_value = na_value From 9363be64285eb5ab2527f449a724ce5149fe6cbf Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 12 Aug 2024 10:07:59 +0000 Subject: [PATCH 072/163] Correct typo in based masked array conversion --- pandas/_libs/lib.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 3a6f9102748db..607e1e99e1022 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2713,7 +2713,6 @@ def maybe_convert_objects(ndarray[object] objects, numpy_dtype = np.dtype(val_types.pop()) if ( numpy_dtype.kind in "biuf" - and len(val_types) == 1 and convert_to_nullable_dtype): return _convert_to_based_masked(objects, numpy_dtype) if storage == "pyarrow": From 434cb7e82a33ab3754c00afe9aac02a8698c80a4 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 21 Apr 2024 22:33:00 +0200 Subject: [PATCH 073/163] Integrate map evolution without to_numpy() conversion --- pandas/core/arrays/masked.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 92ed690e527c7..8659c21730795 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1319,7 +1319,7 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) def map(self, mapper, na_action: Literal["ignore"] | None = None): - return map_array(self.to_numpy(), mapper, na_action=na_action) + return map_array(self, mapper, na_action=na_action) @overload def any( From adc8493efdae03c90c64f7306a3e2dd774d8428a Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 21 Apr 2024 22:41:30 +0200 Subject: [PATCH 074/163] Add test for map operation applied on series supporting NA as na_value --- pandas/tests/series/methods/test_map.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index fe84ffafa70b4..ad8487d3c0d73 100644 --- a/pandas/tests/series/methods/test_map.py +++ 
b/pandas/tests/series/methods/test_map.py @@ -257,6 +257,13 @@ def test_map_int(): assert not isna(merged["c"]) +def test_map_int_with_pd_na(): + s = Series([pd.NA, 42], dtype="Int64") + result = s.map(lambda x: 1 if x is pd.NA else 2) + expected = Series([1, 2]) + tm.assert_series_equal(result, expected) + + def test_map_type_inference(): s = Series(range(3)) s2 = s.map(lambda x: np.where(x == 0, 0, 1)) From 83d7093b048f9612dcc98b23b38f80c258c91171 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Tue, 23 Apr 2024 22:35:38 +0200 Subject: [PATCH 075/163] Adapt test_map test to take into account series containing pd.NA --- pandas/tests/extension/test_masked.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 3b9079d06e231..8dff82029d34d 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -17,6 +17,7 @@ import numpy as np import pytest +from pandas._libs import missing as libmissing from pandas.compat import ( IS64, is_platform_windows, @@ -171,17 +172,13 @@ class TestMaskedArrays(base.ExtensionTests): @pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map(self, data_missing, na_action): result = data_missing.map(lambda x: x, na_action=na_action) - if data_missing.dtype == Float32Dtype(): - # map roundtrips through objects, which converts to float64 - expected = data_missing.to_numpy(dtype="float64", na_value=np.nan) - else: - expected = data_missing.to_numpy() + expected = data_missing.astype(object, copy=True) tm.assert_numpy_array_equal(result, expected) def test_map_na_action_ignore(self, data_missing_for_sorting): zero = data_missing_for_sorting[2] result = data_missing_for_sorting.map(lambda x: zero, na_action="ignore") - if data_missing_for_sorting.dtype.kind == "b": + if data_missing_for_sorting.dtype.na_value is libmissing.NA: expected = np.array([False, pd.NA, False], dtype=object) else: expected = np.array([zero, np.nan, zero]) From df473d7c66130e60d486e3741719b5bd4f627baa Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 26 Apr 2024 20:34:34 +0200 Subject: [PATCH 076/163] Add an entry in Conversion section (issue 57390) --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ee9d18d0c7ce2..cbdb15422a069 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -541,6 +541,7 @@ Conversion - Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`) - Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`) - Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`) +- Bug in :meth:`BaseMaskedArray.map` was casting ``pd.NA`` to ``np.nan``. 
(:issue:`57390`) Strings ^^^^^^^ From 4cb8fcf35e343061f613a256646065cb54300aae Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 27 May 2024 13:18:31 +0200 Subject: [PATCH 077/163] Add the possibility to process pd.NA values --- pandas/_libs/lib.pyi | 4 ++++ pandas/_libs/lib.pyx | 17 +++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index daaaacee3487d..ec679f7bfd9c9 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -74,6 +74,8 @@ def map_infer( *, convert: Literal[False], ignore_na: bool = ..., + mask: np.ndarray = ..., + na_value: Any = ..., ) -> np.ndarray: ... @overload def map_infer( @@ -82,6 +84,8 @@ def map_infer( *, convert: bool = ..., ignore_na: bool = ..., + mask: np.ndarray = ..., + na_value: Any = ..., ) -> ArrayLike: ... @overload def maybe_convert_objects( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 489d4fa111d40..3736c1fff7d41 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2925,7 +2925,13 @@ def map_infer_mask( @cython.boundscheck(False) @cython.wraparound(False) def map_infer( - ndarray arr, object f, *, bint convert=True, bint ignore_na=False + ndarray arr, + object f, + *, + bint convert=True, + bint ignore_na=False, + const uint8_t[:] mask=None, + object na_value=None, ) -> "ArrayLike": """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2937,6 +2943,10 @@ def map_infer( convert : bint ignore_na : bint If True, NA values will not have f applied + mask : ndarray, optional + uint8 dtype ndarray indicating na_value to apply `f` to. + na_value : Any, optional + The input value to use for masked values. Returns ------- @@ -2953,7 +2963,10 @@ def map_infer( if ignore_na and checknull(arr[i]): result[i] = arr[i] continue - val = f(arr[i]) + elif mask is not None and na_value is not None and mask[i]: + val = f(na_value) + else: + val = f(arr[i]) if cnp.PyArray_IsZeroDim(val): # unbox 0-dim arrays, GH#690 From 20dd8e1a446e5b68b2d85705ccd9ffd0707efc47 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 27 May 2024 13:19:37 +0200 Subject: [PATCH 078/163] Add the possibility to process pd.NA values --- pandas/core/algorithms.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 948836bf6a51d..daaa9f94c155f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -22,6 +22,7 @@ hashtable as htable, iNaT, lib, + missing as libmissing, ) from pandas._typing import ( AnyArrayLike, @@ -1698,8 +1699,25 @@ def map_array( return arr.copy() # we must convert to python types - values = arr.astype(object, copy=False) + values_as_object = arr.astype(object, copy=False) if na_action is None: - return lib.map_infer(values, mapper) + if ( + isinstance(arr, ABCExtensionArray) + and arr._hasna + and arr.dtype.na_value is libmissing.NA + ): + return lib.map_infer( + arr._data, + mapper, + mask=isna(values_as_object).view(np.uint8), + na_value=arr._na_value, + ) + else: + return lib.map_infer(arr.astype(object, copy=False), mapper) else: - return lib.map_infer_mask(values, mapper, mask=isna(values).view(np.uint8)) + return lib.map_infer_mask( + arr.astype(object, copy=False), + mapper, + mask=isna(values_as_object).view(np.uint8), + convert=True, + ) From 45bc299267d7738764b75cc09620ca3e6f34e70d Mon Sep 17 00:00:00 2001 From: 
droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 27 May 2024 13:20:21 +0200 Subject: [PATCH 079/163] Remove test ambiguity with pd.NA processing --- pandas/tests/series/methods/test_map.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index ad8487d3c0d73..031d5b11ed959 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -258,9 +258,9 @@ def test_map_int(): def test_map_int_with_pd_na(): - s = Series([pd.NA, 42], dtype="Int64") - result = s.map(lambda x: 1 if x is pd.NA else 2) - expected = Series([1, 2]) + s = Series([pd.NA, 11, 22, pd.NA], dtype="Int64") + result = s.map(lambda x: 5 if x is pd.NA else 2 * x) + expected = Series([5, 22, 44, 5]) tm.assert_series_equal(result, expected) From e38dac0e2c51174124536400bd818ba5452c7cb7 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 27 May 2024 13:53:05 +0200 Subject: [PATCH 080/163] Code clean up --- pandas/core/algorithms.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index daaa9f94c155f..9a0ce52e04781 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1716,8 +1716,5 @@ def map_array( return lib.map_infer(arr.astype(object, copy=False), mapper) else: return lib.map_infer_mask( - arr.astype(object, copy=False), - mapper, - mask=isna(values_as_object).view(np.uint8), - convert=True, + values_as_object, mapper, mask=isna(values_as_object).view(np.uint8) ) From 5fb2d6ca1a09a443a1ee473a3fa0010c887661af Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 27 May 2024 11:42:06 +0000 Subject: [PATCH 081/163] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/core/arrays/masked.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 8659c21730795..483ddcdccf274 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -3,6 +3,7 @@ from typing import ( TYPE_CHECKING, Any, + Callable, Literal, cast, overload, @@ -72,7 +73,6 @@ from pandas.core.util.hashing import hash_array if TYPE_CHECKING: - from collections.abc import Callable from collections.abc import ( Iterator, Sequence, @@ -1198,7 +1198,7 @@ def _wrap_na_result(self, *, name, axis, mask_size): mask = np.ones(mask_size, dtype=bool) float_dtyp = "float32" if self.dtype == "Float32" else "float64" - if name in ["mean", "median", "var", "std", "skew", "kurt", "sem"]: + if name in ["mean", "median", "var", "std", "skew", "kurt"]: np_dtype = float_dtyp elif name in ["min", "max"] or self.dtype.itemsize == 8: np_dtype = self.dtype.numpy_dtype.name From a92fb759a64d0c268ff30401280e0115e7287181 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 29 May 2024 10:20:19 +0200 Subject: [PATCH 082/163] Limit NA management to BooleanArray, FloatingArray and IntegerArray types --- pandas/core/algorithms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9a0ce52e04781..fbfc6ac59bb9b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1702,7 +1702,7 @@ def map_array( values_as_object = 
arr.astype(object, copy=False) if na_action is None: if ( - isinstance(arr, ABCExtensionArray) + isinstance(arr.dtype, BaseMaskedDtype) and arr._hasna and arr.dtype.na_value is libmissing.NA ): @@ -1710,10 +1710,10 @@ def map_array( arr._data, mapper, mask=isna(values_as_object).view(np.uint8), - na_value=arr._na_value, + na_value=arr.dtype.na_value, ) else: - return lib.map_infer(arr.astype(object, copy=False), mapper) + return lib.map_infer(values_as_object, mapper) else: return lib.map_infer_mask( values_as_object, mapper, mask=isna(values_as_object).view(np.uint8) From d8005edcc9ba54b2e114d1b127a32614f3a656dd Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 29 May 2024 23:03:41 +0200 Subject: [PATCH 083/163] Try to correct BaseMaskedArray cast error detected by mypy --- pandas/core/algorithms.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index fbfc6ac59bb9b..0c367f3026e82 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1701,17 +1701,15 @@ def map_array( # we must convert to python types values_as_object = arr.astype(object, copy=False) if na_action is None: - if ( - isinstance(arr.dtype, BaseMaskedDtype) - and arr._hasna - and arr.dtype.na_value is libmissing.NA - ): - return lib.map_infer( - arr._data, - mapper, - mask=isna(values_as_object).view(np.uint8), - na_value=arr.dtype.na_value, - ) + if isinstance(arr.dtype, BaseMaskedDtype): + arr = cast("BaseMaskedArray", arr) + if arr._hasna and arr.dtype.na_value is libmissing.NA: + return lib.map_infer( + arr._data, + mapper, + mask=isna(values_as_object).view(np.uint8), + na_value=arr.dtype.na_value, + ) else: return lib.map_infer(values_as_object, mapper) else: From 99007105ba1b4bd49fd67ed5c0e029466cfd2545 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 29 May 2024 23:32:44 +0200 Subject: [PATCH 084/163] Correct typo error: missing else condition --- pandas/core/algorithms.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0c367f3026e82..97e9f75203c0b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1710,6 +1710,8 @@ def map_array( mask=isna(values_as_object).view(np.uint8), na_value=arr.dtype.na_value, ) + else: + return lib.map_infer(arr._data, mapper) else: return lib.map_infer(values_as_object, mapper) else: From dfdf6ed8092f5dcc57aeb2a37ec6019bf6e604a0 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 30 May 2024 16:52:04 +0200 Subject: [PATCH 085/163] Try to correct mypy error with mask parameter in map_infer --- pandas/_libs/lib.pyi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index ec679f7bfd9c9..a4b2045abfb98 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -74,7 +74,7 @@ def map_infer( *, convert: Literal[False], ignore_na: bool = ..., - mask: np.ndarray = ..., + mask: npt.NDArray[np.bool_] | None = ..., na_value: Any = ..., ) -> np.ndarray: ... @overload @@ -84,7 +84,7 @@ def map_infer( *, convert: bool = ..., ignore_na: bool = ..., - mask: np.ndarray = ..., + mask: npt.NDArray[np.bool_] | None = ..., na_value: Any = ..., ) -> ArrayLike: ... 
@overload From 451f05525d5e7560debabd115e3306edf2ca148b Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 31 May 2024 11:01:13 +0200 Subject: [PATCH 086/163] Code clean up: simplify map_infer calls --- pandas/core/algorithms.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 97e9f75203c0b..ecbf2ed1a0006 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -22,7 +22,6 @@ hashtable as htable, iNaT, lib, - missing as libmissing, ) from pandas._typing import ( AnyArrayLike, @@ -1698,23 +1697,22 @@ def map_array( if not len(arr): return arr.copy() - # we must convert to python types - values_as_object = arr.astype(object, copy=False) - if na_action is None: - if isinstance(arr.dtype, BaseMaskedDtype): - arr = cast("BaseMaskedArray", arr) - if arr._hasna and arr.dtype.na_value is libmissing.NA: - return lib.map_infer( - arr._data, - mapper, - mask=isna(values_as_object).view(np.uint8), - na_value=arr.dtype.na_value, - ) - else: - return lib.map_infer(arr._data, mapper) - else: - return lib.map_infer(values_as_object, mapper) + na_value = None + if isinstance(arr.dtype, BaseMaskedDtype): + arr = cast("BaseMaskedArray", arr) + values = arr._data + if arr._hasna: + na_value = arr.dtype.na_value else: - return lib.map_infer_mask( - values_as_object, mapper, mask=isna(values_as_object).view(np.uint8) + # we must convert to python types + values = arr.astype(object, copy=False) + mask = isna(arr) + if na_action is None: + return lib.map_infer( + values, + mapper, + mask=mask, + na_value=na_value, ) + else: + return lib.map_infer_mask(values, mapper, mask=mask) From a836ad1e2453038654b6d5a2f1a310008b5dd599 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 31 May 2024 12:48:44 +0200 Subject: [PATCH 087/163] Correct values input type for map_infer_mask --- pandas/core/algorithms.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ecbf2ed1a0006..3e0f3073eaa94 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1698,7 +1698,8 @@ def map_array( return arr.copy() na_value = None - if isinstance(arr.dtype, BaseMaskedDtype): + mask = isna(arr) + if isinstance(arr.dtype, BaseMaskedDtype) and na_action is None: arr = cast("BaseMaskedArray", arr) values = arr._data if arr._hasna: @@ -1706,7 +1707,7 @@ def map_array( else: # we must convert to python types values = arr.astype(object, copy=False) - mask = isna(arr) + if na_action is None: return lib.map_infer( values, From 01a8d88ad3398f1be0d207df042a6ee9c737cce9 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Tue, 11 Jun 2024 18:29:53 +0200 Subject: [PATCH 088/163] Manage ExtensionArray and convert to nullable dtype --- pandas/core/algorithms.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 3e0f3073eaa94..40b8a33289867 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1704,6 +1704,11 @@ def map_array( values = arr._data if arr._hasna: na_value = arr.dtype.na_value + elif isinstance(arr.dtype, ExtensionDtype) and na_action is None: + arr = cast("ExtensionArray", arr) + values = np.asarray(arr) + if arr._hasna: + na_value = arr.dtype.na_value else: # we must convert to python 
types values = arr.astype(object, copy=False) @@ -1714,6 +1719,7 @@ def map_array( mapper, mask=mask, na_value=na_value, + convert_to_nullable_dtype=True, ) else: return lib.map_infer_mask(values, mapper, mask=mask) From 0f3bfaa8a2425f17d8db0df316d27112711054ea Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Tue, 11 Jun 2024 18:36:57 +0200 Subject: [PATCH 089/163] Add convert_to_nullable_dtype to map_infer (used in maybe_convert_objects) --- pandas/_libs/lib.pyi | 2 ++ pandas/_libs/lib.pyx | 9 ++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index a4b2045abfb98..8f4c5ec2b4e36 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -76,6 +76,7 @@ def map_infer( ignore_na: bool = ..., mask: npt.NDArray[np.bool_] | None = ..., na_value: Any = ..., + convert_to_nullable_dtype: Literal[False] = ..., ) -> np.ndarray: ... @overload def map_infer( @@ -86,6 +87,7 @@ def map_infer( ignore_na: bool = ..., mask: npt.NDArray[np.bool_] | None = ..., na_value: Any = ..., + convert_to_nullable_dtype: Literal[False] = ..., ) -> ArrayLike: ... @overload def maybe_convert_objects( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 3736c1fff7d41..0518c60b385d3 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2932,6 +2932,7 @@ def map_infer( bint ignore_na=False, const uint8_t[:] mask=None, object na_value=None, + bint convert_to_nullable_dtype=False, ) -> "ArrayLike": """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2947,6 +2948,9 @@ def map_infer( uint8 dtype ndarray indicating na_value to apply `f` to. na_value : Any, optional The input value to use for masked values. + convert_to_nullable_dtype : bool, default False + If an array-like object contains only integer or boolean values (and NaN) is + encountered, whether to convert and return an Boolean/IntegerArray. Returns ------- @@ -2975,7 +2979,10 @@ def map_infer( result[i] = val if convert: - return maybe_convert_objects(result) + return maybe_convert_objects( + result, + convert_to_nullable_dtype=convert_to_nullable_dtype + ) else: return result From 122106930bfadb44fd608bf8e15313ed2e48ecbe Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 12 Jun 2024 19:28:23 +0200 Subject: [PATCH 090/163] Add convert_to_nullable_dtype to map_infer_mask (used in maybe_convert_objects) --- pandas/_libs/lib.pyi | 2 ++ pandas/_libs/lib.pyx | 6 +++++- pandas/core/algorithms.py | 4 +++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 8f4c5ec2b4e36..909ac8100b75f 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -184,6 +184,7 @@ def map_infer_mask( convert: Literal[False], na_value: Any = ..., dtype: np.dtype = ..., + convert_to_nullable_dtype: bool = ..., ) -> np.ndarray: ... @overload def map_infer_mask( @@ -194,6 +195,7 @@ def map_infer_mask( convert: bool = ..., na_value: Any = ..., dtype: np.dtype = ..., + convert_to_nullable_dtype: bool = ..., ) -> ArrayLike: ... 
def indices_fast( index: npt.NDArray[np.intp], diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0518c60b385d3..bac4020571831 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2864,6 +2864,7 @@ def map_infer_mask( *, bint convert=True, object na_value=no_default, + bint convert_to_nullable_dtype=False, cnp.dtype dtype=np.dtype(object) ) -> "ArrayLike": """ @@ -2917,7 +2918,10 @@ def map_infer_mask( PyArray_ITER_NEXT(result_it) if convert: - return maybe_convert_objects(result) + return maybe_convert_objects( + result, + convert_to_nullable_dtype=convert_to_nullable_dtype + ) else: return result diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 40b8a33289867..386b488169457 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1722,4 +1722,6 @@ def map_array( convert_to_nullable_dtype=True, ) else: - return lib.map_infer_mask(values, mapper, mask=mask) + return lib.map_infer_mask( + values, mapper, mask=mask, convert_to_nullable_dtype=True + ) From d70be9d2578acff625686f9225e7fdff6b18680d Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 12 Jun 2024 19:31:52 +0200 Subject: [PATCH 091/163] Conversion to numpy object is not necessary anymore --- pandas/core/arrays/arrow/array.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d07bfeda50e1d..ec8b19cf8b1b4 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -51,7 +51,6 @@ ops, roperator, ) -from pandas.core.algorithms import map_array from pandas.core.arraylike import OpsMixin from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays._utils import to_numpy_dtype_inference @@ -1426,10 +1425,8 @@ def to_numpy( return result def map(self, mapper, na_action: Literal["ignore"] | None = None): - if is_numeric_dtype(self.dtype): - return map_array(self.to_numpy(), mapper, na_action=na_action) - else: - return super().map(mapper, na_action) + result = super().map(mapper, na_action) + return ArrowExtensionArray._from_sequence(result, dtype=result.dtype.type) @doc(ExtensionArray.duplicated) def duplicated( From b77da73faaa9981f80465b427b78cee13eee311e Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 12 Jun 2024 19:37:55 +0200 Subject: [PATCH 092/163] Tests results are verified as ExtensionArray --- pandas/tests/extension/test_arrow.py | 12 ++++-------- pandas/tests/extension/test_masked.py | 15 +++++++-------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index dbf353d87178f..1613790d7b280 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -283,16 +283,12 @@ def test_compare_scalar(self, data, comparison_op): def test_map(self, data_missing, na_action): if data_missing.dtype.kind in "mM": result = data_missing.map(lambda x: x, na_action=na_action) - expected = data_missing.to_numpy(dtype=object) - tm.assert_numpy_array_equal(result, expected) + expected = data_missing + tm.assert_extension_array_equal(result, expected, check_dtype=False) else: result = data_missing.map(lambda x: x, na_action=na_action) - if data_missing.dtype == "float32[pyarrow]": - # map roundtrips through objects, which converts to float64 - expected = data_missing.to_numpy(dtype="float64", 
na_value=np.nan) - else: - expected = data_missing.to_numpy() - tm.assert_numpy_array_equal(result, expected) + expected = data_missing + tm.assert_extension_array_equal(result, expected, check_dtype=False) def test_astype_str(self, data, request): pa_dtype = data.dtype.pyarrow_dtype diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 8dff82029d34d..abf73333650a5 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -17,7 +17,6 @@ import numpy as np import pytest -from pandas._libs import missing as libmissing from pandas.compat import ( IS64, is_platform_windows, @@ -172,17 +171,17 @@ class TestMaskedArrays(base.ExtensionTests): @pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map(self, data_missing, na_action): result = data_missing.map(lambda x: x, na_action=na_action) - expected = data_missing.astype(object, copy=True) - tm.assert_numpy_array_equal(result, expected) + expected = data_missing + tm.assert_extension_array_equal(result, expected, check_dtype=False) def test_map_na_action_ignore(self, data_missing_for_sorting): zero = data_missing_for_sorting[2] + na_value = data_missing_for_sorting.dtype.na_value result = data_missing_for_sorting.map(lambda x: zero, na_action="ignore") - if data_missing_for_sorting.dtype.na_value is libmissing.NA: - expected = np.array([False, pd.NA, False], dtype=object) - else: - expected = np.array([zero, np.nan, zero]) - tm.assert_numpy_array_equal(result, expected) + expected = type(data_missing_for_sorting)._from_sequence( + [zero, na_value, zero], dtype=data_missing_for_sorting.dtype + ) + tm.assert_extension_array_equal(result, expected, check_dtype=False) def _get_expected_exception(self, op_name, obj, other): try: From ad8494aee6a513b947b465a7604c531f9ef0f22c Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 12 Jun 2024 19:40:19 +0200 Subject: [PATCH 093/163] Tests was extended to Int64, Float64 and boolean --- pandas/tests/series/methods/test_map.py | 39 ++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 031d5b11ed959..e955860fde05f 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -257,10 +257,41 @@ def test_map_int(): assert not isna(merged["c"]) -def test_map_int_with_pd_na(): - s = Series([pd.NA, 11, 22, pd.NA], dtype="Int64") - result = s.map(lambda x: 5 if x is pd.NA else 2 * x) - expected = Series([5, 22, 44, 5]) +@pytest.mark.parametrize( + "ser", + [ + Series([pd.NA, 11], dtype="Int64"), + Series([pd.NA, 11.0], dtype="Float64"), + Series([pd.NA, True], dtype="boolean"), + ], +) +def test_map_with_pd_na_input(ser): + func_return_values_only = ( + lambda x: ser.dtype.type(1) if x is pd.NA else ser.dtype.type(2 * x) + ) + result = ser.map(func_return_values_only) + expected = Series( + [func_return_values_only(ser[0]), func_return_values_only(ser[1])], + dtype=ser.dtype, + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "ser", + [ + Series([pd.NA, 11], dtype="Int64"), + Series([pd.NA, 11.0], dtype="Float64"), + Series([pd.NA, True], dtype="boolean"), + ], +) +def test_map_with_pd_na_output(ser): + func_return_value_and_na = lambda x: x if x is pd.NA else ser.dtype.type(2 * x) + result = ser.map(func_return_value_and_na) + expected = Series( + [func_return_value_and_na(ser[0]), 
func_return_value_and_na(ser[1])], + dtype=ser.dtype, + ) tm.assert_series_equal(result, expected) From 0ad45c777d8f524e9bd3c979f14fc41123657e76 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 12 Jun 2024 20:30:49 +0200 Subject: [PATCH 094/163] convert to nullable dtype only if there are nullable value --- pandas/core/algorithms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 386b488169457..2da337dec6e23 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1719,9 +1719,9 @@ def map_array( mapper, mask=mask, na_value=na_value, - convert_to_nullable_dtype=True, + convert_to_nullable_dtype=na_value is not None, ) else: return lib.map_infer_mask( - values, mapper, mask=mask, convert_to_nullable_dtype=True + values, mapper, mask=mask, convert_to_nullable_dtype=na_value is not None ) From f73e7b698bb54f0a83f4c63131910f00cb9f5c40 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 19 Jun 2024 21:05:55 +0200 Subject: [PATCH 095/163] Manage date and time dtype pyarrow as object --- pandas/core/algorithms.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2da337dec6e23..b85e4b5578c7c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1699,6 +1699,7 @@ def map_array( na_value = None mask = isna(arr) + storage = None if isinstance(arr.dtype, BaseMaskedDtype) and na_action is None: arr = cast("BaseMaskedArray", arr) values = arr._data @@ -1706,7 +1707,15 @@ def map_array( na_value = arr.dtype.na_value elif isinstance(arr.dtype, ExtensionDtype) and na_action is None: arr = cast("ExtensionArray", arr) - values = np.asarray(arr) + arr_dtype = arr.dtype.__repr__() + if "pyarrow" in arr_dtype: + if "date" in arr_dtype or "time" in arr_dtype: + values = arr.astype(object, copy=False) + else: + values = arr._pa_array.to_numpy() + storage = "pyarrow" + else: + values = np.asarray(arr) if arr._hasna: na_value = arr.dtype.na_value else: @@ -1720,6 +1729,7 @@ def map_array( mask=mask, na_value=na_value, convert_to_nullable_dtype=na_value is not None, + storage=storage, ) else: return lib.map_infer_mask( From 972957ca43ec30a7a80a4cf7e9d0bc1f6aa4001a Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 19 Jun 2024 21:35:59 +0200 Subject: [PATCH 096/163] Manage pyarrow string --- pandas/core/arrays/arrow/array.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index ec8b19cf8b1b4..c583abcfd1c4b 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -339,7 +339,12 @@ def _from_sequence_of_strings( elif pa.types.is_date(pa_type): from pandas.core.tools.datetimes import to_datetime - scalars = to_datetime(strings, errors="raise").date + if isinstance(strings, ExtensionArray) and isinstance( + strings.dtype, ArrowDtype + ): + scalars = to_datetime(strings._pa_array, errors="raise").date + else: + scalars = to_datetime(strings, errors="raise").date elif pa.types.is_duration(pa_type): from pandas.core.tools.timedeltas import to_timedelta @@ -1426,7 +1431,10 @@ def to_numpy( def map(self, mapper, na_action: Literal["ignore"] | None = None): result = super().map(mapper, na_action) - return 
ArrowExtensionArray._from_sequence(result, dtype=result.dtype.type) + if isinstance(result.dtype, StringDtype): + return result + else: + return ArrowExtensionArray._from_sequence(result, dtype=result.dtype.type) @doc(ExtensionArray.duplicated) def duplicated( From 4f6fd090e73e80334ee4d1c774157ad704438384 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 19 Jun 2024 22:06:59 +0200 Subject: [PATCH 097/163] Manage pyarrow string --- pandas/_libs/lib.pyi | 5 +++++ pandas/_libs/lib.pyx | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 909ac8100b75f..874b565a69b1a 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -77,6 +77,7 @@ def map_infer( mask: npt.NDArray[np.bool_] | None = ..., na_value: Any = ..., convert_to_nullable_dtype: Literal[False] = ..., + storage: str | None = ..., ) -> np.ndarray: ... @overload def map_infer( @@ -88,6 +89,7 @@ def map_infer( mask: npt.NDArray[np.bool_] | None = ..., na_value: Any = ..., convert_to_nullable_dtype: Literal[False] = ..., + storage: str | None = ..., ) -> ArrayLike: ... @overload def maybe_convert_objects( @@ -99,6 +101,7 @@ def maybe_convert_objects( convert_non_numeric: Literal[False] = ..., convert_to_nullable_dtype: Literal[False] = ..., dtype_if_all_nat: DtypeObj | None = ..., + storage: str | None = ..., ) -> npt.NDArray[np.object_ | np.number]: ... @overload def maybe_convert_objects( @@ -110,6 +113,7 @@ def maybe_convert_objects( convert_non_numeric: bool = ..., convert_to_nullable_dtype: Literal[True] = ..., dtype_if_all_nat: DtypeObj | None = ..., + storage: str | None = ..., ) -> ArrayLike: ... @overload def maybe_convert_objects( @@ -121,6 +125,7 @@ def maybe_convert_objects( convert_non_numeric: bool = ..., convert_to_nullable_dtype: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., + storage: str | None = ..., ) -> ArrayLike: ... 
@overload def maybe_convert_numeric( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index bac4020571831..e0dd7e75a02a0 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -37,7 +37,7 @@ from cython cimport ( floating, ) -from pandas._config import using_string_dtype +from pandas._config import using_pyarrow_string_dtype from pandas._libs.missing import check_na_tuples_nonequal From 59d4c3e5f264ad3b57a1c1e1c1e45b9135f7a218 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 19 Jun 2024 22:09:02 +0200 Subject: [PATCH 098/163] Manage BaseMaskedArray --- pandas/core/arrays/masked.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 483ddcdccf274..2fcb5db07327f 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1319,7 +1319,8 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) def map(self, mapper, na_action: Literal["ignore"] | None = None): - return map_array(self, mapper, na_action=na_action) + result = map_array(self, mapper, na_action=na_action) + return type(self)._from_sequence(result) @overload def any( From b8f8e23b60dc40fc125edadb2b63a5f3ee95cccb Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 19 Jun 2024 22:10:10 +0200 Subject: [PATCH 099/163] Test ExtensionArray directly --- pandas/tests/extension/base/methods.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index dd2ed0bd62a02..46914bafae599 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -100,8 +100,8 @@ def test_apply_simple_series(self, data): @pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map(self, data_missing, na_action): result = data_missing.map(lambda x: x, na_action=na_action) - expected = data_missing.to_numpy() - tm.assert_numpy_array_equal(result, expected) + expected = data_missing + tm.assert_extension_array_equal(result, expected) def test_argsort(self, data_for_sorting): result = pd.Series(data_for_sorting).argsort() From 41c13f3fda631d2cbc88df93e377082a6d32ca6c Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 19 Jun 2024 22:12:07 +0200 Subject: [PATCH 100/163] pyarrow data keep their original type if possible --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 1613790d7b280..2ca181b2582af 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3562,5 +3562,5 @@ def test_cast_dictionary_different_value_dtype(arrow_type): def test_map_numeric_na_action(): ser = pd.Series([32, 40, None], dtype="int64[pyarrow]") result = ser.map(lambda x: 42, na_action="ignore") - expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") + expected = pd.Series([42.0, 42.0, np.nan], dtype=ser.dtype) tm.assert_series_equal(result, expected) From f87ee613985e35aed215d9132df97b47d6b36c10 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 19 Jun 2024 22:14:21 +0200 Subject: [PATCH 101/163] If map returns only pd.NA values their type is double
pyarrow --- pandas/tests/series/methods/test_map.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index e955860fde05f..19e3a3f66aa99 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -232,7 +232,15 @@ def test_map_empty(request, index): s = Series(index) result = s.map({}) - expected = Series(np.nan, index=s.index) + na_value = np.nan + dtype = "float64" + + # In pyarrow double is the equivalent of float64 + # Cf: https://arrow.apache.org/docs/python/pandas.html#pandas-arrow-conversion + if "pyarrow" in s.dtype.__repr__(): + dtype = "double[pyarrow]" + na_value = pd.NA + expected = Series(na_value, index=s.index, dtype=dtype) tm.assert_series_equal(result, expected) @@ -283,6 +291,8 @@ def test_map_with_pd_na_input(ser): Series([pd.NA, 11], dtype="Int64"), Series([pd.NA, 11.0], dtype="Float64"), Series([pd.NA, True], dtype="boolean"), + Series([pd.NA, "AAA"], dtype="string"), + Series([pd.NA, "AAA"], dtype="string[pyarrow]"), ], ) def test_map_with_pd_na_output(ser): From 48c2dd54f851c30166d776e38b2f1eaee12d8e96 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 23 Jun 2024 20:34:48 +0200 Subject: [PATCH 102/163] Add storage to map_infer_mask --- pandas/_libs/lib.pyi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 874b565a69b1a..27a40c62cdecb 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -190,6 +190,7 @@ def map_infer_mask( na_value: Any = ..., dtype: np.dtype = ..., convert_to_nullable_dtype: bool = ..., + storage: str | None = ..., ) -> np.ndarray: ... @overload def map_infer_mask( @@ -201,6 +202,7 @@ def map_infer_mask( na_value: Any = ..., dtype: np.dtype = ..., convert_to_nullable_dtype: bool = ..., + storage: str | None = ..., ) -> ArrayLike: ... def indices_fast( index: npt.NDArray[np.intp], From d5aeef270dd2e193a7364bcdf2a1faefe19c5935 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 23 Jun 2024 20:35:28 +0200 Subject: [PATCH 103/163] Add storage to map_infer_mask --- pandas/_libs/lib.pyx | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e0dd7e75a02a0..9d814caee2875 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2865,7 +2865,8 @@ def map_infer_mask( bint convert=True, object na_value=no_default, bint convert_to_nullable_dtype=False, - cnp.dtype dtype=np.dtype(object) + cnp.dtype dtype=np.dtype(object), + str storage=None, ) -> "ArrayLike": """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2883,6 +2884,8 @@ def map_infer_mask( input value is used. dtype : numpy.dtype The numpy dtype to use for the result ndarray. 
+ storage : {"pyarrow", "pyarrow_numpy"}, default "pyarrow_numpy" + Backend storage Returns ------- @@ -2920,7 +2923,8 @@ def map_infer_mask( if convert: return maybe_convert_objects( result, - convert_to_nullable_dtype=convert_to_nullable_dtype + convert_to_nullable_dtype=convert_to_nullable_dtype, + storage=storage, ) else: return result From 9cd640f68889047a7fdaf84ab4d3ce2918f7f925 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 23 Jun 2024 20:36:37 +0200 Subject: [PATCH 104/163] Add empty dict as NA value for JSONArray extension --- pandas/_libs/missing.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 2f44128cda822..58843356f753e 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -148,6 +148,7 @@ cpdef bint checknull(object val): - np.timedelta64 representation of NaT - NA - Decimal("NaN") + - {} empty dict Parameters ---------- @@ -157,7 +158,7 @@ cpdef bint checknull(object val): ------- bool """ - if val is None or val is NaT or val is C_NA: + if val is None or val is NaT or val is C_NA or val == {}: return True elif util.is_float_object(val) or util.is_complex_object(val): if val != val: @@ -191,6 +192,7 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr): - np.timedelta64 representation of NaT - NA - Decimal("NaN") + - {} empty dict Parameters ---------- From 05c01e654102830a85571caef8c2b78da95d23d7 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 23 Jun 2024 20:50:41 +0200 Subject: [PATCH 105/163] Add storage parameter to map_infer_mask --- pandas/core/algorithms.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b85e4b5578c7c..e3c2577f1655e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -23,6 +23,7 @@ iNaT, lib, ) +from pandas._libs.missing import NA from pandas._typing import ( AnyArrayLike, ArrayLike, @@ -1700,16 +1701,18 @@ def map_array( na_value = None mask = isna(arr) storage = None - if isinstance(arr.dtype, BaseMaskedDtype) and na_action is None: + if isinstance(arr.dtype, BaseMaskedDtype): arr = cast("BaseMaskedArray", arr) values = arr._data if arr._hasna: na_value = arr.dtype.na_value - elif isinstance(arr.dtype, ExtensionDtype) and na_action is None: + elif isinstance(arr.dtype, ExtensionDtype): arr = cast("ExtensionArray", arr) arr_dtype = arr.dtype.__repr__() if "pyarrow" in arr_dtype: - if "date" in arr_dtype or "time" in arr_dtype: + if any( + time_type in arr_dtype for time_type in ["date", "time", "duration"] + ): values = arr.astype(object, copy=False) else: values = arr._pa_array.to_numpy() @@ -1728,10 +1731,14 @@ def map_array( mapper, mask=mask, na_value=na_value, - convert_to_nullable_dtype=na_value is not None, + convert_to_nullable_dtype=na_value is NA, storage=storage, ) else: return lib.map_infer_mask( - values, mapper, mask=mask, convert_to_nullable_dtype=na_value is not None + values, + mapper, + mask=mask, + convert_to_nullable_dtype=na_value is NA, + storage=storage, ) From 12444060c4754f24bc9c543ecbe54a033a55bc03 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 23 Jun 2024 20:52:34 +0200 Subject: [PATCH 106/163] Cast result to an extension array --- pandas/core/arrays/arrow/array.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index c583abcfd1c4b..d5f0bbbd85891 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -61,6 +61,7 @@ from pandas.core.arrays.masked import BaseMaskedArray from pandas.core.arrays.string_ import StringDtype import pandas.core.common as com +from pandas.core.construction import array as pd_array from pandas.core.indexers import ( check_array_indexer, unpack_tuple_and_ellipses, ) @@ -1434,7 +1435,7 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): result = super().map(mapper, na_action) if isinstance(result.dtype, StringDtype): return result else: - return ArrowExtensionArray._from_sequence(result, dtype=result.dtype.type) + return pd_array(result, dtype=result.dtype) @doc(ExtensionArray.duplicated) def duplicated( From 47e3c241d5e99b7a53f5bd3038b180d83b5d7625 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 23 Jun 2024 20:54:21 +0200 Subject: [PATCH 107/163] Cast result to a NumpyExtensionArray or an extension array --- pandas/core/arrays/base.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b429b7c1b1fc4..e70dd1ce1ab03 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -46,7 +46,10 @@ is_scalar, pandas_dtype, ) -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + ExtensionDtype, + NumpyEADtype, +) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCIndex, @@ -69,6 +72,7 @@ unique, ) from pandas.core.array_algos.quantile import quantile_with_mask +from pandas.core.construction import array as pd_array from pandas.core.missing import _fill_limit_area_1d from pandas.core.sorting import ( nargminmax, @@ -2339,7 +2343,13 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): If the function returns a tuple with more than one element a MultiIndex will be returned.
""" - return map_array(self, mapper, na_action=na_action) + result = map_array(self, mapper, na_action=na_action) + if isinstance(self.dtype, NumpyEADtype): + return pd_array(result, dtype=NumpyEADtype(result.dtype)) + elif isinstance(result, np.ndarray): + return pd_array(result) + else: + return result # ------------------------------------------------------------------------ # GroupBy Methods From 18ae900f280cc7cbc8e9d9c24e83005f1f056f61 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 23 Jun 2024 20:55:07 +0200 Subject: [PATCH 108/163] Cast result to an extension array --- pandas/core/arrays/masked.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 2fcb5db07327f..d3ee05c6b4611 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1320,7 +1320,7 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): def map(self, mapper, na_action: Literal["ignore"] | None = None): result = map_array(self, mapper, na_action=na_action) - return type(self)._from_sequence(result) + return pd_array(result, dtype=result.dtype) @overload def any( From 1df039623e4ae5aea5ec61734c16b1a8dab893fa Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 23 Jun 2024 20:55:42 +0200 Subject: [PATCH 109/163] Remove dtype test --- pandas/tests/extension/base/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 46914bafae599..6ec847536647e 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -101,7 +101,7 @@ def test_apply_simple_series(self, data): def test_map(self, data_missing, na_action): result = data_missing.map(lambda x: x, na_action=na_action) expected = data_missing - tm.assert_extension_array_equal(result, expected) + tm.assert_extension_array_equal(result, expected, check_dtype=False) def test_argsort(self, data_for_sorting): result = pd.Series(data_for_sorting).argsort() From 20de0405b29ebd6937cd0d87e9edb5be9d940ed7 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 24 Jun 2024 18:23:57 +0200 Subject: [PATCH 110/163] Take into account UserDict in checknull --- pandas/_libs/missing.pyx | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 58843356f753e..7369037387b9c 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -1,3 +1,4 @@ +from collections import UserDict from decimal import Decimal import numbers from sys import maxsize @@ -148,7 +149,7 @@ cpdef bint checknull(object val): - np.timedelta64 representation of NaT - NA - Decimal("NaN") - - {} empty dict + - {} empty dict or UserDict Parameters ---------- @@ -158,7 +159,12 @@ cpdef bint checknull(object val): ------- bool """ - if val is None or val is NaT or val is C_NA or val == {}: + if ( + val is None + or val is NaT + or val is C_NA + or (isinstance(val, (dict, UserDict)) and not val) + ): return True elif util.is_float_object(val) or util.is_complex_object(val): if val != val: @@ -192,7 +198,7 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr): - np.timedelta64 representation of NaT - NA - Decimal("NaN") - - {} empty dict + - {} empty dict or UserDict Parameters ---------- From 
7798eeeb82214b74254d5810243d27f1858e5f9c Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 24 Jun 2024 18:24:29 +0200 Subject: [PATCH 111/163] Take into account na_value in map_infer_mask --- pandas/core/algorithms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e3c2577f1655e..75f5343b447a6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1739,6 +1739,7 @@ def map_array( values, mapper, mask=mask, + na_value=na_value, convert_to_nullable_dtype=na_value is NA, storage=storage, ) From a88295f3da1dd8e59004a6a89db59f1555e75ae4 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 27 Jun 2024 15:01:57 +0200 Subject: [PATCH 112/163] Manage IntervalDtype --- pandas/core/arrays/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e70dd1ce1ab03..34f3cf585f22b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -48,6 +48,7 @@ ) from pandas.core.dtypes.dtypes import ( ExtensionDtype, + IntervalDtype, NumpyEADtype, ) from pandas.core.dtypes.generic import ( @@ -2346,6 +2347,8 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): result = map_array(self, mapper, na_action=na_action) if isinstance(self.dtype, NumpyEADtype): return pd_array(result, dtype=NumpyEADtype(result.dtype)) + if isinstance(self.dtype, IntervalDtype): + return result elif isinstance(result, np.ndarray): return pd_array(result) else: return result From e09f8784e19c05fbd4d95b08d31ecf1ff5967276 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 27 Jun 2024 16:05:33 +0200 Subject: [PATCH 113/163] Manage ArrowDtype int64 --- pandas/_libs/lib.pyx | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 9d814caee2875..493824f58176d 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2815,12 +2815,19 @@ def maybe_convert_objects(ndarray[object] objects, # TODO: do these after the itemsize check?
if (result is ints or result is uints) and convert_to_nullable_dtype: - from pandas.core.arrays import IntegerArray + if storage == "pyarrow": + from pandas.core.dtypes.dtypes import ArrowDtype + + dtype = ArrowDtype(pa.int64()) + return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + else: + from pandas.core.arrays import IntegerArray + + # Set these values to 1 to be deterministic, match + # IntegerDtype._internal_fill_value + result[mask] = 1 + result = IntegerArray(result, mask) - # Set these values to 1 to be deterministic, match - # IntegerDtype._internal_fill_value - result[mask] = 1 - result = IntegerArray(result, mask) elif result is floats and convert_to_nullable_dtype: from pandas.core.arrays import FloatingArray From 55b8992b60c4a7738e2858cd27a4ad01e59c5483 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 27 Jun 2024 23:55:05 +0200 Subject: [PATCH 114/163] Correct error in empty mapper management --- pandas/core/algorithms.py | 7 +++++-- pandas/tests/series/methods/test_map.py | 15 ++++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 75f5343b447a6..ef350db9dcb9f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1680,7 +1680,10 @@ def map_array( from pandas import Series if len(mapper) == 0: - mapper = Series(mapper, dtype=np.float64) + if is_extension_array_dtype(arr.dtype) and arr.dtype.na_value is NA: + mapper = Series(mapper, dtype=arr.dtype) + else: + mapper = Series(mapper, dtype=np.float64) else: mapper = Series(mapper) @@ -1698,7 +1701,7 @@ def map_array( if not len(arr): return arr.copy() - na_value = None + na_value = np.nan mask = isna(arr) storage = None if isinstance(arr.dtype, BaseMaskedDtype): diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 19e3a3f66aa99..7a04bb380f449 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_extension_array_dtype + import pandas as pd from pandas import ( DataFrame, @@ -232,14 +234,13 @@ def test_map_empty(request, index): s = Series(index) result = s.map({}) - na_value = np.nan - dtype = "float64" + if is_extension_array_dtype(s.dtype) and s.dtype.na_value is pd.NA: + na_value = s.dtype.na_value + dtype = s.dtype + else: + na_value = np.nan + dtype = "float64" - # In pyarrow double is the equivalent of float64 - # Cf: https://arrow.apache.org/docs/python/pandas.html#pandas-arrow-conversion - if "pyarrow" in s.dtype.__repr__(): - dtype = "double[pyarrow]" - na_value = pd.NA expected = Series(na_value, index=s.index, dtype=dtype) tm.assert_series_equal(result, expected) From f7d875c6a35ca934eeba3357d5d2c2e1059a215b Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Tue, 2 Jul 2024 21:47:12 +0200 Subject: [PATCH 115/163] Manage IntervalDtype --- pandas/core/algorithms.py | 7 ++++++- pandas/core/arrays/base.py | 12 ++++++------ pandas/tests/series/methods/test_map.py | 8 ++++++-- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ef350db9dcb9f..cd68e6a43b69b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -62,6 +62,7 @@ BaseMaskedDtype, CategoricalDtype, ExtensionDtype, + IntervalDtype, NumpyEADtype, ) from 
pandas.core.dtypes.generic import ( @@ -1680,7 +1681,11 @@ def map_array( from pandas import Series if len(mapper) == 0: - if is_extension_array_dtype(arr.dtype) and arr.dtype.na_value is NA: + if ( + is_extension_array_dtype(arr.dtype) + and not isinstance(arr.dtype, IntervalDtype) + and arr.dtype.na_value is NA + ): mapper = Series(mapper, dtype=arr.dtype) else: mapper = Series(mapper, dtype=np.float64) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 34f3cf585f22b..0cbefe1b05c31 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -48,7 +48,6 @@ ) from pandas.core.dtypes.dtypes import ( ExtensionDtype, - IntervalDtype, NumpyEADtype, ) from pandas.core.dtypes.generic import ( @@ -2345,12 +2344,13 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): a MultiIndex will be returned. """ result = map_array(self, mapper, na_action=na_action) - if isinstance(self.dtype, NumpyEADtype): - return pd_array(result, dtype=NumpyEADtype(result.dtype)) - if isinstance(self.dtype, IntervalDtype): - return result + if isinstance(result, ExtensionArray): + if isinstance(self.dtype, NumpyEADtype): + return pd_array(result, dtype=NumpyEADtype(result.dtype)) + else: + return result elif isinstance(result, np.ndarray): - return pd_array(result) + return pd_array(result, result.dtype) else: return result diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 7a04bb380f449..fd2b49c7798b0 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -9,6 +9,7 @@ import pytest from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.dtypes import IntervalDtype import pandas as pd from pandas import ( @@ -233,8 +234,11 @@ def test_map_empty(request, index): s = Series(index) result = s.map({}) - - if is_extension_array_dtype(s.dtype) and s.dtype.na_value is pd.NA: + if ( + is_extension_array_dtype(s.dtype) + and not isinstance(s.dtype, IntervalDtype) + and s.dtype.na_value is pd.NA + ): na_value = s.dtype.na_value dtype = s.dtype else: From 4f7574fdb22b8062eba45d76cabb8d1c0f999f14 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 14 Jul 2024 18:00:06 +0200 Subject: [PATCH 116/163] Try to manage date with pyarrow --- pandas/_libs/lib.pyx | 74 ++++++++++++++++++++++------ pandas/core/algorithms.py | 6 +-- pandas/tests/extension/test_arrow.py | 2 +- 3 files changed, 61 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 493824f58176d..a8039261d6cfa 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1300,6 +1300,7 @@ cdef class Seen: bint object_ # seen_object bint complex_ # seen_complex bint datetime_ # seen_datetime + bint date_ # seen_date bint coerce_numeric # coerce data to numeric bint timedelta_ # seen_timedelta bint datetimetz_ # seen_datetimetz @@ -1328,6 +1329,7 @@ cdef class Seen: self.object_ = False self.complex_ = False self.datetime_ = False + self.date_ = False self.timedelta_ = False self.datetimetz_ = False self.period_ = False @@ -2607,6 +2609,13 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break + elif PyDate_Check(val): + if convert_non_numeric: + seen.date_ = True + break + else: + seen.object_ = True + break elif is_period_object(val): if convert_non_numeric: seen.period_ = True @@ -2650,21 +2659,46 @@ def maybe_convert_objects(ndarray[object] objects, # we try to coerce 
datetime w/tz but must all have the same tz if seen.datetimetz_: - if is_datetime_with_singletz_array(objects): - from pandas import DatetimeIndex + if storage == "pyarrow": + from pandas.core.dtypes.dtypes import ArrowDtype - try: - dti = DatetimeIndex(objects) - except OutOfBoundsDatetime: - # e.g. test_to_datetime_cache_coerce_50_lines_outofbounds - pass + if isinstance(val, datetime): + objects[mask] = None else: - # unbox to DatetimeArray - return dti._data - seen.object_ = True + objects[mask] = np.datetime64("NaT") + datetime64_array = objects.astype(val.dtype) + pa_array = pa.array(datetime64_array) + dtype = ArrowDtype(pa_array.type) + return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) + + else: + if is_datetime_with_singletz_array(objects): + from pandas import DatetimeIndex + + try: + dti = DatetimeIndex(objects) + except OutOfBoundsDatetime: + # e.g. test_to_datetime_cache_coerce_50_lines_outofbounds + pass + else: + # unbox to DatetimeArray + return dti._data + seen.object_ = True elif seen.datetime_: - if is_datetime_or_datetime64_array(objects): + if storage == "pyarrow": + from pandas.core.dtypes.dtypes import ArrowDtype + + if isinstance(val, datetime): + objects[mask] = None + else: + objects[mask] = np.datetime64("NaT") + datetime64_array = objects.astype(val.dtype) + pa_array = pa.array(datetime64_array) + dtype = ArrowDtype(pa_array.type) + return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) + + elif is_datetime_or_datetime64_array(objects): from pandas import DatetimeIndex try: @@ -2676,6 +2710,16 @@ def maybe_convert_objects(ndarray[object] objects, return dti._data._ndarray seen.object_ = True + elif seen.date_: + if storage == "pyarrow": + + from pandas.core.dtypes.dtypes import ArrowDtype + + objects[mask] = None + pa_array = pa.array(objects) + dtype = ArrowDtype(pa_array.type) + return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) + elif seen.timedelta_: if is_timedelta_or_timedelta64_array(objects): from pandas import TimedeltaIndex @@ -2905,17 +2949,16 @@ def map_infer_mask( ndarray result = np.empty(n, dtype=dtype) - flatiter arr_it = PyArray_IterNew(arr) flatiter result_it = PyArray_IterNew(result) for i in range(n): if mask[i]: if na_value is no_default: - val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it)) + val = arr[i] else: val = na_value else: - val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it)) + val = arr[i] val = f(val) if cnp.PyArray_IsZeroDim(val): @@ -2923,14 +2966,13 @@ def map_infer_mask( val = val.item() PyArray_SETITEM(result, PyArray_ITER_DATA(result_it), val) - - PyArray_ITER_NEXT(arr_it) PyArray_ITER_NEXT(result_it) if convert: return maybe_convert_objects( result, convert_to_nullable_dtype=convert_to_nullable_dtype, + convert_non_numeric=True, storage=storage, ) else: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index cd68e6a43b69b..65aafba2c1cde 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1718,12 +1718,10 @@ def map_array( arr = cast("ExtensionArray", arr) arr_dtype = arr.dtype.__repr__() if "pyarrow" in arr_dtype: - if any( - time_type in arr_dtype for time_type in ["date", "time", "duration"] - ): + if any(time_type in arr_dtype for time_type in ["duration"]): values = arr.astype(object, copy=False) else: - values = arr._pa_array.to_numpy() + values = np.asarray(arr) storage = "pyarrow" else: values = np.asarray(arr) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 
2ca181b2582af..1d0bb5c080015 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -284,7 +284,7 @@ def test_map(self, data_missing, na_action): if data_missing.dtype.kind in "mM": result = data_missing.map(lambda x: x, na_action=na_action) expected = data_missing - tm.assert_extension_array_equal(result, expected, check_dtype=False) + tm.assert_extension_array_equal(result, expected) else: result = data_missing.map(lambda x: x, na_action=na_action) expected = data_missing From 77965c3d19d882deb90e4db12bebde5d9c029ee9 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 25 Jul 2024 20:39:37 +0000 Subject: [PATCH 117/163] Manage timedelta, datetimetz and date --- pandas/_libs/lib.pyx | 49 +++++++++++++++++++++++++++++---------- pandas/core/algorithms.py | 8 +++---- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a8039261d6cfa..2ab2540af4952 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -96,6 +96,9 @@ from pandas._libs.missing cimport ( is_null_datetime64, is_null_timedelta64, ) +from pandas._libs.tslibs.timestamps import Timestamp +from pandas._libs.tslibs.timedeltas import Timedelta + from pandas._libs.tslibs.conversion cimport convert_to_tsobject from pandas._libs.tslibs.nattype cimport ( NPY_NAT, @@ -2609,7 +2612,10 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break - elif PyDate_Check(val): + elif ( + PyDate_Check(val) + or (pa is not None and isinstance(val, (pa.Date32Scalar, pa.Date64Scalar))) + ): if convert_non_numeric: seen.date_ = True break @@ -2662,12 +2668,16 @@ def maybe_convert_objects(ndarray[object] objects, if storage == "pyarrow": from pandas.core.dtypes.dtypes import ArrowDtype + datetime64_array = None if isinstance(val, datetime): objects[mask] = None + datetime64_array = objects.astype(Timestamp) else: objects[mask] = np.datetime64("NaT") - datetime64_array = objects.astype(val.dtype) - pa_array = pa.array(datetime64_array) + datetime64_array = objects.astype(val.dtype) + pa_array = pa.array(datetime64_array).cast( + pa.timestamp(val.resolution.unit, val.tzinfo) + ) dtype = ArrowDtype(pa_array.type) return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) @@ -2721,17 +2731,32 @@ def maybe_convert_objects(ndarray[object] objects, return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) elif seen.timedelta_: - if is_timedelta_or_timedelta64_array(objects): - from pandas import TimedeltaIndex + if storage == "pyarrow": + from pandas.core.dtypes.dtypes import ArrowDtype - try: - tdi = TimedeltaIndex(objects) - except OutOfBoundsTimedelta: - pass + timedelta64_array = None + if isinstance(val, timedelta): + objects[mask] = None + timedelta64_array = objects.astype(Timedelta) else: - # unbox to ndarray[timedelta64[ns]] - return tdi._data._ndarray - seen.object_ = True + objects[mask] = np.timedelta64("NaT") + timedelta64_array = objects.astype(val.dtype) + pa_array = pa.array(timedelta64_array) + + dtype = ArrowDtype(pa_array.type) + return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) + else: + if is_timedelta_or_timedelta64_array(objects): + from pandas import TimedeltaIndex + + try: + tdi = TimedeltaIndex(objects) + except OutOfBoundsTimedelta: + pass + else: + # unbox to ndarray[timedelta64[ns]] + return tdi._data._ndarray + seen.object_ = True elif seen.period_: if is_period_array(objects): diff --git 
a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 65aafba2c1cde..36ddb99bea031 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1718,13 +1718,11 @@ def map_array( arr = cast("ExtensionArray", arr) arr_dtype = arr.dtype.__repr__() if "pyarrow" in arr_dtype: - if any(time_type in arr_dtype for time_type in ["duration"]): - values = arr.astype(object, copy=False) + storage = "pyarrow" + if "date" in arr_dtype: + values = np.fromiter(arr._pa_array, dtype='O') else: values = np.asarray(arr) - storage = "pyarrow" - else: - values = np.asarray(arr) if arr._hasna: na_value = arr.dtype.na_value else: From a01b55e7bc9ec0f1ee541546b971a56318fe0d0a Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 25 Jul 2024 20:57:46 +0000 Subject: [PATCH 118/163] pylint fix --- pandas/_libs/lib.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 2ab2540af4952..0f2eea517de87 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -96,8 +96,9 @@ from pandas._libs.missing cimport ( is_null_datetime64, is_null_timedelta64, ) -from pandas._libs.tslibs.timestamps import Timestamp + from pandas._libs.tslibs.timedeltas import Timedelta +from pandas._libs.tslibs.timestamps import Timestamp from pandas._libs.tslibs.conversion cimport convert_to_tsobject from pandas._libs.tslibs.nattype cimport ( From f5efabca6a376270f52cce0ac87f32c4858a5e39 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 25 Jul 2024 21:04:14 +0000 Subject: [PATCH 119/163] Code simplification --- pandas/tests/series/methods/test_map.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index fd2b49c7798b0..7e4c5ad647c84 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -297,7 +297,6 @@ def test_map_with_pd_na_input(ser): Series([pd.NA, 11.0], dtype="Float64"), Series([pd.NA, True], dtype="boolean"), Series([pd.NA, "AAA"], dtype="string"), - Series([pd.NA, "AAA"], dtype="string[pyarrow]"), ], ) def test_map_with_pd_na_output(ser): From d8c021918199087a3b5d43007afbdcb4d7f68b92 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 25 Jul 2024 21:15:27 +0000 Subject: [PATCH 120/163] Correct values initialization problem --- pandas/core/algorithms.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 36ddb99bea031..4c199d8f604b6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1720,9 +1720,11 @@ def map_array( if "pyarrow" in arr_dtype: storage = "pyarrow" if "date" in arr_dtype: - values = np.fromiter(arr._pa_array, dtype='O') + values = np.fromiter(arr._pa_array, dtype="O") else: values = np.asarray(arr) + else: + values = np.asarray(arr) if arr._hasna: na_value = arr.dtype.na_value else: From a89373c711043a06f29d7b522526085fc43b3481 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 26 Jul 2024 17:52:56 +0000 Subject: [PATCH 121/163] Manage pyarrow and python storage --- pandas/_libs/lib.pyx | 22 ++++++++++++++++++---- pandas/core/algorithms.py | 5 ++++- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0f2eea517de87..314f49b89d6ca 100644 --- 
a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2460,7 +2460,8 @@ def maybe_convert_objects(ndarray[object] objects, bint convert_numeric=True, # NB: different default! bint convert_to_nullable_dtype=False, bint convert_non_numeric=False, - object dtype_if_all_nat=None) -> "ArrayLike": + object dtype_if_all_nat=None, + str storage=None) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2483,6 +2484,8 @@ def maybe_convert_objects(ndarray[object] objects, Whether to convert datetime, timedelta, period, interval types. dtype_if_all_nat : np.dtype, ExtensionDtype, or None, default None Dtype to cast to if we have all-NaT. + storage : {None, "python", "pyarrow", "pyarrow_numpy"}, default None + Backend storage Returns ------- @@ -2775,10 +2778,16 @@ def maybe_convert_objects(ndarray[object] objects, dtype = StringDtype(na_value=np.nan) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - elif convert_to_nullable_dtype and is_string_array(objects, skipna=True): + elif ( + (convert_to_nullable_dtype and is_string_array(objects, skipna=True)) + or storage == "python" + ): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype() + if mask is not None and any(mask): + dtype = StringDtype(storage=storage, na_value=objects[mask][0]) + else: + dtype = StringDtype(storage=storage) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True @@ -3016,6 +3025,7 @@ def map_infer( const uint8_t[:] mask=None, object na_value=None, bint convert_to_nullable_dtype=False, + str storage=None, ) -> "ArrayLike": """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -3034,6 +3044,8 @@ def map_infer( convert_to_nullable_dtype : bool, default False If an array-like object contains only integer or boolean values (and NaN) is encountered, whether to convert and return an Boolean/IntegerArray. + storage : {"pyarrow", "pyarrow_numpy"}, default "pyarrow_numpy" + Backend storage Returns ------- @@ -3064,7 +3076,9 @@ def map_infer( if convert: return maybe_convert_objects( result, - convert_to_nullable_dtype=convert_to_nullable_dtype + convert_to_nullable_dtype=convert_to_nullable_dtype, + convert_non_numeric=True, + storage=storage, ) else: return result diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4c199d8f604b6..2642195be53d8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1717,7 +1717,10 @@ def map_array( elif isinstance(arr.dtype, ExtensionDtype): arr = cast("ExtensionArray", arr) arr_dtype = arr.dtype.__repr__() - if "pyarrow" in arr_dtype: + if "python" in arr_dtype: + storage = "python" + values = np.asarray(arr) + elif "pyarrow" in arr_dtype: storage = "pyarrow" if "date" in arr_dtype: values = np.fromiter(arr._pa_array, dtype="O") From 89b872fb42eb81362669315a2c1112a4ed0cd15c Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 28 Jul 2024 05:54:04 +0000 Subject: [PATCH 122/163] Manage pyarrow and python storage in map dict like --- pandas/_libs/lib.pyx | 23 +++++------- pandas/core/algorithms.py | 79 +++++++++++++++++++++++++-------------- 2 files changed, 61 insertions(+), 41 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 314f49b89d6ca..b037a655bf4e7 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2484,7 +2484,7 @@ def maybe_convert_objects(ndarray[object] objects, Whether to convert datetime, timedelta, period, interval types. 
dtype_if_all_nat : np.dtype, ExtensionDtype, or None, default None Dtype to cast to if we have all-NaT. - storage : {None, "python", "pyarrow", "pyarrow_numpy"}, default None + storage : {"python", "pyarrow", "pyarrow_numpy"}, default "python" Backend storage Returns @@ -2504,6 +2504,9 @@ def maybe_convert_objects(ndarray[object] objects, object val float64_t fnan = NaN + if storage is None: + storage="python" + if dtype_if_all_nat is not None: # in practice we don't expect to ever pass dtype_if_all_nat # without both convert_non_numeric, so disallow @@ -2772,22 +2775,16 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if using_string_dtype() and is_string_array(objects, skipna=True): + if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(na_value=np.nan) + dtype = StringDtype(storage="pyarrow_numpy") return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - elif ( - (convert_to_nullable_dtype and is_string_array(objects, skipna=True)) - or storage == "python" - ): + elif storage == "pyarrow" or storage == "python": from pandas.core.arrays.string_ import StringDtype - if mask is not None and any(mask): - dtype = StringDtype(storage=storage, na_value=objects[mask][0]) - else: - dtype = StringDtype(storage=storage) + dtype = StringDtype(storage=storage) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True @@ -2970,7 +2967,7 @@ def map_infer_mask( input value is used. dtype : numpy.dtype The numpy dtype to use for the result ndarray. - storage : {"pyarrow", "pyarrow_numpy"}, default "pyarrow_numpy" + storage : {None, "python", "pyarrow", "pyarrow_numpy"}, default None Backend storage Returns @@ -3044,7 +3041,7 @@ def map_infer( convert_to_nullable_dtype : bool, default False If an array-like object contains only integer or boolean values (and NaN) is encountered, whether to convert and return an Boolean/IntegerArray. 
- storage : {"pyarrow", "pyarrow_numpy"}, default "pyarrow_numpy" + storage : {None, "python", "pyarrow", "pyarrow_numpy"}, default None Backend storage Returns diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2642195be53d8..a5d77770367ce 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1690,7 +1690,10 @@ def map_array( else: mapper = Series(mapper, dtype=np.float64) else: - mapper = Series(mapper) + if arr.dtype in ("string[pyarrow]", "string[python]"): + mapper = Series(mapper, dtype=arr.dtype) + else: + mapper = Series(mapper) if isinstance(mapper, ABCSeries): if na_action == "ignore": @@ -1706,33 +1709,7 @@ def map_array( if not len(arr): return arr.copy() - na_value = np.nan - mask = isna(arr) - storage = None - if isinstance(arr.dtype, BaseMaskedDtype): - arr = cast("BaseMaskedArray", arr) - values = arr._data - if arr._hasna: - na_value = arr.dtype.na_value - elif isinstance(arr.dtype, ExtensionDtype): - arr = cast("ExtensionArray", arr) - arr_dtype = arr.dtype.__repr__() - if "python" in arr_dtype: - storage = "python" - values = np.asarray(arr) - elif "pyarrow" in arr_dtype: - storage = "pyarrow" - if "date" in arr_dtype: - values = np.fromiter(arr._pa_array, dtype="O") - else: - values = np.asarray(arr) - else: - values = np.asarray(arr) - if arr._hasna: - na_value = arr.dtype.na_value - else: - # we must convert to python types - values = arr.astype(object, copy=False) + mask, na_value, storage, values = _build_map_infer_methods_params(arr) if na_action is None: return lib.map_infer( @@ -1752,3 +1729,49 @@ def map_array( convert_to_nullable_dtype=na_value is NA, storage=storage, ) + + +def _build_map_infer_methods_params(arr: ArrayLike): + """ + Process lib.map_infer and lib.map_infer_mask parameters from an array `arr` + + Parameters + ---------- + arr + + Returns + ------- + mask : np.ndarray[bool] + na_value : object + A value in `values` to consider missing. 
+ storage : {"python", "pyarrow", "pyarrow_numpy"}, default "python" + Backend storage + values : np.ndarray + Values to be processed by lib.map_infer and lib.map_infer_mask + + """ + na_value = np.nan + mask = isna(arr) + storage = "python" + if isinstance(arr.dtype, BaseMaskedDtype): + arr = cast("BaseMaskedArray", arr) + values = arr._data + if arr._hasna: + na_value = arr.dtype.na_value + + elif isinstance(arr.dtype, ExtensionDtype): + arr = cast("ExtensionArray", arr) + arr_dtype = arr.dtype.__repr__() + if "pyarrow" in arr_dtype and "date" in arr_dtype: + values = np.fromiter(arr._pa_array, dtype="O") + else: + values = np.asarray(arr) + if "pyarrow" in arr_dtype: + storage = "pyarrow" + if arr._hasna: + na_value = arr.dtype.na_value + + else: + # we must convert to python types + values = arr.astype(object, copy=False) + return mask, na_value, storage, values From d9f9319c6d7120db0130801a0795404ee964c414 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 28 Jul 2024 07:19:08 +0000 Subject: [PATCH 123/163] Correct wrong default storage type --- pandas/core/algorithms.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index a5d77770367ce..1a1eb12d86a6a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1752,7 +1752,7 @@ def _build_map_infer_methods_params(arr: ArrayLike): """ na_value = np.nan mask = isna(arr) - storage = "python" + storage = None if isinstance(arr.dtype, BaseMaskedDtype): arr = cast("BaseMaskedArray", arr) values = arr._data @@ -1768,6 +1768,8 @@ def _build_map_infer_methods_params(arr: ArrayLike): values = np.asarray(arr) if "pyarrow" in arr_dtype: storage = "pyarrow" + if "python" in arr_dtype: + storage = "python" if arr._hasna: na_value = arr.dtype.na_value From 038cfb80a3bc528cd895e7127861559c95d1f5e0 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 1 Aug 2024 07:39:37 +0000 Subject: [PATCH 124/163] Add convert_non_numeric as map_infer_mask parameter --- pandas/_libs/lib.pyi | 2 ++ pandas/_libs/lib.pyx | 5 ++++- pandas/core/algorithms.py | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 27a40c62cdecb..eaf56c5ab652d 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -190,6 +190,7 @@ def map_infer_mask( na_value: Any = ..., dtype: np.dtype = ..., convert_to_nullable_dtype: bool = ..., + convert_non_numeric: bool = ..., storage: str | None = ..., ) -> np.ndarray: ... @overload @@ -202,6 +203,7 @@ def map_infer_mask( na_value: Any = ..., dtype: np.dtype = ..., convert_to_nullable_dtype: bool = ..., + convert_non_numeric: bool = ..., storage: str | None = ..., ) -> ArrayLike: ... def indices_fast( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index b037a655bf4e7..994fdf84b51db 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2948,6 +2948,7 @@ def map_infer_mask( bint convert=True, object na_value=no_default, bint convert_to_nullable_dtype=False, + convert_non_numeric=False, cnp.dtype dtype=np.dtype(object), str storage=None, ) -> "ArrayLike": @@ -2965,6 +2966,8 @@ def map_infer_mask( na_value : Any, optional The result value to use for masked values. By default, the input value is used. + convert_non_numeric : bool, default False + Whether to convert datetime, timedelta, period, interval types. 
dtype : numpy.dtype The numpy dtype to use for the result ndarray. storage : {None, "python", "pyarrow", "pyarrow_numpy"}, default None @@ -3004,7 +3007,7 @@ def map_infer_mask( return maybe_convert_objects( result, convert_to_nullable_dtype=convert_to_nullable_dtype, - convert_non_numeric=True, + convert_non_numeric=convert_non_numeric, storage=storage, ) else: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1a1eb12d86a6a..9c9d851830bab 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1727,6 +1727,7 @@ def map_array( mask=mask, na_value=na_value, convert_to_nullable_dtype=na_value is NA, + convert_non_numeric=True, storage=storage, ) From a885b7a40d9e8cc9ee7e39c4ea1e2122a059d29f Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 2 Aug 2024 07:57:26 +0000 Subject: [PATCH 125/163] pyarrow data are sent to map_infer as iterator --- pandas/core/algorithms.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9c9d851830bab..3a0ddf2a0b944 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1763,12 +1763,11 @@ def _build_map_infer_methods_params(arr: ArrayLike): elif isinstance(arr.dtype, ExtensionDtype): arr = cast("ExtensionArray", arr) arr_dtype = arr.dtype.__repr__() - if "pyarrow" in arr_dtype and "date" in arr_dtype: + if "pyarrow" in arr_dtype: + storage = "pyarrow" values = np.fromiter(arr._pa_array, dtype="O") else: values = np.asarray(arr) - if "pyarrow" in arr_dtype: - storage = "pyarrow" if "python" in arr_dtype: storage = "python" if arr._hasna: From d344841216d7dd417f53d17c3fd3cbb955cf1f66 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 2 Aug 2024 08:00:19 +0000 Subject: [PATCH 126/163] Add method _maybe_convert_pyarrow_objects --- pandas/_libs/lib.pyx | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 994fdf84b51db..05438daeb883e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2451,6 +2451,20 @@ def maybe_convert_numeric( return (ints, None) +@cython.boundscheck(False) +@cython.wraparound(False) +def _maybe_convert_pyarrow_objects( + ndarray[object] objects, + ndarray[uint8_t] mask, + Seen seen) -> "ArrayLike": + from pandas.core.dtypes.dtypes import ArrowDtype + + objects[mask] = None + pa_array = pa.array(objects) + dtype = ArrowDtype(pa_array.type) + return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) + + @cython.boundscheck(False) @cython.wraparound(False) def maybe_convert_objects(ndarray[object] objects, @@ -2669,6 +2683,8 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break + if storage == "pyarrow": + return _maybe_convert_pyarrow_objects(objects, mask, seen) # we try to coerce datetime w/tz but must all have the same tz if seen.datetimetz_: From 0b56b9c87ae6f6bb0f2272400305c53540e02b09 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 2 Aug 2024 08:00:50 +0000 Subject: [PATCH 127/163] Remove check_dtype --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 1d0bb5c080015..5dad77b1a10f5 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -288,7 +288,7 
@@ def test_map(self, data_missing, na_action): else: result = data_missing.map(lambda x: x, na_action=na_action) expected = data_missing - tm.assert_extension_array_equal(result, expected, check_dtype=False) + tm.assert_extension_array_equal(result, expected) def test_astype_str(self, data, request): pa_dtype = data.dtype.pyarrow_dtype From 93bb8d7eda87c21aeb34c8766f86e1d3c1a7f37f Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 2 Aug 2024 08:01:36 +0000 Subject: [PATCH 128/163] Code simplification --- pandas/core/arrays/arrow/array.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d5f0bbbd85891..75715240c9bd0 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -61,7 +61,6 @@ from pandas.core.arrays.masked import BaseMaskedArray from pandas.core.arrays.string_ import StringDtype import pandas.core.common as com -from pandas.core.construction import array as pd_array from pandas.core.indexers import ( check_array_indexer, unpack_tuple_and_ellipses, @@ -1431,11 +1430,7 @@ def to_numpy( return result def map(self, mapper, na_action: Literal["ignore"] | None = None): - result = super().map(mapper, na_action) - if isinstance(result.dtype, StringDtype): - return result - else: - return pd_array(result, dtype=result.dtype) + return super().map(mapper, na_action) @doc(ExtensionArray.duplicated) def duplicated( From bc04ec72e42ccd911da5e2f65454ab5936d41e4e Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 2 Aug 2024 08:19:03 +0000 Subject: [PATCH 129/163] Manage default storage value --- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 05438daeb883e..087f90b1f22e2 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2797,7 +2797,7 @@ def maybe_convert_objects(ndarray[object] objects, dtype = StringDtype(storage="pyarrow_numpy") return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - elif storage == "pyarrow" or storage == "python": + elif storage is None or storage == "python": from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(storage=storage) From a03357ec4a6e7f8960dfd0a16aaf6a13a67b0c12 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 2 Aug 2024 09:39:55 +0000 Subject: [PATCH 130/163] ord(x) return a TypeError if x is a pyarrow.lib.LargeStringScalar --- pandas/tests/reshape/merge/test_multi.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 7ae2fffa04205..45d36ae886d96 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -98,8 +98,14 @@ def test_left_join_multi_index(self, sort, infer_string): with option_context("future.infer_string", infer_string): icols = ["1st", "2nd", "3rd"] + def ord_func(x): + if infer_string: + # ord(x) return a TypeError if x is a pyarrow.lib.LargeStringScalar + return ord(str(x)) + return ord(x) + def bind_cols(df): - iord = lambda a: 0 if a != a else ord(a) + iord = lambda a: 0 if a != a else ord_func(a) f = lambda ts: ts.map(iord) - ord("a") return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10 From 646a85dd1162cade9f044e13d6abeba8844fb6a2 Mon Sep 17 
00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sat, 3 Aug 2024 06:42:05 +0000 Subject: [PATCH 131/163] Manage str.encode for pyarrow.lib.LargeStringScalar --- pandas/core/strings/object_array.py | 11 +++++++++-- pandas/tests/strings/test_strings.py | 11 ++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 290a28ab60ae1..9d4aed176f0fa 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -239,8 +239,15 @@ def _str_fullmatch( return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def _str_encode(self, encoding, errors: str = "strict"): - f = lambda x: x.encode(encoding, errors=errors) - return self._str_map(f, dtype=object) + def encode_func(x): + if x is str: + return x.encode(encoding=encoding, errors=errors) + else: + # Manage AttributeError: 'pyarrow.lib.LargeStringScalar' + # object has no attribute 'encode' + return str(x).encode(encoding=encoding, errors=errors) + + return self._str_map(encode_func, dtype=object) def _str_find(self, sub, start: int = 0, end=None): return self._str_find_(sub, start, end, side="left") diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 1ce46497c3c22..c8acb936e3d2c 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -561,7 +561,16 @@ def test_encode_errors_kwarg(any_string_dtype): ser.str.encode("cp1252") result = ser.str.encode("cp1252", "ignore") - expected = ser.map(lambda x: x.encode("cp1252", "ignore")) + + def encode_func(x): + if x is str: + return x.encode("cp1252", "ignore") + else: + # Manage AttributeError: 'pyarrow.lib.LargeStringScalar' + # object has no attribute 'encode' + return str(x).encode("cp1252", "ignore") + + expected = ser.map(encode_func).astype("object") tm.assert_series_equal(result, expected) From b4adcadc1a02dca4b08bb7c03e86a58e3fd14dbf Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 4 Aug 2024 07:18:29 +0000 Subject: [PATCH 132/163] Manage string convertible to nullable dtype --- pandas/_libs/lib.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 087f90b1f22e2..6771c310cf5e1 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2797,7 +2797,10 @@ def maybe_convert_objects(ndarray[object] objects, dtype = StringDtype(storage="pyarrow_numpy") return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - elif storage is None or storage == "python": + elif ( + (convert_to_nullable_dtype and is_string_array(objects, skipna=True)) + or storage == "python" + ): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(storage=storage) From efc2600014c3cbb3c1e7c98b43805044272c204f Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 5 Aug 2024 07:35:06 +0000 Subject: [PATCH 133/163] Manage Based masked dtype --- pandas/_libs/lib.pyx | 19 ++++++++++++++++++- pandas/tests/extension/test_masked.py | 2 +- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 6771c310cf5e1..acbee71af4ba8 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2465,6 +2465,20 @@ def _maybe_convert_pyarrow_objects( return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) 
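# Illustrative sketch (hypothetical, not part of the surrounding diff): the
# helper added just below, together with the convert_to_nullable_dtype branch,
# is meant to make maybe_convert_objects hand back a masked array instead of an
# object-dtype ndarray. Only the public keyword already exercised by the tests
# later in this series is assumed here:
import numpy as np
import pandas as pd
from pandas._libs import lib

objs = np.array([True, False, None], dtype=object)
result = lib.maybe_convert_objects(objs, convert_to_nullable_dtype=True)
# Expected per the tests touched in this series: BooleanArray([True, False, <NA>]),
# rather than an object-dtype ndarray.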
+@cython.boundscheck(False) +@cython.wraparound(False) +def _maybe_convert_based_masked( + ndarray[object] objects, + ndarray[uint8_t] mask, + object type) -> "ArrayLike": + from pandas.core.dtypes.dtypes import BaseMaskedDtype + + from pandas.core.construction import array as pd_array + + dtype = BaseMaskedDtype.from_numpy_dtype(np.dtype(type)) + return pd_array(objects, dtype=dtype) + + @cython.boundscheck(False) @cython.wraparound(False) def maybe_convert_objects(ndarray[object] objects, @@ -2683,7 +2697,10 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break - if storage == "pyarrow": + + if type(val) is not object and convert_to_nullable_dtype: + return _maybe_convert_based_masked(objects, mask, type(val)) + elif storage == "pyarrow": return _maybe_convert_pyarrow_objects(objects, mask, seen) # we try to coerce datetime w/tz but must all have the same tz diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index abf73333650a5..6f76ca2cabf0b 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -172,7 +172,7 @@ class TestMaskedArrays(base.ExtensionTests): def test_map(self, data_missing, na_action): result = data_missing.map(lambda x: x, na_action=na_action) expected = data_missing - tm.assert_extension_array_equal(result, expected, check_dtype=False) + tm.assert_extension_array_equal(result, expected) def test_map_na_action_ignore(self, data_missing_for_sorting): zero = data_missing_for_sorting[2] From d4b5396a6c9c4fd5281e5f9a2a420a32b942d438 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 5 Aug 2024 07:43:12 +0000 Subject: [PATCH 134/163] Code clean up --- pandas/core/arrays/masked.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index d3ee05c6b4611..483ddcdccf274 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1319,8 +1319,7 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) def map(self, mapper, na_action: Literal["ignore"] | None = None): - result = map_array(self, mapper, na_action=na_action) - return pd_array(result, dtype=result.dtype) + return map_array(self, mapper, na_action=na_action) @overload def any( From 5c1c7266fdb56eed71f7541eaaf8cb6a419c2982 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 7 Aug 2024 13:33:17 +0000 Subject: [PATCH 135/163] Code simplification --- pandas/tests/extension/base/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 6ec847536647e..46914bafae599 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -101,7 +101,7 @@ def test_apply_simple_series(self, data): def test_map(self, data_missing, na_action): result = data_missing.map(lambda x: x, na_action=na_action) expected = data_missing - tm.assert_extension_array_equal(result, expected, check_dtype=False) + tm.assert_extension_array_equal(result, expected) def test_argsort(self, data_for_sorting): result = pd.Series(data_for_sorting).argsort() From 7c6fdb27d580a069bd11ea1ada4b3e1e8692d3c7 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 7 Aug 2024 
13:38:19 +0000 Subject: [PATCH 136/163] Manage pyarrow string --- pandas/_libs/lib.pyx | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index acbee71af4ba8..3108997ecc964 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2453,29 +2453,37 @@ def maybe_convert_numeric( @cython.boundscheck(False) @cython.wraparound(False) -def _maybe_convert_pyarrow_objects( - ndarray[object] objects, - ndarray[uint8_t] mask, - Seen seen) -> "ArrayLike": +def _convert_to_pyarrow( + ndarray[object] objects, + ndarray[uint8_t] mask) -> "ArrayLike": from pandas.core.dtypes.dtypes import ArrowDtype + from pandas.core.arrays.string_ import StringDtype + + na_value = None + if mask is not None and any(mask): + na_value = objects[mask][0] + objects[mask] = None pa_array = pa.array(objects) - dtype = ArrowDtype(pa_array.type) + + if pa.types.is_large_string(pa_array.type): + dtype = StringDtype(storage="pyarrow", na_value=na_value) + else: + dtype = ArrowDtype(pa_array.type) return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) @cython.boundscheck(False) @cython.wraparound(False) -def _maybe_convert_based_masked( - ndarray[object] objects, - ndarray[uint8_t] mask, - object type) -> "ArrayLike": +def _convert_to_based_masked( + ndarray[object] objects, + object numpy_dtype) -> "ArrayLike": from pandas.core.dtypes.dtypes import BaseMaskedDtype from pandas.core.construction import array as pd_array - dtype = BaseMaskedDtype.from_numpy_dtype(np.dtype(type)) + dtype = BaseMaskedDtype.from_numpy_dtype(numpy_dtype) return pd_array(objects, dtype=dtype) @@ -2552,6 +2560,7 @@ def maybe_convert_objects(ndarray[object] objects, uints = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_UINT64, 0) bools = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_UINT8, 0) mask = np.full(n, False) + val = None for i in range(n): val = objects[i] @@ -2698,10 +2707,11 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True break - if type(val) is not object and convert_to_nullable_dtype: - return _maybe_convert_based_masked(objects, mask, type(val)) + numpy_dtype = np.dtype(type(val)) + if numpy_dtype.kind in "biuf" and convert_to_nullable_dtype: + return _convert_to_based_masked(objects, numpy_dtype) elif storage == "pyarrow": - return _maybe_convert_pyarrow_objects(objects, mask, seen) + return _convert_to_pyarrow(objects, mask) # we try to coerce datetime w/tz but must all have the same tz if seen.datetimetz_: From 8f18e41ee381d647a681e9cf359a1087bfd6e9ad Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 9 Aug 2024 05:45:36 +0000 Subject: [PATCH 137/163] Manage json and decimal extension array --- pandas/core/arrays/base.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0cbefe1b05c31..29d7de507c1fd 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2350,7 +2350,13 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): else: return result elif isinstance(result, np.ndarray): - return pd_array(result, result.dtype) + result_types = set(np.array([type(x) for x in result])) + + # if internal values types are compatible with self dtype + if all(issubclass(t, self.dtype.type) for t in result_types): + return pd_array(result, self.dtype) + else: + return pd_array(result, result.dtype) else: return result From 
1945ce61d275378230bae05dc621adc40fa8b40d Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 9 Aug 2024 05:46:16 +0000 Subject: [PATCH 138/163] Manage na_value in python string --- pandas/_libs/lib.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 3108997ecc964..d1c81d9e41e25 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2830,7 +2830,11 @@ def maybe_convert_objects(ndarray[object] objects, ): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage=storage) + na_value = None + if mask is not None and any(mask): + na_value = objects[mask][0] + + dtype = StringDtype(storage=storage, na_value=na_value) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True From e2f2482363ba8fa97e7c40087f558e66a9aaa790 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 9 Aug 2024 06:56:45 +0000 Subject: [PATCH 139/163] Cast to BasedMasked is limited to array containing one type --- pandas/_libs/lib.pyx | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d1c81d9e41e25..7105bb3c67347 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2561,9 +2561,13 @@ def maybe_convert_objects(ndarray[object] objects, bools = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_UINT8, 0) mask = np.full(n, False) val = None + val_types = set() for i in range(n): val = objects[i] + if not checknull(val): + val_types.add(type(val)) + if itemsize_max != -1: itemsize = get_itemsize(val) if itemsize > itemsize_max or itemsize == -1: @@ -2708,7 +2712,10 @@ def maybe_convert_objects(ndarray[object] objects, break numpy_dtype = np.dtype(type(val)) - if numpy_dtype.kind in "biuf" and convert_to_nullable_dtype: + if ( + numpy_dtype.kind in "biuf" + and len(val_types) == 1 + and convert_to_nullable_dtype): return _convert_to_based_masked(objects, numpy_dtype) elif storage == "pyarrow": return _convert_to_pyarrow(objects, mask) @@ -2830,11 +2837,10 @@ def maybe_convert_objects(ndarray[object] objects, ): from pandas.core.arrays.string_ import StringDtype - na_value = None if mask is not None and any(mask): - na_value = objects[mask][0] - - dtype = StringDtype(storage=storage, na_value=na_value) + dtype = StringDtype(storage=storage, na_value=objects[mask][0]) + else: + dtype = StringDtype(storage=storage) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True From a5d3b74b140e4da83badb02b3ee4d107f504fcdc Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 12 Aug 2024 09:39:22 +0000 Subject: [PATCH 140/163] numpy dtype is extracted from the identified types in object --- pandas/_libs/lib.pyx | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7105bb3c67347..d63a2179be502 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2460,7 +2460,7 @@ def _convert_to_pyarrow( from pandas.core.arrays.string_ import StringDtype - na_value = None + na_value = np.nan if mask is not None and any(mask): na_value = objects[mask][0] @@ -2711,13 +2711,15 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True break - numpy_dtype = np.dtype(type(val)) - if ( - numpy_dtype.kind in "biuf" - and len(val_types) == 1 - and 
convert_to_nullable_dtype): - return _convert_to_based_masked(objects, numpy_dtype) - elif storage == "pyarrow": + numpy_dtype = None + if len(val_types) == 1: + numpy_dtype = np.dtype(val_types.pop()) + if ( + numpy_dtype.kind in "biuf" + and len(val_types) == 1 + and convert_to_nullable_dtype): + return _convert_to_based_masked(objects, numpy_dtype) + if storage == "pyarrow": return _convert_to_pyarrow(objects, mask) # we try to coerce datetime w/tz but must all have the same tz From e6d9f48c80164f7107f1a7008648423feac8b76b Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 12 Aug 2024 09:39:48 +0000 Subject: [PATCH 141/163] Correct typo in exception --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2ba7c9fccbfce..421fec23497c5 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -167,7 +167,7 @@ def __init__( # a consistent NaN value (and we can use `dtype.na_value is np.nan`) na_value = np.nan elif na_value is not libmissing.NA: - raise ValueError("'na_value' must be np.nan or pd.NA, got {na_value}") + raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}") self.storage = storage self._na_value = na_value From 3447e1a4f1aa9ec30097869ddd7fc2f8eefd4d4b Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 12 Aug 2024 10:07:59 +0000 Subject: [PATCH 142/163] Correct typo in based masked array conversion --- pandas/_libs/lib.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d63a2179be502..6b1a4aa03181f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2716,7 +2716,6 @@ def maybe_convert_objects(ndarray[object] objects, numpy_dtype = np.dtype(val_types.pop()) if ( numpy_dtype.kind in "biuf" - and len(val_types) == 1 and convert_to_nullable_dtype): return _convert_to_based_masked(objects, numpy_dtype) if storage == "pyarrow": From 6f0beb6826c579d418d0dea502971b37c4662759 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 12 Aug 2024 12:14:59 +0000 Subject: [PATCH 143/163] Remove check_dtype filter for tests --- pandas/tests/extension/test_masked.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 6f76ca2cabf0b..672c1ec7a194d 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -181,7 +181,7 @@ def test_map_na_action_ignore(self, data_missing_for_sorting): expected = type(data_missing_for_sorting)._from_sequence( [zero, na_value, zero], dtype=data_missing_for_sorting.dtype ) - tm.assert_extension_array_equal(result, expected, check_dtype=False) + tm.assert_extension_array_equal(result, expected) def _get_expected_exception(self, op_name, obj, other): try: From d47c9b630dbfae3f3362fda63bacae60f81d7f3d Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 19 Aug 2024 07:30:39 +0000 Subject: [PATCH 144/163] Resolve merge --- pandas/_libs/lib.pyx | 245 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 207 insertions(+), 38 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index fb724591df93d..607e1e99e1022 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -96,6 +96,10 @@ from 
pandas._libs.missing cimport ( is_null_datetime64, is_null_timedelta64, ) + +from pandas._libs.tslibs.timedeltas import Timedelta +from pandas._libs.tslibs.timestamps import Timestamp + from pandas._libs.tslibs.conversion cimport convert_to_tsobject from pandas._libs.tslibs.nattype cimport ( NPY_NAT, @@ -1300,6 +1304,7 @@ cdef class Seen: bint object_ # seen_object bint complex_ # seen_complex bint datetime_ # seen_datetime + bint date_ # seen_date bint coerce_numeric # coerce data to numeric bint timedelta_ # seen_timedelta bint datetimetz_ # seen_datetimetz @@ -1328,6 +1333,7 @@ cdef class Seen: self.object_ = False self.complex_ = False self.datetime_ = False + self.date_ = False self.timedelta_ = False self.datetimetz_ = False self.period_ = False @@ -2445,6 +2451,42 @@ def maybe_convert_numeric( return (ints, None) +@cython.boundscheck(False) +@cython.wraparound(False) +def _convert_to_pyarrow( + ndarray[object] objects, + ndarray[uint8_t] mask) -> "ArrayLike": + from pandas.core.dtypes.dtypes import ArrowDtype + + from pandas.core.arrays.string_ import StringDtype + + na_value = np.nan + if mask is not None and any(mask): + na_value = objects[mask][0] + + objects[mask] = None + pa_array = pa.array(objects) + + if pa.types.is_large_string(pa_array.type): + dtype = StringDtype(storage="pyarrow", na_value=na_value) + else: + dtype = ArrowDtype(pa_array.type) + return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) + + +@cython.boundscheck(False) +@cython.wraparound(False) +def _convert_to_based_masked( + ndarray[object] objects, + object numpy_dtype) -> "ArrayLike": + from pandas.core.dtypes.dtypes import BaseMaskedDtype + + from pandas.core.construction import array as pd_array + + dtype = BaseMaskedDtype.from_numpy_dtype(numpy_dtype) + return pd_array(objects, dtype=dtype) + + @cython.boundscheck(False) @cython.wraparound(False) def maybe_convert_objects(ndarray[object] objects, @@ -2454,7 +2496,8 @@ def maybe_convert_objects(ndarray[object] objects, bint convert_numeric=True, # NB: different default! 
bint convert_to_nullable_dtype=False, bint convert_non_numeric=False, - object dtype_if_all_nat=None) -> "ArrayLike": + object dtype_if_all_nat=None, + str storage=None) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2514,9 +2557,14 @@ def maybe_convert_objects(ndarray[object] objects, uints = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_UINT64, 0) bools = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_UINT8, 0) mask = np.full(n, False) + val = None + val_types = set() for i in range(n): val = objects[i] + if not checknull(val): + val_types.add(type(val)) + if itemsize_max != -1: itemsize = get_itemsize(val) if itemsize > itemsize_max or itemsize == -1: @@ -2609,6 +2657,16 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break + elif ( + PyDate_Check(val) + or (pa is not None and isinstance(val, (pa.Date32Scalar, pa.Date64Scalar))) + ): + if convert_non_numeric: + seen.date_ = True + break + else: + seen.object_ = True + break elif is_period_object(val): if convert_non_numeric: seen.period_ = True @@ -2650,23 +2708,62 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True break + numpy_dtype = None + if len(val_types) == 1: + numpy_dtype = np.dtype(val_types.pop()) + if ( + numpy_dtype.kind in "biuf" + and convert_to_nullable_dtype): + return _convert_to_based_masked(objects, numpy_dtype) + if storage == "pyarrow": + return _convert_to_pyarrow(objects, mask) + # we try to coerce datetime w/tz but must all have the same tz if seen.datetimetz_: - if is_datetime_with_singletz_array(objects): - from pandas import DatetimeIndex + if storage == "pyarrow": + from pandas.core.dtypes.dtypes import ArrowDtype - try: - dti = DatetimeIndex(objects) - except OutOfBoundsDatetime: - # e.g. test_to_datetime_cache_coerce_50_lines_outofbounds - pass + datetime64_array = None + if isinstance(val, datetime): + objects[mask] = None + datetime64_array = objects.astype(Timestamp) else: - # unbox to DatetimeArray - return dti._data - seen.object_ = True + objects[mask] = np.datetime64("NaT") + datetime64_array = objects.astype(val.dtype) + pa_array = pa.array(datetime64_array).cast( + pa.timestamp(val.resolution.unit, val.tzinfo) + ) + dtype = ArrowDtype(pa_array.type) + return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) + + else: + if is_datetime_with_singletz_array(objects): + from pandas import DatetimeIndex + + try: + dti = DatetimeIndex(objects) + except OutOfBoundsDatetime: + # e.g. 
test_to_datetime_cache_coerce_50_lines_outofbounds + pass + else: + # unbox to DatetimeArray + return dti._data + seen.object_ = True elif seen.datetime_: - if is_datetime_or_datetime64_array(objects): + if storage == "pyarrow": + from pandas.core.dtypes.dtypes import ArrowDtype + + if isinstance(val, datetime): + objects[mask] = None + else: + objects[mask] = np.datetime64("NaT") + datetime64_array = objects.astype(val.dtype) + pa_array = pa.array(datetime64_array) + dtype = ArrowDtype(pa_array.type) + return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) + + elif is_datetime_or_datetime64_array(objects): from pandas import DatetimeIndex try: @@ -2678,18 +2775,43 @@ def maybe_convert_objects(ndarray[object] objects, return dti._data._ndarray seen.object_ = True + elif seen.date_: + if storage == "pyarrow": + + from pandas.core.dtypes.dtypes import ArrowDtype + + objects[mask] = None + pa_array = pa.array(objects) + dtype = ArrowDtype(pa_array.type) + return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) + elif seen.timedelta_: - if is_timedelta_or_timedelta64_array(objects): - from pandas import TimedeltaIndex + if storage == "pyarrow": + from pandas.core.dtypes.dtypes import ArrowDtype - try: - tdi = TimedeltaIndex(objects) - except OutOfBoundsTimedelta: - pass + timedelta64_array = None + if isinstance(val, timedelta): + objects[mask] = None + timedelta64_array = objects.astype(Timedelta) else: - # unbox to ndarray[timedelta64[ns]] - return tdi._data._ndarray - seen.object_ = True + objects[mask] = np.timedelta64("NaT") + timedelta64_array = objects.astype(val.dtype) + pa_array = pa.array(timedelta64_array) + + dtype = ArrowDtype(pa_array.type) + return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) + else: + if is_timedelta_or_timedelta64_array(objects): + from pandas import TimedeltaIndex + + try: + tdi = TimedeltaIndex(objects) + except OutOfBoundsTimedelta: + pass + else: + # unbox to ndarray[timedelta64[ns]] + return tdi._data._ndarray + seen.object_ = True elif seen.period_: if is_period_array(objects): @@ -2707,10 +2829,16 @@ def maybe_convert_objects(ndarray[object] objects, dtype = StringDtype(na_value=np.nan) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - elif convert_to_nullable_dtype and is_string_array(objects, skipna=True): + elif ( + (convert_to_nullable_dtype and is_string_array(objects, skipna=True)) + or storage == "python" + ): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype() + if mask is not None and any(mask): + dtype = StringDtype(storage=storage, na_value=objects[mask][0]) + else: + dtype = StringDtype(storage=storage) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True @@ -2817,12 +2945,19 @@ def maybe_convert_objects(ndarray[object] objects, # TODO: do these after the itemsize check? 
if (result is ints or result is uints) and convert_to_nullable_dtype: - from pandas.core.arrays import IntegerArray + if storage == "pyarrow": + from pandas.core.dtypes.dtypes import ArrowDtype + + dtype = ArrowDtype(pa.int64()) + return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + else: + from pandas.core.arrays import IntegerArray + + # Set these values to 1 to be deterministic, match + # IntegerDtype._internal_fill_value + result[mask] = 1 + result = IntegerArray(result, mask) - # Set these values to 1 to be deterministic, match - # IntegerDtype._internal_fill_value - result[mask] = 1 - result = IntegerArray(result, mask) elif result is floats and convert_to_nullable_dtype: from pandas.core.arrays import FloatingArray @@ -2866,7 +3001,10 @@ def map_infer_mask( *, bint convert=True, object na_value=no_default, - cnp.dtype dtype=np.dtype(object) + bint convert_to_nullable_dtype=False, + convert_non_numeric=False, + cnp.dtype dtype=np.dtype(object), + str storage=None, ) -> "ArrayLike": """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2882,8 +3020,12 @@ def map_infer_mask( na_value : Any, optional The result value to use for masked values. By default, the input value is used. + convert_non_numeric : bool, default False + Whether to convert datetime, timedelta, period, interval types. dtype : numpy.dtype The numpy dtype to use for the result ndarray. + storage : {None, "python", "pyarrow", "pyarrow_numpy"}, default None + Backend storage Returns ------- @@ -2896,17 +3038,16 @@ def map_infer_mask( ndarray result = np.empty(n, dtype=dtype) - flatiter arr_it = PyArray_IterNew(arr) flatiter result_it = PyArray_IterNew(result) for i in range(n): if mask[i]: if na_value is no_default: - val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it)) + val = arr[i] else: val = na_value else: - val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it)) + val = arr[i] val = f(val) if cnp.PyArray_IsZeroDim(val): @@ -2914,12 +3055,15 @@ def map_infer_mask( val = val.item() PyArray_SETITEM(result, PyArray_ITER_DATA(result_it), val) - - PyArray_ITER_NEXT(arr_it) PyArray_ITER_NEXT(result_it) if convert: - return maybe_convert_objects(result) + return maybe_convert_objects( + result, + convert_to_nullable_dtype=convert_to_nullable_dtype, + convert_non_numeric=convert_non_numeric, + storage=storage, + ) else: return result @@ -2927,7 +3071,15 @@ def map_infer_mask( @cython.boundscheck(False) @cython.wraparound(False) def map_infer( - ndarray arr, object f, *, bint convert=True, bint ignore_na=False + ndarray arr, + object f, + *, + bint convert=True, + bint ignore_na=False, + const uint8_t[:] mask=None, + object na_value=None, + bint convert_to_nullable_dtype=False, + str storage=None, ) -> "ArrayLike": """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2939,6 +3091,15 @@ def map_infer( convert : bint ignore_na : bint If True, NA values will not have f applied + mask : ndarray, optional + uint8 dtype ndarray indicating na_value to apply `f` to. + na_value : Any, optional + The input value to use for masked values. + convert_to_nullable_dtype : bool, default False + If an array-like object contains only integer or boolean values (and NaN) is + encountered, whether to convert and return an Boolean/IntegerArray. 
+ storage : {None, "python", "pyarrow", "pyarrow_numpy"}, default None + Backend storage Returns ------- @@ -2955,7 +3116,10 @@ def map_infer( if ignore_na and checknull(arr[i]): result[i] = arr[i] continue - val = f(arr[i]) + elif mask is not None and na_value is not None and mask[i]: + val = f(na_value) + else: + val = f(arr[i]) if cnp.PyArray_IsZeroDim(val): # unbox 0-dim arrays, GH#690 @@ -2964,7 +3128,12 @@ def map_infer( result[i] = val if convert: - return maybe_convert_objects(result) + return maybe_convert_objects( + result, + convert_to_nullable_dtype=convert_to_nullable_dtype, + convert_non_numeric=True, + storage=storage, + ) else: return result From a9cce25be60c83707789ed980c34e81b24a1e9be Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 19 Aug 2024 07:32:55 +0000 Subject: [PATCH 145/163] Resolve merge --- pandas/core/algorithms.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 3a0ddf2a0b944..8478cdcef7952 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1761,14 +1761,20 @@ def _build_map_infer_methods_params(arr: ArrayLike): na_value = arr.dtype.na_value elif isinstance(arr.dtype, ExtensionDtype): + from pandas.core.arrays.string_ import StringDtype + arr = cast("ExtensionArray", arr) arr_dtype = arr.dtype.__repr__() - if "pyarrow" in arr_dtype: + if ( + isinstance(arr.dtype, StringDtype) and arr.dtype.storage == "pyarrow" + ) or "pyarrow" in arr_dtype: storage = "pyarrow" values = np.fromiter(arr._pa_array, dtype="O") else: values = np.asarray(arr) - if "python" in arr_dtype: + if ( + isinstance(arr.dtype, StringDtype) and arr.dtype.storage == "python" + ) or "python" in arr_dtype: storage = "python" if arr._hasna: na_value = arr.dtype.na_value From c6733389e8f4365fbbe4c2f4a87915ff2421bbf6 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 19 Aug 2024 08:02:11 +0000 Subject: [PATCH 146/163] Resolve merge problem --- pandas/core/arrays/masked.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 483ddcdccf274..8659c21730795 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -3,7 +3,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, @@ -73,6 +72,7 @@ from pandas.core.util.hashing import hash_array if TYPE_CHECKING: + from collections.abc import Callable from collections.abc import ( Iterator, Sequence, @@ -1198,7 +1198,7 @@ def _wrap_na_result(self, *, name, axis, mask_size): mask = np.ones(mask_size, dtype=bool) float_dtyp = "float32" if self.dtype == "Float32" else "float64" - if name in ["mean", "median", "var", "std", "skew", "kurt"]: + if name in ["mean", "median", "var", "std", "skew", "kurt", "sem"]: np_dtype = float_dtyp elif name in ["min", "max"] or self.dtype.itemsize == 8: np_dtype = self.dtype.numpy_dtype.name From 43b6b5ac9d5f582acfa6cda02720f2f093efc697 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 19 Aug 2024 10:01:54 +0000 Subject: [PATCH 147/163] take into account unsigned type --- pandas/_libs/lib.pyx | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 607e1e99e1022..acfd09b2475d6 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2487,6 
+2487,21 @@ def _convert_to_based_masked( return pd_array(objects, dtype=dtype) +@cython.boundscheck(False) +@cython.wraparound(False) +def _seen_to_numpy_dtype(Seen seen): + if seen.bool_: + return np.dtype(bool) + elif seen.uint_: + return np.dtype(np.uint) + elif seen.int_ or seen.sint_: + return np.dtype(int) + elif seen.float_: + return np.dtype(float) + else: + return None + + @cython.boundscheck(False) @cython.wraparound(False) def maybe_convert_objects(ndarray[object] objects, @@ -2710,9 +2725,9 @@ def maybe_convert_objects(ndarray[object] objects, numpy_dtype = None if len(val_types) == 1: - numpy_dtype = np.dtype(val_types.pop()) + numpy_dtype = _seen_to_numpy_dtype(seen) if ( - numpy_dtype.kind in "biuf" + numpy_dtype and numpy_dtype.kind in "biuf" and convert_to_nullable_dtype): return _convert_to_based_masked(objects, numpy_dtype) if storage == "pyarrow": From 2801cc05a050dbfa505cfb2a8ffc6542bccbcb04 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 23 Aug 2024 05:50:11 +0000 Subject: [PATCH 148/163] manage native python type and numpy scalar type --- pandas/_libs/lib.pyx | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 61069d0785d42..a61597d905106 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -37,8 +37,6 @@ from cython cimport ( floating, ) -from pandas._config import using_string_dtype - from pandas._libs.missing import check_na_tuples_nonequal import_datetime() @@ -2489,8 +2487,12 @@ def _convert_to_based_masked( @cython.boundscheck(False) @cython.wraparound(False) -def _seen_to_numpy_dtype(Seen seen): - if seen.bool_: +def _seen_to_numpy_dtype(Seen seen, object scalar_type): + # Numpy scalar type + if issubclass(scalar_type, np.generic): + return np.dtype(scalar_type) + # Native python type + elif seen.bool_: return np.dtype(bool) elif seen.uint_: return np.dtype(np.uint) @@ -2723,15 +2725,16 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True break + if storage == "pyarrow": + return _convert_to_pyarrow(objects, mask) + numpy_dtype = None if len(val_types) == 1: - numpy_dtype = _seen_to_numpy_dtype(seen) + numpy_dtype = _seen_to_numpy_dtype(seen, val_types.pop()) if ( numpy_dtype and numpy_dtype.kind in "biuf" and convert_to_nullable_dtype): return _convert_to_based_masked(objects, numpy_dtype) - if storage == "pyarrow": - return _convert_to_pyarrow(objects, mask) # we try to coerce datetime w/tz but must all have the same tz if seen.datetimetz_: From 04df901802d9b0de8177d359e2840e7a243b5402 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 23 Aug 2024 07:10:07 +0000 Subject: [PATCH 149/163] correct merge problem --- pandas/_libs/lib.pyx | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a61597d905106..c196c3daab84a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -37,6 +37,8 @@ from cython cimport ( floating, ) +from pandas._config import using_string_dtype + from pandas._libs.missing import check_na_tuples_nonequal import_datetime() @@ -2841,15 +2843,18 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if convert_to_nullable_dtype and is_string_array(objects, skipna=True): + if using_string_dtype() and is_string_array(objects, skipna=True): + from pandas.core.arrays.string_ import StringDtype + + dtype = 
StringDtype(na_value=np.nan) + return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + + elif convert_to_nullable_dtype and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype() return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - elif ( - (convert_to_nullable_dtype and is_string_array(objects, skipna=True)) - or storage == "python" - ): + elif storage == "python": from pandas.core.arrays.string_ import StringDtype if mask is not None and any(mask): From 7bbc88b77b4b9c8d101dd4e09aa85974844628de Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sat, 24 Aug 2024 07:34:30 +0000 Subject: [PATCH 150/163] take into account NaT value --- pandas/_libs/lib.pyx | 2 +- pandas/tests/dtypes/test_inference.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c196c3daab84a..f40cd2c66caa0 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1790,7 +1790,7 @@ cdef class Validator: "must define is_value_typed") cdef bint is_valid_null(self, object value) except -1: - return value is None or value is C_NA or util.is_nan(value) + return value is None or value is C_NA or util.is_nan(value) or value is NaT # TODO: include decimal NA? cdef bint is_array_typed(self) except -1: diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index b1d7c701e1267..f656af7f658f3 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -945,9 +945,9 @@ def test_maybe_convert_objects_nullable_boolean(self): tm.assert_extension_array_equal(out, exp) arr = np.array([True, False, pd.NaT], dtype=object) - exp = np.array([True, False, pd.NaT], dtype=object) + exp = BooleanArray._from_sequence([True, False, pd.NaT], dtype="boolean") out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) - tm.assert_numpy_array_equal(out, exp) + tm.assert_extension_array_equal(out, exp) @pytest.mark.parametrize("val", [None, np.nan]) def test_maybe_convert_objects_nullable_boolean_na(self, val): From 3d3e4730369f412c30949f8cd6da3502ce2af5b3 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sat, 24 Aug 2024 07:47:24 +0000 Subject: [PATCH 151/163] code clean up: pyarrow management simplification in maybe_convert_objects --- pandas/_libs/lib.pyx | 75 +++++++++----------------------------------- 1 file changed, 15 insertions(+), 60 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index f40cd2c66caa0..3b37cb1a74bf6 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -97,7 +97,6 @@ from pandas._libs.missing cimport ( is_null_timedelta64, ) -from pandas._libs.tslibs.timedeltas import Timedelta from pandas._libs.tslibs.timestamps import Timestamp from pandas._libs.tslibs.conversion cimport convert_to_tsobject @@ -2771,19 +2770,7 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.datetime_: - if storage == "pyarrow": - from pandas.core.dtypes.dtypes import ArrowDtype - - if isinstance(val, datetime): - objects[mask] = None - else: - objects[mask] = np.datetime64("NaT") - datetime64_array = objects.astype(val.dtype) - pa_array = pa.array(datetime64_array) - dtype = ArrowDtype(pa_array.type) - return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) - - elif is_datetime_or_datetime64_array(objects): + if 
is_datetime_or_datetime64_array(objects): from pandas import DatetimeIndex try: @@ -2795,43 +2782,18 @@ def maybe_convert_objects(ndarray[object] objects, return dti._data._ndarray seen.object_ = True - elif seen.date_: - if storage == "pyarrow": - - from pandas.core.dtypes.dtypes import ArrowDtype - - objects[mask] = None - pa_array = pa.array(objects) - dtype = ArrowDtype(pa_array.type) - return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) - elif seen.timedelta_: - if storage == "pyarrow": - from pandas.core.dtypes.dtypes import ArrowDtype + if is_timedelta_or_timedelta64_array(objects): + from pandas import TimedeltaIndex - timedelta64_array = None - if isinstance(val, timedelta): - objects[mask] = None - timedelta64_array = objects.astype(Timedelta) + try: + tdi = TimedeltaIndex(objects) + except OutOfBoundsTimedelta: + pass else: - objects[mask] = np.timedelta64("NaT") - timedelta64_array = objects.astype(val.dtype) - pa_array = pa.array(timedelta64_array) - - dtype = ArrowDtype(pa_array.type) - return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) - else: - if is_timedelta_or_timedelta64_array(objects): - from pandas import TimedeltaIndex - - try: - tdi = TimedeltaIndex(objects) - except OutOfBoundsTimedelta: - pass - else: - # unbox to ndarray[timedelta64[ns]] - return tdi._data._ndarray - seen.object_ = True + # unbox to ndarray[timedelta64[ns]] + return tdi._data._ndarray + seen.object_ = True elif seen.period_: if is_period_array(objects): @@ -2967,19 +2929,12 @@ def maybe_convert_objects(ndarray[object] objects, # TODO: do these after the itemsize check? if (result is ints or result is uints) and convert_to_nullable_dtype: - if storage == "pyarrow": - from pandas.core.dtypes.dtypes import ArrowDtype - - dtype = ArrowDtype(pa.int64()) - return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - else: - from pandas.core.arrays import IntegerArray - - # Set these values to 1 to be deterministic, match - # IntegerDtype._internal_fill_value - result[mask] = 1 - result = IntegerArray(result, mask) + from pandas.core.arrays import IntegerArray + # Set these values to 1 to be deterministic, match + # IntegerDtype._internal_fill_value + result[mask] = 1 + result = IntegerArray(result, mask) elif result is floats and convert_to_nullable_dtype: from pandas.core.arrays import FloatingArray From 183b6e67ae1c53b3a7c99c093da6e895c7e5a0c1 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sat, 24 Aug 2024 08:39:41 +0000 Subject: [PATCH 152/163] code clean up: pyarrow management simplification in maybe_convert_objects --- pandas/_libs/lib.pyx | 52 +++++++++----------------------------------- 1 file changed, 10 insertions(+), 42 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 3b37cb1a74bf6..5896e5a452695 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -96,9 +96,6 @@ from pandas._libs.missing cimport ( is_null_datetime64, is_null_timedelta64, ) - -from pandas._libs.tslibs.timestamps import Timestamp - from pandas._libs.tslibs.conversion cimport convert_to_tsobject from pandas._libs.tslibs.nattype cimport ( NPY_NAT, @@ -1303,7 +1300,6 @@ cdef class Seen: bint object_ # seen_object bint complex_ # seen_complex bint datetime_ # seen_datetime - bint date_ # seen_date bint coerce_numeric # coerce data to numeric bint timedelta_ # seen_timedelta bint datetimetz_ # seen_datetimetz @@ -1332,7 +1328,6 @@ cdef class Seen: self.object_ = False self.complex_ = 
False self.datetime_ = False - self.date_ = False self.timedelta_ = False self.datetimetz_ = False self.period_ = False @@ -2675,16 +2670,6 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break - elif ( - PyDate_Check(val) - or (pa is not None and isinstance(val, (pa.Date32Scalar, pa.Date64Scalar))) - ): - if convert_non_numeric: - seen.date_ = True - break - else: - seen.object_ = True - break elif is_period_object(val): if convert_non_numeric: seen.period_ = True @@ -2739,35 +2724,18 @@ def maybe_convert_objects(ndarray[object] objects, # we try to coerce datetime w/tz but must all have the same tz if seen.datetimetz_: - if storage == "pyarrow": - from pandas.core.dtypes.dtypes import ArrowDtype + if is_datetime_with_singletz_array(objects): + from pandas import DatetimeIndex - datetime64_array = None - if isinstance(val, datetime): - objects[mask] = None - datetime64_array = objects.astype(Timestamp) + try: + dti = DatetimeIndex(objects) + except OutOfBoundsDatetime: + # e.g. test_to_datetime_cache_coerce_50_lines_outofbounds + pass else: - objects[mask] = np.datetime64("NaT") - datetime64_array = objects.astype(val.dtype) - pa_array = pa.array(datetime64_array).cast( - pa.timestamp(val.resolution.unit, val.tzinfo) - ) - dtype = ArrowDtype(pa_array.type) - return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype) - - else: - if is_datetime_with_singletz_array(objects): - from pandas import DatetimeIndex - - try: - dti = DatetimeIndex(objects) - except OutOfBoundsDatetime: - # e.g. test_to_datetime_cache_coerce_50_lines_outofbounds - pass - else: - # unbox to DatetimeArray - return dti._data - seen.object_ = True + # unbox to DatetimeArray + return dti._data + seen.object_ = True elif seen.datetime_: if is_datetime_or_datetime64_array(objects): From 25a9d3507d4c9cdb031f988359a843932cb09257 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sat, 31 Aug 2024 08:36:14 +0000 Subject: [PATCH 153/163] BooleanArray does not support pd.NaT --- pandas/_libs/lib.pyx | 2 +- pandas/tests/dtypes/test_inference.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5896e5a452695..e37e2739c78f4 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1784,7 +1784,7 @@ cdef class Validator: "must define is_value_typed") cdef bint is_valid_null(self, object value) except -1: - return value is None or value is C_NA or util.is_nan(value) or value is NaT + return value is None or value is C_NA or util.is_nan(value) # TODO: include decimal NA? 
cdef bint is_array_typed(self) except -1: diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 794c9cf6c60cc..76e5d33d90038 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -944,8 +944,9 @@ def test_maybe_convert_objects_nullable_boolean(self): out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) tm.assert_extension_array_equal(out, exp) - arr = np.array([True, False, pd.NaT], dtype=object) - exp = BooleanArray._from_sequence([True, False, pd.NaT], dtype="boolean") + # pd.NaT are not supported in BooleanArray, but pd.NA are supported + arr = np.array([True, False, pd.NA], dtype=object) + exp = BooleanArray._from_sequence([True, False, pd.NA], dtype="boolean") out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) tm.assert_extension_array_equal(out, exp) From c10a2445f45149a539d0876ee477b8642aeb7ddf Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sat, 31 Aug 2024 08:58:40 +0000 Subject: [PATCH 154/163] code clean up --- pandas/_libs/lib.pyx | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e37e2739c78f4..565d594eceedb 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2773,17 +2773,18 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if using_string_dtype() and is_string_array(objects, skipna=True): + if convert_to_nullable_dtype and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(na_value=np.nan) + dtype = StringDtype() return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - elif convert_to_nullable_dtype and is_string_array(objects, skipna=True): + elif using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype() + dtype = StringDtype(na_value=np.nan) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + elif storage == "python": from pandas.core.arrays.string_ import StringDtype From ebb8d2680888cce7c7337717c6c1f18382b99d21 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sat, 31 Aug 2024 09:40:09 +0000 Subject: [PATCH 155/163] code clean up: _convert_to_pyarrow simplification --- pandas/_libs/lib.pyi | 3 +++ pandas/_libs/lib.pyx | 17 ++++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index eaf56c5ab652d..48e828cf0824a 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -102,6 +102,7 @@ def maybe_convert_objects( convert_to_nullable_dtype: Literal[False] = ..., dtype_if_all_nat: DtypeObj | None = ..., storage: str | None = ..., + na_value: Any = ..., ) -> npt.NDArray[np.object_ | np.number]: ... @overload def maybe_convert_objects( @@ -114,6 +115,7 @@ def maybe_convert_objects( convert_to_nullable_dtype: Literal[True] = ..., dtype_if_all_nat: DtypeObj | None = ..., storage: str | None = ..., + na_value: Any = ..., ) -> ArrayLike: ... @overload def maybe_convert_objects( @@ -126,6 +128,7 @@ def maybe_convert_objects( convert_to_nullable_dtype: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., storage: str | None = ..., + na_value: Any = ..., ) -> ArrayLike: ... 
@overload def maybe_convert_numeric( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 565d594eceedb..d3ab2347c1a05 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2449,18 +2449,18 @@ def maybe_convert_numeric( @cython.wraparound(False) def _convert_to_pyarrow( ndarray[object] objects, - ndarray[uint8_t] mask) -> "ArrayLike": + ndarray[uint8_t] mask, + object na_value=None) -> "ArrayLike": from pandas.core.dtypes.dtypes import ArrowDtype from pandas.core.arrays.string_ import StringDtype - na_value = np.nan - if mask is not None and any(mask): - na_value = objects[mask][0] - + # pa.array does not support na_value as pd.NA, + # so we replace them by None and then restore them after objects[mask] = None pa_array = pa.array(objects) + # Pyarrow large string are StringDtype (not ArrowDtype) if pa.types.is_large_string(pa_array.type): dtype = StringDtype(storage="pyarrow", na_value=na_value) else: @@ -2510,7 +2510,8 @@ def maybe_convert_objects(ndarray[object] objects, bint convert_to_nullable_dtype=False, bint convert_non_numeric=False, object dtype_if_all_nat=None, - str storage=None) -> "ArrayLike": + str storage=None, + object na_value=None) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2712,7 +2713,7 @@ def maybe_convert_objects(ndarray[object] objects, break if storage == "pyarrow": - return _convert_to_pyarrow(objects, mask) + return _convert_to_pyarrow(objects, mask, na_value) numpy_dtype = None if len(val_types) == 1: @@ -3009,6 +3010,7 @@ def map_infer_mask( convert_to_nullable_dtype=convert_to_nullable_dtype, convert_non_numeric=convert_non_numeric, storage=storage, + na_value=na_value, ) else: return result @@ -3079,6 +3081,7 @@ def map_infer( convert_to_nullable_dtype=convert_to_nullable_dtype, convert_non_numeric=True, storage=storage, + na_value=na_value, ) else: return result From 53a885c39f27e4095a73e4ff7cafbef5080a7b79 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 1 Sep 2024 09:32:23 +0000 Subject: [PATCH 156/163] Code refacto and clean up --- pandas/_libs/lib.pyx | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d3ab2347c1a05..d3b0f49128e58 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2483,7 +2483,7 @@ def _convert_to_based_masked( @cython.boundscheck(False) @cython.wraparound(False) -def _seen_to_numpy_dtype(Seen seen, object scalar_type): +def _maybe_get_numpy_dtype(Seen seen, object scalar_type): # Numpy scalar type if issubclass(scalar_type, np.generic): return np.dtype(scalar_type) @@ -2500,6 +2500,25 @@ def _seen_to_numpy_dtype(Seen seen, object scalar_type): return None +@cython.boundscheck(False) +@cython.wraparound(False) +def _maybe_get_based_masked_scalar_numpy_dtype( + val_types, + seen, + convert_to_nullable_dtype): + # If we have no type or more than one type we cannot build a based masked array + if not val_types or len(val_types) > 1: + return None + + numpy_dtype = _maybe_get_numpy_dtype(seen, val_types.pop()) + if ( + numpy_dtype and numpy_dtype.kind in "biuf" + and convert_to_nullable_dtype): + return numpy_dtype + else: + return None + + @cython.boundscheck(False) @cython.wraparound(False) def maybe_convert_objects(ndarray[object] objects, @@ -2715,13 +2734,13 @@ def maybe_convert_objects(ndarray[object] objects, if storage == "pyarrow": return _convert_to_pyarrow(objects, mask, na_value) - numpy_dtype 
= None - if len(val_types) == 1: - numpy_dtype = _seen_to_numpy_dtype(seen, val_types.pop()) - if ( - numpy_dtype and numpy_dtype.kind in "biuf" - and convert_to_nullable_dtype): - return _convert_to_based_masked(objects, numpy_dtype) + based_masked_scalar_numpy_dtype = _maybe_get_based_masked_scalar_numpy_dtype( + val_types, + seen, + convert_to_nullable_dtype) + + if based_masked_scalar_numpy_dtype: + return _convert_to_based_masked(objects, based_masked_scalar_numpy_dtype) # we try to coerce datetime w/tz but must all have the same tz if seen.datetimetz_: @@ -2789,10 +2808,7 @@ def maybe_convert_objects(ndarray[object] objects, elif storage == "python": from pandas.core.arrays.string_ import StringDtype - if mask is not None and any(mask): - dtype = StringDtype(storage=storage, na_value=objects[mask][0]) - else: - dtype = StringDtype(storage=storage) + dtype = StringDtype(storage=storage, na_value=na_value) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True From f84cd8b85bc0264a37dc16647aba81d9636d5c01 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 9 Sep 2024 19:13:19 +0000 Subject: [PATCH 157/163] Code clean up (restore iterator in map_infer_mask --- pandas/_libs/lib.pyx | 7 +++++-- pandas/core/algorithms.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 71f9331dfc9f1..3e1fe073d1d6b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -3024,16 +3024,17 @@ def map_infer_mask( ndarray result = np.empty(n, dtype=dtype) + flatiter arr_it = PyArray_IterNew(arr) flatiter result_it = PyArray_IterNew(result) for i in range(n): if mask[i]: if na_value is no_default: - val = arr[i] + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it)) else: val = na_value else: - val = arr[i] + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it)) val = f(val) if cnp.PyArray_IsZeroDim(val): @@ -3041,6 +3042,8 @@ def map_infer_mask( val = val.item() PyArray_SETITEM(result, PyArray_ITER_DATA(result_it), val) + + PyArray_ITER_NEXT(arr_it) PyArray_ITER_NEXT(result_it) if convert: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4fd46cb326502..48389dceee0a5 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1746,7 +1746,7 @@ def _build_map_infer_methods_params(arr: ArrayLike): storage = None if isinstance(arr.dtype, BaseMaskedDtype): arr = cast("BaseMaskedArray", arr) - values = arr._data + values = np.fromiter(arr._data, dtype="O") if arr._hasna: na_value = arr.dtype.na_value From 3784b5ee02c02b20c66b87ae0b47adb4e6b2dfde Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 9 Sep 2024 19:57:22 +0000 Subject: [PATCH 158/163] Code simplification --- pandas/core/algorithms.py | 25 +++++++++---------------- pandas/tests/series/methods/test_map.py | 11 +++-------- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 48389dceee0a5..e775960615ca6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -62,7 +62,6 @@ BaseMaskedDtype, CategoricalDtype, ExtensionDtype, - IntervalDtype, NumpyEADtype, ) from pandas.core.dtypes.generic import ( @@ -1665,25 +1664,19 @@ def map_array( # possibility that they are tuples # The return value of mapping with an empty mapper is - # expected to be pd.Series(np.nan, ...). 
As np.nan is - # of dtype float64 the return value of this method should - # be float64 as well + # expected to be pd.Series(np.nan, ...) or pd.Series(NA, ...). + # As np.nan is of dtype float64 the return value of this method should + # be float64 in this case + # in the other case (NA) it should be the dtype of the original data from pandas import Series + dtype = None if len(mapper) == 0: - if ( - is_extension_array_dtype(arr.dtype) - and not isinstance(arr.dtype, IntervalDtype) - and arr.dtype.na_value is NA - ): - mapper = Series(mapper, dtype=arr.dtype) + if hasattr(arr.dtype, "na_value") and arr.dtype.na_value is NA: + dtype = arr.dtype else: - mapper = Series(mapper, dtype=np.float64) - else: - if arr.dtype in ("string[pyarrow]", "string[python]"): - mapper = Series(mapper, dtype=arr.dtype) - else: - mapper = Series(mapper) + dtype = np.float64 + mapper = Series(mapper, dtype=dtype) if isinstance(mapper, ABCSeries): if na_action == "ignore": diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 7e4c5ad647c84..34543a5c7918d 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -8,9 +8,6 @@ import numpy as np import pytest -from pandas.core.dtypes.common import is_extension_array_dtype -from pandas.core.dtypes.dtypes import IntervalDtype - import pandas as pd from pandas import ( DataFrame, @@ -234,13 +231,11 @@ def test_map_empty(request, index): s = Series(index) result = s.map({}) - if ( - is_extension_array_dtype(s.dtype) - and not isinstance(s.dtype, IntervalDtype) - and s.dtype.na_value is pd.NA - ): + # If s has a na value equal to NA, we keep the original dtype + if hasattr(s.dtype, "na_value") and s.dtype.na_value is pd.NA: na_value = s.dtype.na_value dtype = s.dtype + # Else the dtype is always float64 else: na_value = np.nan dtype = "float64" From f1fb54e647c498372255f6eb0abbb0e36aa48ffe Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 9 Sep 2024 20:18:08 +0000 Subject: [PATCH 159/163] Correct pyarrow cast explanation in comment --- pandas/core/strings/object_array.py | 4 ++-- pandas/tests/strings/test_strings.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 4cc28d45d99dc..40625901806b7 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -267,8 +267,8 @@ def encode_func(x): if x is str: return x.encode(encoding=encoding, errors=errors) else: - # Manage AttributeError: 'pyarrow.lib.LargeStringScalar' - # object has no attribute 'encode' + # If x is a 'pyarrow.lib.LargeStringScalar' it has + # no attribute 'encode' so we cast it return str(x).encode(encoding=encoding, errors=errors) return self._str_map(encode_func, dtype=object) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index c8acb936e3d2c..a3ef07efee125 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -566,8 +566,8 @@ def encode_func(x): if x is str: return x.encode("cp1252", "ignore") else: - # Manage AttributeError: 'pyarrow.lib.LargeStringScalar' - # object has no attribute 'encode' + # If x is a 'pyarrow.lib.LargeStringScalar' it has + # no attribute 'encode' so we cast it return str(x).encode("cp1252", "ignore") expected = ser.map(encode_func).astype("object") From e54dcdbb2553ac9f2da64c55d0d86a813c3787dd Mon Sep 17 00:00:00 2001 
From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 9 Sep 2024 20:44:00 +0000 Subject: [PATCH 160/163] Code simplification and new comments about scalar type interpretation --- pandas/core/arrays/base.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 788fcf62e8f6b..f3e8f03211a42 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -46,10 +46,7 @@ is_scalar, pandas_dtype, ) -from pandas.core.dtypes.dtypes import ( - ExtensionDtype, - NumpyEADtype, -) +from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCIndex, @@ -2490,16 +2487,15 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): a MultiIndex will be returned. """ result = map_array(self, mapper, na_action=na_action) - if isinstance(result, ExtensionArray): - if isinstance(self.dtype, NumpyEADtype): - return pd_array(result, dtype=NumpyEADtype(result.dtype)) - else: - return result - elif isinstance(result, np.ndarray): - result_types = set(np.array([type(x) for x in result])) - - # if internal values types are compatible with self dtype - if all(issubclass(t, self.dtype.type) for t in result_types): + if isinstance(result, np.ndarray): + # Get the scalar types + scalar_types = set(np.array([type(x) for x in result])) + + # if scalar values types are compatible with self dtype + # we use the self dtype + # For example if scalar types are dict and UserDict and self is a JSONArray, + # we use self.dtype + if all(issubclass(t, self.dtype.type) for t in scalar_types): return pd_array(result, self.dtype) else: return pd_array(result, result.dtype) From 1ab81c0277b1326990ebb271e77a66afba57884d Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 25 Sep 2024 06:54:19 +0000 Subject: [PATCH 161/163] Code simplification --- pandas/core/algorithms.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e775960615ca6..76b3bebb3f219 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -99,6 +99,7 @@ Series, ) from pandas.core.arrays import ( + ArrowExtensionArray, BaseMaskedArray, ExtensionArray, ) @@ -1734,35 +1735,33 @@ def _build_map_infer_methods_params(arr: ArrayLike): Values to be processed by lib.map_infer and lib.map_infer_mask """ - na_value = np.nan + na_value = None mask = isna(arr) storage = None + if ( + isinstance(arr.dtype, (BaseMaskedDtype, ExtensionDtype)) + and hasattr(arr, "_hasna") + and arr._hasna + ): + na_value = arr.dtype.na_value + if isinstance(arr.dtype, BaseMaskedDtype): arr = cast("BaseMaskedArray", arr) values = np.fromiter(arr._data, dtype="O") - if arr._hasna: - na_value = arr.dtype.na_value elif isinstance(arr.dtype, ExtensionDtype): - from pandas.core.arrays.string_ import StringDtype - arr = cast("ExtensionArray", arr) - arr_dtype = arr.dtype.__repr__() - if ( - isinstance(arr.dtype, StringDtype) and arr.dtype.storage == "pyarrow" - ) or "pyarrow" in arr_dtype: - storage = "pyarrow" + if hasattr(arr.dtype, "storage"): + storage = arr.dtype.storage + + if storage == "pyarrow": + arr = cast("ArrowExtensionArray", arr) values = np.fromiter(arr._pa_array, dtype="O") else: values = np.asarray(arr) - if ( - isinstance(arr.dtype, StringDtype) and arr.dtype.storage == "python" - ) or "python" in arr_dtype: - storage = 
"python" - if arr._hasna: - na_value = arr.dtype.na_value else: # we must convert to python types values = arr.astype(object, copy=False) + na_value = np.nan return mask, na_value, storage, values From c23f65ba6f267dc7a10c0a7c980708c9bf2d272c Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 11 Oct 2024 19:31:39 +0000 Subject: [PATCH 162/163] Remove unnecessary test --- pandas/core/algorithms.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 76b3bebb3f219..07c67c048dd54 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1738,10 +1738,8 @@ def _build_map_infer_methods_params(arr: ArrayLike): na_value = None mask = isna(arr) storage = None - if ( - isinstance(arr.dtype, (BaseMaskedDtype, ExtensionDtype)) - and hasattr(arr, "_hasna") - and arr._hasna + if isinstance(arr.dtype, (BaseMaskedDtype, ExtensionDtype)) and hasattr( + arr, "_hasna" ): na_value = arr.dtype.na_value From d1a8190a3eba7f58726ea7e349eb3326fe405368 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 11 Oct 2024 19:55:30 +0000 Subject: [PATCH 163/163] static check correction --- pandas/_libs/lib.pyi | 4 ++-- pandas/core/arrays/arrow/array.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 48e828cf0824a..54dd539eff881 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -76,7 +76,7 @@ def map_infer( ignore_na: bool = ..., mask: npt.NDArray[np.bool_] | None = ..., na_value: Any = ..., - convert_to_nullable_dtype: Literal[False] = ..., + convert_to_nullable_dtype: bool = ..., storage: str | None = ..., ) -> np.ndarray: ... @overload @@ -88,7 +88,7 @@ def map_infer( ignore_na: bool = ..., mask: npt.NDArray[np.bool_] | None = ..., na_value: Any = ..., - convert_to_nullable_dtype: Literal[False] = ..., + convert_to_nullable_dtype: bool = ..., storage: str | None = ..., ) -> ArrayLike: ... @overload diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 09e29e61f4fdc..e513af8d04600 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -343,6 +343,7 @@ def _from_sequence_of_strings( if isinstance(strings, ExtensionArray) and isinstance( strings.dtype, ArrowDtype ): + strings = cast("ArrowExtensionArray", strings) scalars = to_datetime(strings._pa_array, errors="raise").date else: scalars = to_datetime(strings, errors="raise").date