# Patch summary: unified diff touching three files.
#
# 1. doc/source/whatsnew/v2.3.2.rst
#    Adds a "Bug fixes" entry stating that Series.str.isdigit now recognizes
#    unicode superscript characters as digits for PyArrow-backed StringDtype
#    (issue 61466).
#
# 2. pandas/core/arrays/_arrow_string_mixins.py
#    Imports pa_version_under21p0 and makes _str_isdigit branch on it:
#      - flag true:  apply Python's str.isdigit element-wise via
#                    self._apply_elementwise, wrap the result in a boolean
#                    pa.chunked_array, and pass it to _convert_bool_result;
#      - flag false: keep the existing pc.utf8_is_digit(self._pa_array) path.
#
# 3. pandas/tests/strings/test_strings.py
#    test_isnumeric_unicode gains an "isdigit" case and a "³" sample, so every
#    expected list grows from 7 to 8 entries. For pyarrow-storage StringDtype
#    when pa_version_under21p0 is false, positions 3 and 5 ("¼" and "፸") are
#    expected to be True. The comparison against the standard-library str
#    methods now runs only for "object" dtype or python-storage StringDtype.
#
# NOTE(review): pa_version_under21p0 presumably means "installed pyarrow is
# older than 21.0"; its definition is outside this patch -- confirm in
# pandas.compat.
# NOTE(review): the hunk line breaks are collapsed in this copy of the patch;
# they must be restored before the patch can be applied.
diff --git a/doc/source/whatsnew/v2.3.2.rst b/doc/source/whatsnew/v2.3.2.rst index 7b05907bd0563..e570e7d5d6cd8 100644 --- a/doc/source/whatsnew/v2.3.2.rst +++ b/doc/source/whatsnew/v2.3.2.rst @@ -22,6 +22,8 @@ become the default string dtype in pandas 3.0. See Bug fixes ^^^^^^^^^ +- Fix :meth:`~Series.str.isdigit` to correctly recognize unicode superscript + characters as digits for :class:`StringDtype` backed by PyArrow (:issue:`61466`) - Fix :meth:`~DataFrame.to_json` with ``orient="table"`` to correctly use the "string" type in the JSON Table Schema for :class:`StringDtype` columns (:issue:`61889`) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 5da94b5987cfc..c99ee7d02a226 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -16,6 +16,7 @@ pa_version_under11p0, pa_version_under13p0, pa_version_under17p0, + pa_version_under21p0, ) if not pa_version_under10p1: @@ -268,6 +269,12 @@ def _str_isdecimal(self): return self._convert_bool_result(result) def _str_isdigit(self): + if pa_version_under21p0: + # https://github.com/pandas-dev/pandas/issues/61466 + res_list = self._apply_elementwise(str.isdigit) + return self._convert_bool_result( + pa.chunked_array(res_list, type=pa.bool_()) + ) result = pc.utf8_is_digit(self._pa_array) return self._convert_bool_result(result) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index cdb3f4ba814a2..5ac343b45c0c6 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -7,12 +7,15 @@ import numpy as np import pytest +from pandas.compat import pa_version_under21p0 + from pandas import ( NA, DataFrame, Index, MultiIndex, Series, + StringDtype, ) import pandas._testing as tm from pandas.core.strings.accessor import StringMethods @@ -240,8 +243,9 @@ def test_ismethods(method, expected, any_string_dtype): @pytest.mark.parametrize( "method, 
expected", [ - ("isnumeric", [False, True, True, False, True, True, False]), - ("isdecimal", [False, True, False, False, False, True, False]), + ("isnumeric", [False, True, True, True, False, True, True, False]), + ("isdecimal", [False, True, False, False, False, False, True, False]), + ("isdigit", [False, True, True, False, False, False, True, False]), ], ) def test_isnumeric_unicode(method, expected, any_string_dtype): @@ -250,18 +254,35 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY # 0xFF13: 3 Em 3 # noqa: RUF003 ser = Series( - ["A", "3", "¼", "★", "፸", "3", "four"], dtype=any_string_dtype # noqa: RUF001 + ["A", "3", "³", "¼", "★", "፸", "3", "four"], # noqa: RUF001 + dtype=any_string_dtype, ) expected_dtype = ( "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" ) expected = Series(expected, dtype=expected_dtype) + if ( + method == "isdigit" + and isinstance(ser.dtype, StringDtype) + and ser.dtype.storage == "pyarrow" + and not pa_version_under21p0 + ): + # known difference in behavior between python and pyarrow unicode handling + # pyarrow 21+ considers ¼ and ፸ as a digit, while python does not + expected.iloc[3] = True + expected.iloc[5] = True + result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) # compare with standard library - expected = [getattr(item, method)() for item in ser] - assert list(result) == expected + # (only for non-pyarrow storage given the above differences) + if any_string_dtype == "object" or ( + isinstance(any_string_dtype, StringDtype) + and any_string_dtype.storage == "python" + ): + expected = [getattr(item, method)() for item in ser] + assert list(result) == expected @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning")