Merge branch 'main' into api-nan-vs-na

jbrockmendel · jbrockmendel · commit b4dcfa609556 · 2025-09-25T14:09:36.000-07:00
diff --git a/doc/source/whatsnew/v2.3.2.rst b/doc/source/whatsnew/v2.3.2.rst
@@ -22,6 +22,8 @@ become the default string dtype in pandas 3.0. See
 
 Bug fixes
 ^^^^^^^^^
+- Fix :meth:`~Series.str.isdigit` to correctly recognize unicode superscript
+  characters as digits for :class:`StringDtype` backed by PyArrow (:issue:`61466`)
 - Fix :meth:`~DataFrame.to_json` with ``orient="table"`` to correctly use the
   "string" type in the JSON Table Schema for :class:`StringDtype` columns
   (:issue:`61889`)
diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py
@@ -15,6 +15,7 @@
 from pandas.compat import (
     HAS_PYARROW,
     pa_version_under17p0,
+    pa_version_under21p0,
 )
 
 if HAS_PYARROW:
@@ -267,6 +268,12 @@ def _str_isdecimal(self):
         return self._convert_bool_result(result)
 
     def _str_isdigit(self):
+        if pa_version_under21p0:
+            # https://github.com/pandas-dev/pandas/issues/61466
+            res_list = self._apply_elementwise(str.isdigit)
+            return self._convert_bool_result(
+                pa.chunked_array(res_list, type=pa.bool_())
+            )
         result = pc.utf8_is_digit(self._pa_array)
         return self._convert_bool_result(result)
 
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -3602,16 +3602,26 @@ def casefold(self):
     Series.str.isupper : Check whether all characters are uppercase.
     Series.str.istitle : Check whether all characters are titlecase.
 
-    Examples
-    --------
+    Notes
+    -----
     Similar to ``str.isdecimal`` but also includes special digits, like
     superscripted and subscripted digits in unicode.
 
+    The exact behavior of this method, i.e. which unicode characters are
+    considered as digits, depends on the backend used for string operations,
+    and there can be small differences.
+    For example, Python considers the ³ superscript character as a digit, but
+    not the ⅕ fraction character, while PyArrow considers both as digits. For
+    simple (ASCII) decimal numbers, the behavior is consistent.
+
+    Examples
+    --------
+
     >>> s3 = pd.Series(['23', '³', '⅕', ''])
     >>> s3.str.isdigit()
     0     True
-    1    False
-    2    False
+    1     True
+    2     True
     3    False
     dtype: bool
     """
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
@@ -3786,7 +3786,7 @@ def from_custom_template(
         Returns
         -------
         MyStyler : subclass of Styler
-            Has the correct ``env``,``template_html``, ``template_html_table`` and
+            Has the correct ``env``, ``template_html``, ``template_html_table`` and
             ``template_html_style`` class attributes set.
 
         See Also
diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py
@@ -8,6 +8,7 @@
 import pytest
 
 from pandas.compat import PY312
+from pandas.compat._optional import import_optional_dependency
 from pandas.errors import (
     NumExprClobberingError,
     PerformanceWarning,
@@ -53,6 +54,9 @@
     _unary_math_ops,
 )
 from pandas.core.computation.scope import DEFAULT_GLOBALS
+from pandas.util.version import Version
+
+numexpr = import_optional_dependency("numexpr", errors="ignore")
 
 
 @pytest.fixture(
@@ -320,7 +324,9 @@ def test_modulus(self, lhs, rhs, engine, parser):
     def test_floor_division(self, lhs, rhs, engine, parser):
         ex = "lhs // rhs"
 
-        if engine == "python":
+        if engine == "python" or (
+            engine == "numexpr" and Version(numexpr.__version__) >= Version("2.13.0")
+        ):
             res = pd.eval(ex, engine=engine, parser=parser)
             expected = lhs // rhs
             tm.assert_equal(res, expected)
@@ -391,7 +397,7 @@ def test_frame_invert(self, engine, parser):
 
         # int raises on numexpr
         lhs = DataFrame(np.random.default_rng(2).integers(5, size=(5, 2)))
-        if engine == "numexpr":
+        if engine == "numexpr" and Version(numexpr.__version__) < Version("2.13.0"):
             msg = "couldn't find matching opcode for 'invert"
             with pytest.raises(NotImplementedError, match=msg):
                 pd.eval(expr, engine=engine, parser=parser)
@@ -436,7 +442,7 @@ def test_series_invert(self, engine, parser):
 
         # int raises on numexpr
         lhs = Series(np.random.default_rng(2).integers(5, size=5))
-        if engine == "numexpr":
+        if engine == "numexpr" and Version(numexpr.__version__) < Version("2.13.0"):
             msg = "couldn't find matching opcode for 'invert"
             with pytest.raises(NotImplementedError, match=msg):
                 pd.eval(expr, engine=engine, parser=parser)
diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py
@@ -7,6 +7,7 @@
 import numpy as np
 import pytest
 
+from pandas.compat import pa_version_under21p0
 from pandas.errors import Pandas4Warning
 
 from pandas import (
@@ -15,6 +16,7 @@
     Index,
     MultiIndex,
     Series,
+    StringDtype,
     option_context,
 )
 import pandas._testing as tm
@@ -249,8 +251,9 @@ def test_ismethods(method, expected, any_string_dtype):
 @pytest.mark.parametrize(
     "method, expected",
     [
-        ("isnumeric", [False, True, True, False, True, True, False]),
-        ("isdecimal", [False, True, False, False, False, True, False]),
+        ("isnumeric", [False, True, True, True, False, True, True, False]),
+        ("isdecimal", [False, True, False, False, False, False, True, False]),
+        ("isdigit", [False, True, True, False, False, False, True, False]),
     ],
 )
 def test_isnumeric_unicode(method, expected, any_string_dtype):
@@ -259,19 +262,35 @@ def test_isnumeric_unicode(method, expected, any_string_dtype):
     # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY
     # 0xFF13: ３ Em 3  # noqa: RUF003
     ser = Series(
-        ["A", "3", "¼", "★", "፸", "３", "four"],  # noqa: RUF001
+        ["A", "3", "³", "¼", "★", "፸", "３", "four"],  # noqa: RUF001
         dtype=any_string_dtype,
     )
     expected_dtype = (
         "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
     )
     expected = Series(expected, dtype=expected_dtype)
+    if (
+        method == "isdigit"
+        and isinstance(ser.dtype, StringDtype)
+        and ser.dtype.storage == "pyarrow"
+        and not pa_version_under21p0
+    ):
+        # known difference in behavior between python and pyarrow unicode handling
+        # pyarrow 21+ considers ¼ and ፸ to be digits, while python does not
+        expected.iloc[3] = True
+        expected.iloc[5] = True
+
     result = getattr(ser.str, method)()
     tm.assert_series_equal(result, expected)
 
     # compare with standard library
-    expected = [getattr(item, method)() for item in ser]
-    assert list(result) == expected
+    # (only for non-pyarrow storage given the above differences)
+    if any_string_dtype == "object" or (
+        isinstance(any_string_dtype, StringDtype)
+        and any_string_dtype.storage == "python"
+    ):
+        expected = [getattr(item, method)() for item in ser]
+        assert list(result) == expected
 
 
 @pytest.mark.parametrize(
diff --git a/pyproject.toml b/pyproject.toml