Skip to content

Commit f14f131

Browse files
BUG: fix .str.isdigit to honor unicode superscript for older pyarrow (#61962)
1 parent 0f4222e commit f14f131

File tree

4 files changed

+47
-9
lines changed

4 files changed

+47
-9
lines changed

doc/source/whatsnew/v2.3.2.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ become the default string dtype in pandas 3.0. See
2222

2323
Bug fixes
2424
^^^^^^^^^
25+
- Fix :meth:`~Series.str.isdigit` to correctly recognize unicode superscript
26+
characters as digits for :class:`StringDtype` backed by PyArrow (:issue:`61466`)
2527
- Fix :meth:`~DataFrame.to_json` with ``orient="table"`` to correctly use the
2628
"string" type in the JSON Table Schema for :class:`StringDtype` columns
2729
(:issue:`61889`)

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from pandas.compat import (
1616
HAS_PYARROW,
1717
pa_version_under17p0,
18+
pa_version_under21p0,
1819
)
1920

2021
if HAS_PYARROW:
@@ -267,6 +268,12 @@ def _str_isdecimal(self):
267268
return self._convert_bool_result(result)
268269

269270
def _str_isdigit(self):
271+
if pa_version_under21p0:
272+
# https://github.com/pandas-dev/pandas/issues/61466
273+
res_list = self._apply_elementwise(str.isdigit)
274+
return self._convert_bool_result(
275+
pa.chunked_array(res_list, type=pa.bool_())
276+
)
270277
result = pc.utf8_is_digit(self._pa_array)
271278
return self._convert_bool_result(result)
272279

pandas/core/strings/accessor.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3602,16 +3602,26 @@ def casefold(self):
36023602
Series.str.isupper : Check whether all characters are uppercase.
36033603
Series.str.istitle : Check whether all characters are titlecase.
36043604
3605-
Examples
3606-
--------
3605+
Notes
3606+
-----
36073607
Similar to ``str.isdecimal`` but also includes special digits, like
36083608
superscripted and subscripted digits in unicode.
36093609
3610+
The exact behavior of this method, i.e. which unicode characters are
3611+
considered as digits, depends on the backend used for string operations,
3612+
and there can be small differences.
3613+
For example, Python considers the ³ superscript character as a digit, but
3614+
not the ⅕ fraction character, while PyArrow considers both as digits. For
3615+
simple (ASCII) decimal numbers, the behavior is consistent.
3616+
3617+
Examples
3618+
--------
3619+
36103620
>>> s3 = pd.Series(['23', '³', '⅕', ''])
36113621
>>> s3.str.isdigit()
36123622
0 True
3613-
1 False
3614-
2 False
3623+
1 True
3624+
2 True
36153625
3 False
36163626
dtype: bool
36173627
"""

pandas/tests/strings/test_strings.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import numpy as np
88
import pytest
99

10+
from pandas.compat import pa_version_under21p0
1011
from pandas.errors import Pandas4Warning
1112

1213
from pandas import (
@@ -15,6 +16,7 @@
1516
Index,
1617
MultiIndex,
1718
Series,
19+
StringDtype,
1820
option_context,
1921
)
2022
import pandas._testing as tm
@@ -249,8 +251,9 @@ def test_ismethods(method, expected, any_string_dtype):
249251
@pytest.mark.parametrize(
250252
"method, expected",
251253
[
252-
("isnumeric", [False, True, True, False, True, True, False]),
253-
("isdecimal", [False, True, False, False, False, True, False]),
254+
("isnumeric", [False, True, True, True, False, True, True, False]),
255+
("isdecimal", [False, True, False, False, False, False, True, False]),
256+
("isdigit", [False, True, True, False, False, False, True, False]),
254257
],
255258
)
256259
def test_isnumeric_unicode(method, expected, any_string_dtype):
@@ -259,19 +262,35 @@ def test_isnumeric_unicode(method, expected, any_string_dtype):
259262
# 0x1378: ፸ ETHIOPIC NUMBER SEVENTY
260263
# 0xFF13: ３ Em 3 # noqa: RUF003
261264
ser = Series(
262-
["A", "3", "¼", "★", "፸", "３", "four"], # noqa: RUF001
265+
["A", "3", "³", "¼", "★", "፸", "３", "four"], # noqa: RUF001
263266
dtype=any_string_dtype,
264267
)
265268
expected_dtype = (
266269
"bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
267270
)
268271
expected = Series(expected, dtype=expected_dtype)
272+
if (
273+
method == "isdigit"
274+
and isinstance(ser.dtype, StringDtype)
275+
and ser.dtype.storage == "pyarrow"
276+
and not pa_version_under21p0
277+
):
278+
# known difference in behavior between python and pyarrow unicode handling
279+
# pyarrow 21+ considers ¼ and ፸ as digits, while python does not
280+
expected.iloc[3] = True
281+
expected.iloc[5] = True
282+
269283
result = getattr(ser.str, method)()
270284
tm.assert_series_equal(result, expected)
271285

272286
# compare with standard library
273-
expected = [getattr(item, method)() for item in ser]
274-
assert list(result) == expected
287+
# (only for non-pyarrow storage given the above differences)
288+
if any_string_dtype == "object" or (
289+
isinstance(any_string_dtype, StringDtype)
290+
and any_string_dtype.storage == "python"
291+
):
292+
expected = [getattr(item, method)() for item in ser]
293+
assert list(result) == expected
275294

276295

277296
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)