Skip to content

Commit af216fb

Browse files
Merge remote-tracking branch 'upstream/main' into string-dtype-rank
2 parents b9c4454 + 4444e52 commit af216fb

19 files changed

+392
-354
lines changed

ci/code_checks.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
116116
-i "pandas.Timestamp.resolution PR02" \
117117
-i "pandas.Timestamp.tzinfo GL08" \
118118
-i "pandas.Timestamp.year GL08" \
119-
-i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \
120119
-i "pandas.api.types.is_dict_like PR07,SA01" \
121120
-i "pandas.api.types.is_extension_array_dtype SA01" \
122121
-i "pandas.api.types.is_file_like PR07,SA01" \

doc/source/whatsnew/v2.3.0.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,9 @@ Conversion
103103
Strings
104104
^^^^^^^
105105
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
106+
- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
106107
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the Python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
107-
108+
-
108109

109110
Interval
110111
^^^^^^^^

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ Other enhancements
5555
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
5656
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
5757
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
58+
- :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
5859
- Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
5960
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
6061
- Support passing an :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 106 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
from functools import partial
4+
import re
45
from typing import (
56
TYPE_CHECKING,
67
Any,
@@ -11,6 +12,7 @@
1112

1213
from pandas.compat import (
1314
pa_version_under10p1,
15+
pa_version_under11p0,
1416
pa_version_under13p0,
1517
pa_version_under17p0,
1618
)
@@ -22,10 +24,7 @@
2224
import pyarrow.compute as pc
2325

2426
if TYPE_CHECKING:
25-
from collections.abc import (
26-
Callable,
27-
Sized,
28-
)
27+
from collections.abc import Callable
2928

3029
from pandas._typing import (
3130
Scalar,
@@ -34,7 +33,7 @@
3433

3534

3635
class ArrowStringArrayMixin:
37-
_pa_array: Sized
36+
_pa_array: pa.ChunkedArray
3837

3938
def __init__(self, *args, **kwargs) -> None:
4039
raise NotImplementedError
@@ -50,6 +49,37 @@ def _convert_int_result(self, result):
5049
def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
5150
raise NotImplementedError
5251

52+
def _str_len(self):
53+
result = pc.utf8_length(self._pa_array)
54+
return self._convert_int_result(result)
55+
56+
def _str_lower(self) -> Self:
57+
return type(self)(pc.utf8_lower(self._pa_array))
58+
59+
def _str_upper(self) -> Self:
60+
return type(self)(pc.utf8_upper(self._pa_array))
61+
62+
def _str_strip(self, to_strip=None) -> Self:
63+
if to_strip is None:
64+
result = pc.utf8_trim_whitespace(self._pa_array)
65+
else:
66+
result = pc.utf8_trim(self._pa_array, characters=to_strip)
67+
return type(self)(result)
68+
69+
def _str_lstrip(self, to_strip=None) -> Self:
70+
if to_strip is None:
71+
result = pc.utf8_ltrim_whitespace(self._pa_array)
72+
else:
73+
result = pc.utf8_ltrim(self._pa_array, characters=to_strip)
74+
return type(self)(result)
75+
76+
def _str_rstrip(self, to_strip=None) -> Self:
77+
if to_strip is None:
78+
result = pc.utf8_rtrim_whitespace(self._pa_array)
79+
else:
80+
result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
81+
return type(self)(result)
82+
5383
def _str_pad(
5484
self,
5585
width: int,
@@ -96,13 +126,29 @@ def _str_get(self, i: int) -> Self:
96126
selected = pc.utf8_slice_codeunits(
97127
self._pa_array, start=start, stop=stop, step=step
98128
)
99-
null_value = pa.scalar(
100-
None,
101-
type=self._pa_array.type, # type: ignore[attr-defined]
102-
)
129+
null_value = pa.scalar(None, type=self._pa_array.type)
103130
result = pc.if_else(not_out_of_bounds, selected, null_value)
104131
return type(self)(result)
105132

133+
def _str_slice(
134+
self, start: int | None = None, stop: int | None = None, step: int | None = None
135+
) -> Self:
136+
if pa_version_under11p0:
137+
# GH#59724
138+
result = self._apply_elementwise(lambda val: val[start:stop:step])
139+
return type(self)(pa.chunked_array(result, type=self._pa_array.type))
140+
if start is None:
141+
if step is not None and step < 0:
142+
# GH#59710
143+
start = -1
144+
else:
145+
start = 0
146+
if step is None:
147+
step = 1
148+
return type(self)(
149+
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
150+
)
151+
106152
def _str_slice_replace(
107153
self, start: int | None = None, stop: int | None = None, repl: str | None = None
108154
) -> Self:
@@ -114,6 +160,33 @@ def _str_slice_replace(
114160
stop = np.iinfo(np.int64).max
115161
return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))
116162

163+
def _str_replace(
164+
self,
165+
pat: str | re.Pattern,
166+
repl: str | Callable,
167+
n: int = -1,
168+
case: bool = True,
169+
flags: int = 0,
170+
regex: bool = True,
171+
) -> Self:
172+
if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
173+
raise NotImplementedError(
174+
"replace is not supported with a re.Pattern, callable repl, "
175+
"case=False, or flags!=0"
176+
)
177+
178+
func = pc.replace_substring_regex if regex else pc.replace_substring
179+
# https://github.com/apache/arrow/issues/39149
180+
# GH 56404, unexpected behavior with negative max_replacements with pyarrow.
181+
pa_max_replacements = None if n < 0 else n
182+
result = func(
183+
self._pa_array,
184+
pattern=pat,
185+
replacement=repl,
186+
max_replacements=pa_max_replacements,
187+
)
188+
return type(self)(result)
189+
117190
def _str_capitalize(self) -> Self:
118191
return type(self)(pc.utf8_capitalize(self._pa_array))
119192

@@ -123,6 +196,16 @@ def _str_title(self) -> Self:
123196
def _str_swapcase(self) -> Self:
124197
return type(self)(pc.utf8_swapcase(self._pa_array))
125198

199+
def _str_removeprefix(self, prefix: str):
200+
if not pa_version_under13p0:
201+
starts_with = pc.starts_with(self._pa_array, pattern=prefix)
202+
removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
203+
result = pc.if_else(starts_with, removed, self._pa_array)
204+
return type(self)(result)
205+
predicate = lambda val: val.removeprefix(prefix)
206+
result = self._apply_elementwise(predicate)
207+
return type(self)(pa.chunked_array(result))
208+
126209
def _str_removesuffix(self, suffix: str):
127210
ends_with = pc.ends_with(self._pa_array, pattern=suffix)
128211
removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
@@ -214,6 +297,20 @@ def _str_contains(
214297
result = result.fill_null(na)
215298
return self._convert_bool_result(result)
216299

300+
def _str_match(
301+
self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
302+
):
303+
if not pat.startswith("^"):
304+
pat = f"^{pat}"
305+
return self._str_contains(pat, case, flags, na, regex=True)
306+
307+
def _str_fullmatch(
308+
self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
309+
):
310+
if not pat.endswith("$") or pat.endswith("\\$"):
311+
pat = f"{pat}$"
312+
return self._str_match(pat, case, flags, na)
313+
217314
def _str_find(self, sub: str, start: int = 0, end: int | None = None):
218315
if (
219316
pa_version_under13p0

pandas/core/arrays/arrow/_arrow_utils.py

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,8 @@
11
from __future__ import annotations
22

3-
import warnings
4-
53
import numpy as np
64
import pyarrow
75

8-
from pandas._config.config import get_option
9-
10-
from pandas.errors import PerformanceWarning
11-
from pandas.util._exceptions import find_stack_level
12-
13-
14-
def fallback_performancewarning(version: str | None = None) -> None:
15-
"""
16-
Raise a PerformanceWarning for falling back to ExtensionArray's
17-
non-pyarrow method
18-
"""
19-
if get_option("performance_warnings"):
20-
msg = "Falling back on a non-pyarrow code path which may decrease performance."
21-
if version is not None:
22-
msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning."
23-
warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
24-
256

267
def pyarrow_array_to_numpy_and_mask(
278
arr, dtype: np.dtype

0 commit comments

Comments
 (0)