Skip to content

Commit 8559c1f

Browse files
committed
Merge remote-tracking branch 'upstream/main' into series-sum-attrs
2 parents 87abede + 0d2505d commit 8559c1f

File tree

19 files changed

+281
-242
lines changed

19 files changed

+281
-242
lines changed

ci/code_checks.sh

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -153,14 +153,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
153153
-i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \
154154
-i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \
155155
-i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \
156-
-i "pandas.core.groupby.DataFrameGroupBy.max SA01" \
157-
-i "pandas.core.groupby.DataFrameGroupBy.min SA01" \
158156
-i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \
159157
-i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \
160158
-i "pandas.core.groupby.DataFrameGroupBy.ohlc SA01" \
161159
-i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
162160
-i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \
163-
-i "pandas.core.groupby.DataFrameGroupBy.sum SA01" \
164161
-i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \
165162
-i "pandas.core.groupby.SeriesGroupBy.agg RT03" \
166163
-i "pandas.core.groupby.SeriesGroupBy.aggregate RT03" \
@@ -169,13 +166,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
169166
-i "pandas.core.groupby.SeriesGroupBy.indices SA01" \
170167
-i "pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing SA01" \
171168
-i "pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing SA01" \
172-
-i "pandas.core.groupby.SeriesGroupBy.max SA01" \
173-
-i "pandas.core.groupby.SeriesGroupBy.min SA01" \
174169
-i "pandas.core.groupby.SeriesGroupBy.nth PR02" \
175170
-i "pandas.core.groupby.SeriesGroupBy.ohlc SA01" \
176171
-i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
177172
-i "pandas.core.groupby.SeriesGroupBy.sem SA01" \
178-
-i "pandas.core.groupby.SeriesGroupBy.sum SA01" \
179173
-i "pandas.core.resample.Resampler.__iter__ RT03,SA01" \
180174
-i "pandas.core.resample.Resampler.ffill RT03" \
181175
-i "pandas.core.resample.Resampler.get_group RT03,SA01" \

doc/source/whatsnew/v2.3.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ Conversion
102102

103103
Strings
104104
^^^^^^^
105+
- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
105106
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
106107
- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
107108
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -627,6 +627,7 @@ I/O
627627
- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
628628
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
629629
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
630+
- Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list of non-string values. (:issue:`59303`)
630631
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
631632
- Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
632633
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)

pandas/_libs/lib.pyx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,8 @@ def array_equivalent_object(ndarray left, ndarray right) -> bool:
600600
if not array_equivalent(x, y):
601601
return False
602602

603+
elif PyArray_Check(x) or PyArray_Check(y):
604+
return False
603605
elif (x is C_NA) ^ (y is C_NA):
604606
return False
605607
elif not (

pandas/conftest.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1338,7 +1338,13 @@ def string_storage(request):
13381338
pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
13391339
pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
13401340
("python", np.nan),
1341-
]
1341+
],
1342+
ids=[
1343+
"string=string[python]",
1344+
"string=string[pyarrow]",
1345+
"string=str[pyarrow]",
1346+
"string=str[python]",
1347+
],
13421348
)
13431349
def string_dtype_arguments(request):
13441350
"""
@@ -1369,6 +1375,7 @@ def dtype_backend(request):
13691375

13701376
# Alias so we can test with cartesian product of string_storage
13711377
string_storage2 = string_storage
1378+
string_dtype_arguments2 = string_dtype_arguments
13721379

13731380

13741381
@pytest.fixture(params=tm.BYTES_DTYPES)

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
from functools import partial
4+
import re
45
from typing import (
56
TYPE_CHECKING,
67
Any,
@@ -48,6 +49,37 @@ def _convert_int_result(self, result):
4849
def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
4950
raise NotImplementedError
5051

52+
def _str_len(self):
53+
result = pc.utf8_length(self._pa_array)
54+
return self._convert_int_result(result)
55+
56+
def _str_lower(self) -> Self:
57+
return type(self)(pc.utf8_lower(self._pa_array))
58+
59+
def _str_upper(self) -> Self:
60+
return type(self)(pc.utf8_upper(self._pa_array))
61+
62+
def _str_strip(self, to_strip=None) -> Self:
63+
if to_strip is None:
64+
result = pc.utf8_trim_whitespace(self._pa_array)
65+
else:
66+
result = pc.utf8_trim(self._pa_array, characters=to_strip)
67+
return type(self)(result)
68+
69+
def _str_lstrip(self, to_strip=None) -> Self:
70+
if to_strip is None:
71+
result = pc.utf8_ltrim_whitespace(self._pa_array)
72+
else:
73+
result = pc.utf8_ltrim(self._pa_array, characters=to_strip)
74+
return type(self)(result)
75+
76+
def _str_rstrip(self, to_strip=None) -> Self:
77+
if to_strip is None:
78+
result = pc.utf8_rtrim_whitespace(self._pa_array)
79+
else:
80+
result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
81+
return type(self)(result)
82+
5183
def _str_pad(
5284
self,
5385
width: int,
@@ -128,6 +160,33 @@ def _str_slice_replace(
128160
stop = np.iinfo(np.int64).max
129161
return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))
130162

163+
def _str_replace(
164+
self,
165+
pat: str | re.Pattern,
166+
repl: str | Callable,
167+
n: int = -1,
168+
case: bool = True,
169+
flags: int = 0,
170+
regex: bool = True,
171+
) -> Self:
172+
if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
173+
raise NotImplementedError(
174+
"replace is not supported with a re.Pattern, callable repl, "
175+
"case=False, or flags!=0"
176+
)
177+
178+
func = pc.replace_substring_regex if regex else pc.replace_substring
179+
# https://github.com/apache/arrow/issues/39149
180+
# GH 56404, unexpected behavior with negative max_replacements with pyarrow.
181+
pa_max_replacements = None if n < 0 else n
182+
result = func(
183+
self._pa_array,
184+
pattern=pat,
185+
replacement=repl,
186+
max_replacements=pa_max_replacements,
187+
)
188+
return type(self)(result)
189+
131190
def _str_capitalize(self) -> Self:
132191
return type(self)(pc.utf8_capitalize(self._pa_array))
133192

@@ -137,6 +196,16 @@ def _str_title(self) -> Self:
137196
def _str_swapcase(self) -> Self:
138197
return type(self)(pc.utf8_swapcase(self._pa_array))
139198

199+
def _str_removeprefix(self, prefix: str):
200+
if not pa_version_under13p0:
201+
starts_with = pc.starts_with(self._pa_array, pattern=prefix)
202+
removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
203+
result = pc.if_else(starts_with, removed, self._pa_array)
204+
return type(self)(result)
205+
predicate = lambda val: val.removeprefix(prefix)
206+
result = self._apply_elementwise(predicate)
207+
return type(self)(pa.chunked_array(result))
208+
140209
def _str_removesuffix(self, suffix: str):
141210
ends_with = pc.ends_with(self._pa_array, pattern=suffix)
142211
removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
@@ -228,6 +297,20 @@ def _str_contains(
228297
result = result.fill_null(na)
229298
return self._convert_bool_result(result)
230299

300+
def _str_match(
301+
self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
302+
):
303+
if not pat.startswith("^"):
304+
pat = f"^{pat}"
305+
return self._str_contains(pat, case, flags, na, regex=True)
306+
307+
def _str_fullmatch(
308+
self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
309+
):
310+
if not pat.endswith("$") or pat.endswith("\\$"):
311+
pat = f"{pat}$"
312+
return self._str_match(pat, case, flags, na)
313+
231314
def _str_find(self, sub: str, start: int = 0, end: int | None = None):
232315
if (
233316
pa_version_under13p0

pandas/core/arrays/arrow/array.py

Lines changed: 4 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -1999,7 +1999,7 @@ def _rank(
19991999
"""
20002000
See Series.rank.__doc__.
20012001
"""
2002-
return type(self)(
2002+
return self._convert_rank_result(
20032003
self._rank_calc(
20042004
axis=axis,
20052005
method=method,
@@ -2318,62 +2318,21 @@ def _convert_bool_result(self, result):
23182318
def _convert_int_result(self, result):
23192319
return type(self)(result)
23202320

2321+
def _convert_rank_result(self, result):
2322+
return type(self)(result)
2323+
23212324
def _str_count(self, pat: str, flags: int = 0) -> Self:
23222325
if flags:
23232326
raise NotImplementedError(f"count not implemented with {flags=}")
23242327
return type(self)(pc.count_substring_regex(self._pa_array, pat))
23252328

2326-
def _result_converter(self, result):
2327-
return type(self)(result)
2328-
2329-
def _str_replace(
2330-
self,
2331-
pat: str | re.Pattern,
2332-
repl: str | Callable,
2333-
n: int = -1,
2334-
case: bool = True,
2335-
flags: int = 0,
2336-
regex: bool = True,
2337-
) -> Self:
2338-
if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
2339-
raise NotImplementedError(
2340-
"replace is not supported with a re.Pattern, callable repl, "
2341-
"case=False, or flags!=0"
2342-
)
2343-
2344-
func = pc.replace_substring_regex if regex else pc.replace_substring
2345-
# https://github.com/apache/arrow/issues/39149
2346-
# GH 56404, unexpected behavior with negative max_replacements with pyarrow.
2347-
pa_max_replacements = None if n < 0 else n
2348-
result = func(
2349-
self._pa_array,
2350-
pattern=pat,
2351-
replacement=repl,
2352-
max_replacements=pa_max_replacements,
2353-
)
2354-
return type(self)(result)
2355-
23562329
def _str_repeat(self, repeats: int | Sequence[int]) -> Self:
23572330
if not isinstance(repeats, int):
23582331
raise NotImplementedError(
23592332
f"repeat is not implemented when repeats is {type(repeats).__name__}"
23602333
)
23612334
return type(self)(pc.binary_repeat(self._pa_array, repeats))
23622335

2363-
def _str_match(
2364-
self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
2365-
) -> Self:
2366-
if not pat.startswith("^"):
2367-
pat = f"^{pat}"
2368-
return self._str_contains(pat, case, flags, na, regex=True)
2369-
2370-
def _str_fullmatch(
2371-
self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
2372-
) -> Self:
2373-
if not pat.endswith("$") or pat.endswith("\\$"):
2374-
pat = f"{pat}$"
2375-
return self._str_match(pat, case, flags, na)
2376-
23772336
def _str_join(self, sep: str) -> Self:
23782337
if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string(
23792338
self._pa_array.type
@@ -2394,46 +2353,6 @@ def _str_rpartition(self, sep: str, expand: bool) -> Self:
23942353
result = self._apply_elementwise(predicate)
23952354
return type(self)(pa.chunked_array(result))
23962355

2397-
def _str_len(self) -> Self:
2398-
return type(self)(pc.utf8_length(self._pa_array))
2399-
2400-
def _str_lower(self) -> Self:
2401-
return type(self)(pc.utf8_lower(self._pa_array))
2402-
2403-
def _str_upper(self) -> Self:
2404-
return type(self)(pc.utf8_upper(self._pa_array))
2405-
2406-
def _str_strip(self, to_strip=None) -> Self:
2407-
if to_strip is None:
2408-
result = pc.utf8_trim_whitespace(self._pa_array)
2409-
else:
2410-
result = pc.utf8_trim(self._pa_array, characters=to_strip)
2411-
return type(self)(result)
2412-
2413-
def _str_lstrip(self, to_strip=None) -> Self:
2414-
if to_strip is None:
2415-
result = pc.utf8_ltrim_whitespace(self._pa_array)
2416-
else:
2417-
result = pc.utf8_ltrim(self._pa_array, characters=to_strip)
2418-
return type(self)(result)
2419-
2420-
def _str_rstrip(self, to_strip=None) -> Self:
2421-
if to_strip is None:
2422-
result = pc.utf8_rtrim_whitespace(self._pa_array)
2423-
else:
2424-
result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
2425-
return type(self)(result)
2426-
2427-
def _str_removeprefix(self, prefix: str):
2428-
if not pa_version_under13p0:
2429-
starts_with = pc.starts_with(self._pa_array, pattern=prefix)
2430-
removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
2431-
result = pc.if_else(starts_with, removed, self._pa_array)
2432-
return type(self)(result)
2433-
predicate = lambda val: val.removeprefix(prefix)
2434-
result = self._apply_elementwise(predicate)
2435-
return type(self)(pa.chunked_array(result))
2436-
24372356
def _str_casefold(self) -> Self:
24382357
predicate = lambda val: val.casefold()
24392358
result = self._apply_elementwise(predicate)

pandas/core/arrays/string_.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
nanops,
4747
ops,
4848
)
49+
from pandas.core.algorithms import isin
4950
from pandas.core.array_algos import masked_reductions
5051
from pandas.core.arrays.base import ExtensionArray
5152
from pandas.core.arrays.floating import (
@@ -65,6 +66,7 @@
6566
import pyarrow
6667

6768
from pandas._typing import (
69+
ArrayLike,
6870
AxisInt,
6971
Dtype,
7072
DtypeObj,
@@ -735,6 +737,24 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
735737
# base class implementation that uses __setitem__
736738
ExtensionArray._putmask(self, mask, value)
737739

740+
def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
741+
if isinstance(values, BaseStringArray) or (
742+
isinstance(values, ExtensionArray) and is_string_dtype(values.dtype)
743+
):
744+
values = values.astype(self.dtype, copy=False)
745+
else:
746+
if not lib.is_string_array(np.asarray(values), skipna=True):
747+
values = np.array(
748+
[val for val in values if isinstance(val, str) or isna(val)],
749+
dtype=object,
750+
)
751+
if not len(values):
752+
return np.zeros(self.shape, dtype=bool)
753+
754+
values = self._from_sequence(values, dtype=self.dtype)
755+
756+
return isin(np.asarray(self), np.asarray(values))
757+
738758
def astype(self, dtype, copy: bool = True):
739759
dtype = pandas_dtype(dtype)
740760

0 commit comments

Comments
 (0)