Skip to content

Commit b154203

Browse files
committed
Merge remote-tracking branch 'upstream/main' into series-sum-attrs
2 parents 2607f9c + 16b7288 commit b154203

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

76 files changed

+1101
-699
lines changed

.github/workflows/unit-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,7 @@ jobs:
380380
fetch-depth: 0
381381

382382
- name: Set up Python Free-threading Version
383-
uses: deadsnakes/action@v3.1.0
383+
uses: deadsnakes/action@v3.2.0
384384
with:
385385
python-version: 3.13-dev
386386
nogil: true

ci/code_checks.sh

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -73,12 +73,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
7373
-i "pandas.NA SA01" \
7474
-i "pandas.Period.freq GL08" \
7575
-i "pandas.Period.ordinal GL08" \
76-
-i "pandas.Period.to_timestamp SA01" \
7776
-i "pandas.PeriodDtype.freq SA01" \
7877
-i "pandas.RangeIndex.from_range PR01,SA01" \
79-
-i "pandas.RangeIndex.start SA01" \
8078
-i "pandas.RangeIndex.step SA01" \
81-
-i "pandas.RangeIndex.stop SA01" \
8279
-i "pandas.Series.cat.add_categories PR01,PR02" \
8380
-i "pandas.Series.cat.as_ordered PR01" \
8481
-i "pandas.Series.cat.as_unordered PR01" \
@@ -93,10 +90,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
9390
-i "pandas.Series.dt.floor PR01,PR02" \
9491
-i "pandas.Series.dt.freq GL08" \
9592
-i "pandas.Series.dt.month_name PR01,PR02" \
96-
-i "pandas.Series.dt.nanoseconds SA01" \
9793
-i "pandas.Series.dt.normalize PR01" \
9894
-i "pandas.Series.dt.round PR01,PR02" \
99-
-i "pandas.Series.dt.seconds SA01" \
10095
-i "pandas.Series.dt.strftime PR01,PR02" \
10196
-i "pandas.Series.dt.to_period PR01,PR02" \
10297
-i "pandas.Series.dt.total_seconds PR01" \
@@ -114,24 +109,13 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
114109
-i "pandas.Timedelta.resolution PR02" \
115110
-i "pandas.Timedelta.to_timedelta64 SA01" \
116111
-i "pandas.Timedelta.total_seconds SA01" \
117-
-i "pandas.Timedelta.view SA01" \
118-
-i "pandas.TimedeltaIndex.nanoseconds SA01" \
119-
-i "pandas.TimedeltaIndex.seconds SA01" \
120112
-i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \
121113
-i "pandas.Timestamp.max PR02" \
122114
-i "pandas.Timestamp.min PR02" \
123115
-i "pandas.Timestamp.nanosecond GL08" \
124116
-i "pandas.Timestamp.resolution PR02" \
125117
-i "pandas.Timestamp.tzinfo GL08" \
126118
-i "pandas.Timestamp.year GL08" \
127-
-i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \
128-
-i "pandas.api.types.is_bool PR01,SA01" \
129-
-i "pandas.api.types.is_categorical_dtype SA01" \
130-
-i "pandas.api.types.is_complex PR01,SA01" \
131-
-i "pandas.api.types.is_complex_dtype SA01" \
132-
-i "pandas.api.types.is_datetime64_dtype SA01" \
133-
-i "pandas.api.types.is_datetime64_ns_dtype SA01" \
134-
-i "pandas.api.types.is_datetime64tz_dtype SA01" \
135119
-i "pandas.api.types.is_dict_like PR07,SA01" \
136120
-i "pandas.api.types.is_extension_array_dtype SA01" \
137121
-i "pandas.api.types.is_file_like PR07,SA01" \
@@ -165,7 +149,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
165149
-i "pandas.core.groupby.DataFrameGroupBy.agg RT03" \
166150
-i "pandas.core.groupby.DataFrameGroupBy.aggregate RT03" \
167151
-i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \
168-
-i "pandas.core.groupby.DataFrameGroupBy.filter SA01" \
169152
-i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \
170153
-i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \
171154
-i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \
@@ -181,7 +164,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
181164
-i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \
182165
-i "pandas.core.groupby.SeriesGroupBy.agg RT03" \
183166
-i "pandas.core.groupby.SeriesGroupBy.aggregate RT03" \
184-
-i "pandas.core.groupby.SeriesGroupBy.filter PR01,SA01" \
185167
-i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \
186168
-i "pandas.core.groupby.SeriesGroupBy.groups SA01" \
187169
-i "pandas.core.groupby.SeriesGroupBy.indices SA01" \

doc/source/whatsnew/v2.3.0.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,9 @@ Conversion
103103
Strings
104104
^^^^^^^
105105
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
106+
- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
106107
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
107-
108+
-
108109

109110
Interval
110111
^^^^^^^^

doc/source/whatsnew/v3.0.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ Other enhancements
5555
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
5656
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
5757
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
58+
- :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
5859
- Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
5960
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
6061
- Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)
@@ -668,6 +669,7 @@ Reshaping
668669
- Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`)
669670
- Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`)
670671
- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
672+
- Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`)
671673
- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`)
672674

673675
Sparse

pandas/_libs/lib.pyx

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -733,7 +733,9 @@ cpdef ndarray[object] ensure_string_array(
733733
convert_na_value : bool, default True
734734
If False, existing na values will be used unchanged in the new array.
735735
copy : bool, default True
736-
Whether to ensure that a new array is returned.
736+
Whether to ensure that a new array is returned. When True, a new array
737+
is always returned. When False, a new array is only returned when needed
738+
to avoid mutating the input array.
737739
skipna : bool, default True
738740
Whether or not to coerce nulls to their stringified form
739741
(e.g. if False, NaN becomes 'nan').
@@ -762,11 +764,15 @@ cpdef ndarray[object] ensure_string_array(
762764

763765
result = np.asarray(arr, dtype="object")
764766

765-
if copy and (result is arr or np.shares_memory(arr, result)):
766-
# GH#54654
767-
result = result.copy()
768-
elif not copy and result is arr:
769-
already_copied = False
767+
if result is arr or np.may_share_memory(arr, result):
768+
# if np.asarray(..) did not make a copy of the input arr, we still need
769+
# to do that to avoid mutating the input array
770+
# GH#54654: share_memory check is needed for rare cases where np.asarray
771+
# returns a new object without making a copy of the actual data
772+
if copy:
773+
result = result.copy()
774+
else:
775+
already_copied = False
770776
elif not copy and not result.flags.writeable:
771777
# Weird edge case where result is a view
772778
already_copied = False
@@ -1123,10 +1129,21 @@ def is_bool(obj: object) -> bool:
11231129
"""
11241130
Return True if given object is boolean.
11251131

1132+
Parameters
1133+
----------
1134+
obj : object
1135+
Object to check.
1136+
11261137
Returns
11271138
-------
11281139
bool
11291140

1141+
See Also
1142+
--------
1143+
api.types.is_scalar : Check if the input is a scalar.
1144+
api.types.is_integer : Check if the input is an integer.
1145+
api.types.is_float : Check if the input is a float.
1146+
11301147
Examples
11311148
--------
11321149
>>> pd.api.types.is_bool(True)
@@ -1142,10 +1159,22 @@ def is_complex(obj: object) -> bool:
11421159
"""
11431160
Return True if given object is complex.
11441161

1162+
Parameters
1163+
----------
1164+
obj : object
1165+
Object to check.
1166+
11451167
Returns
11461168
-------
11471169
bool
11481170

1171+
See Also
1172+
--------
1173+
api.types.is_complex_dtype: Check whether the provided array or
1174+
dtype is of a complex dtype.
1175+
api.types.is_number: Check if the object is a number.
1176+
api.types.is_integer: Return True if given object is integer.
1177+
11491178
Examples
11501179
--------
11511180
>>> pd.api.types.is_complex(1 + 1j)

pandas/_libs/tslibs/period.pyx

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2001,6 +2001,12 @@ cdef class _Period(PeriodMixin):
20012001
-------
20022002
Timestamp
20032003

2004+
See Also
2005+
--------
2006+
Timestamp : A class representing a single point in time.
2007+
Period : Represents a span of time with a fixed frequency.
2008+
PeriodIndex.to_timestamp : Convert a `PeriodIndex` to a `DatetimeIndex`.
2009+
20042010
Examples
20052011
--------
20062012
>>> period = pd.Period('2023-1-1', freq='D')

pandas/_libs/tslibs/timedeltas.pyx

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1458,11 +1458,26 @@ cdef class _Timedelta(timedelta):
14581458
"""
14591459
Array view compatibility.
14601460
1461+
This method allows you to reinterpret the underlying data of a Timedelta
1462+
object as a different dtype. The `view` method provides a way to reinterpret
1463+
the internal representation of the `Timedelta` object without modifying its
1464+
data. This is particularly useful when you need to work with the underlying
1465+
data directly, such as for performance optimizations or interfacing with
1466+
low-level APIs. The returned value is typically the number of nanoseconds
1467+
since the epoch, represented as an integer or another specified dtype.
1468+
14611469
Parameters
14621470
----------
14631471
dtype : str or dtype
14641472
The dtype to view the underlying data as.
14651473
1474+
See Also
1475+
--------
1476+
numpy.ndarray.view : Returns a view of an array with the same data.
1477+
Timedelta.to_numpy : Converts the Timedelta to a NumPy timedelta64.
1478+
Timedelta.total_seconds : Returns the total duration of the Timedelta
1479+
object in seconds.
1480+
14661481
Examples
14671482
--------
14681483
>>> td = pd.Timedelta('3D')

pandas/conftest.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1272,6 +1272,34 @@ def string_dtype(request):
12721272
return request.param
12731273

12741274

1275+
@pytest.fixture(
1276+
params=[
1277+
("python", pd.NA),
1278+
pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
1279+
pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
1280+
("python", np.nan),
1281+
],
1282+
ids=[
1283+
"string=string[python]",
1284+
"string=string[pyarrow]",
1285+
"string=str[pyarrow]",
1286+
"string=str[python]",
1287+
],
1288+
)
1289+
def string_dtype_no_object(request):
1290+
"""
1291+
Parametrized fixture for string dtypes.
1292+
* 'string[python]' (NA variant)
1293+
* 'string[pyarrow]' (NA variant)
1294+
* 'str' (NaN variant, with pyarrow)
1295+
* 'str' (NaN variant, without pyarrow)
1296+
"""
1297+
# need to instantiate the StringDtype here instead of in the params
1298+
# to avoid importing pyarrow during test collection
1299+
storage, na_value = request.param
1300+
return pd.StringDtype(storage, na_value)
1301+
1302+
12751303
@pytest.fixture(
12761304
params=[
12771305
"string[python]",

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 62 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,16 @@
33
from functools import partial
44
from typing import (
55
TYPE_CHECKING,
6+
Any,
67
Literal,
78
)
89

910
import numpy as np
1011

1112
from pandas.compat import (
1213
pa_version_under10p1,
14+
pa_version_under11p0,
15+
pa_version_under13p0,
1316
pa_version_under17p0,
1417
)
1518

@@ -20,7 +23,7 @@
2023
import pyarrow.compute as pc
2124

2225
if TYPE_CHECKING:
23-
from collections.abc import Sized
26+
from collections.abc import Callable
2427

2528
from pandas._typing import (
2629
Scalar,
@@ -29,7 +32,7 @@
2932

3033

3134
class ArrowStringArrayMixin:
32-
_pa_array: Sized
35+
_pa_array: pa.ChunkedArray
3336

3437
def __init__(self, *args, **kwargs) -> None:
3538
raise NotImplementedError
@@ -42,6 +45,9 @@ def _convert_int_result(self, result):
4245
# Convert an integer-dtype result to the appropriate result type
4346
raise NotImplementedError
4447

48+
def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
49+
raise NotImplementedError
50+
4551
def _str_pad(
4652
self,
4753
width: int,
@@ -88,13 +94,29 @@ def _str_get(self, i: int) -> Self:
8894
selected = pc.utf8_slice_codeunits(
8995
self._pa_array, start=start, stop=stop, step=step
9096
)
91-
null_value = pa.scalar(
92-
None,
93-
type=self._pa_array.type, # type: ignore[attr-defined]
94-
)
97+
null_value = pa.scalar(None, type=self._pa_array.type)
9598
result = pc.if_else(not_out_of_bounds, selected, null_value)
9699
return type(self)(result)
97100

101+
def _str_slice(
102+
self, start: int | None = None, stop: int | None = None, step: int | None = None
103+
) -> Self:
104+
if pa_version_under11p0:
105+
# GH#59724
106+
result = self._apply_elementwise(lambda val: val[start:stop:step])
107+
return type(self)(pa.chunked_array(result, type=self._pa_array.type))
108+
if start is None:
109+
if step is not None and step < 0:
110+
# GH#59710
111+
start = -1
112+
else:
113+
start = 0
114+
if step is None:
115+
step = 1
116+
return type(self)(
117+
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
118+
)
119+
98120
def _str_slice_replace(
99121
self, start: int | None = None, stop: int | None = None, repl: str | None = None
100122
) -> Self:
@@ -205,3 +227,37 @@ def _str_contains(
205227
if not isna(na): # pyright: ignore [reportGeneralTypeIssues]
206228
result = result.fill_null(na)
207229
return self._convert_bool_result(result)
230+
231+
def _str_find(self, sub: str, start: int = 0, end: int | None = None):
232+
if (
233+
pa_version_under13p0
234+
and not (start != 0 and end is not None)
235+
and not (start == 0 and end is None)
236+
):
237+
# GH#59562
238+
res_list = self._apply_elementwise(lambda val: val.find(sub, start, end))
239+
return self._convert_int_result(pa.chunked_array(res_list))
240+
241+
if (start == 0 or start is None) and end is None:
242+
result = pc.find_substring(self._pa_array, sub)
243+
else:
244+
if sub == "":
245+
# GH#56792
246+
res_list = self._apply_elementwise(
247+
lambda val: val.find(sub, start, end)
248+
)
249+
return self._convert_int_result(pa.chunked_array(res_list))
250+
if start is None:
251+
start_offset = 0
252+
start = 0
253+
elif start < 0:
254+
start_offset = pc.add(start, pc.utf8_length(self._pa_array))
255+
start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset)
256+
else:
257+
start_offset = start
258+
slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
259+
result = pc.find_substring(slices, sub)
260+
found = pc.not_equal(result, pa.scalar(-1, type=result.type))
261+
offset_result = pc.add(result, start_offset)
262+
result = pc.if_else(found, offset_result, -1)
263+
return self._convert_int_result(result)

pandas/core/arrays/arrow/_arrow_utils.py

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,8 @@
11
from __future__ import annotations
22

3-
import warnings
4-
53
import numpy as np
64
import pyarrow
75

8-
from pandas._config.config import get_option
9-
10-
from pandas.errors import PerformanceWarning
11-
from pandas.util._exceptions import find_stack_level
12-
13-
14-
def fallback_performancewarning(version: str | None = None) -> None:
15-
"""
16-
Raise a PerformanceWarning for falling back to ExtensionArray's
17-
non-pyarrow method
18-
"""
19-
if get_option("performance_warnings"):
20-
msg = "Falling back on a non-pyarrow code path which may decrease performance."
21-
if version is not None:
22-
msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning."
23-
warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
24-
256

267
def pyarrow_array_to_numpy_and_mask(
278
arr, dtype: np.dtype

0 commit comments

Comments
 (0)