Skip to content

Commit dfea777

Browse files
Merge branch 'main' into main
2 parents b72042e + 80b6850 commit dfea777

File tree

16 files changed

+391
-222
lines changed

16 files changed

+391
-222
lines changed

ci/code_checks.sh

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
7373
-i "pandas.NA SA01" \
7474
-i "pandas.Period.freq GL08" \
7575
-i "pandas.Period.ordinal GL08" \
76-
-i "pandas.Period.to_timestamp SA01" \
7776
-i "pandas.PeriodDtype.freq SA01" \
7877
-i "pandas.RangeIndex.from_range PR01,SA01" \
7978
-i "pandas.RangeIndex.start SA01" \
@@ -114,7 +113,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
114113
-i "pandas.Timedelta.resolution PR02" \
115114
-i "pandas.Timedelta.to_timedelta64 SA01" \
116115
-i "pandas.Timedelta.total_seconds SA01" \
117-
-i "pandas.Timedelta.view SA01" \
118116
-i "pandas.TimedeltaIndex.nanoseconds SA01" \
119117
-i "pandas.TimedeltaIndex.seconds SA01" \
120118
-i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -668,6 +668,7 @@ Reshaping
668668
- Bug in :meth:`DataFrame.join` where a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`)
669669
- Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`)
670670
- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
671+
- Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`)
671672
- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`)
672673

673674
Sparse

pandas/_libs/tslibs/period.pyx

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2001,6 +2001,12 @@ cdef class _Period(PeriodMixin):
20012001
-------
20022002
Timestamp
20032003

2004+
See Also
2005+
--------
2006+
Timestamp : A class representing a single point in time.
2007+
Period : Represents a span of time with a fixed frequency.
2008+
PeriodIndex.to_timestamp : Convert a `PeriodIndex` to a `DatetimeIndex`.
2009+
20042010
Examples
20052011
--------
20062012
>>> period = pd.Period('2023-1-1', freq='D')

pandas/_libs/tslibs/timedeltas.pyx

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1458,11 +1458,26 @@ cdef class _Timedelta(timedelta):
14581458
"""
14591459
Array view compatibility.
14601460
1461+
This method allows you to reinterpret the underlying data of a Timedelta
1462+
object as a different dtype. The `view` method provides a way to reinterpret
1463+
the internal representation of the `Timedelta` object without modifying its
1464+
data. This is particularly useful when you need to work with the underlying
1465+
data directly, such as for performance optimizations or interfacing with
1466+
low-level APIs. The returned value is typically the number of nanoseconds
1467+
in the duration, represented as an integer or another specified dtype.
1468+
14611469
Parameters
14621470
----------
14631471
dtype : str or dtype
14641472
The dtype to view the underlying data as.
14651473
1474+
See Also
1475+
--------
1476+
numpy.ndarray.view : Returns a view of an array with the same data.
1477+
Timedelta.to_numpy : Converts the Timedelta to a NumPy timedelta64.
1478+
Timedelta.total_seconds : Returns the total duration of the Timedelta
1479+
object in seconds.
1480+
14661481
Examples
14671482
--------
14681483
>>> td = pd.Timedelta('3D')

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,15 @@
33
from functools import partial
44
from typing import (
55
TYPE_CHECKING,
6+
Any,
67
Literal,
78
)
89

910
import numpy as np
1011

1112
from pandas.compat import (
1213
pa_version_under10p1,
14+
pa_version_under13p0,
1315
pa_version_under17p0,
1416
)
1517

@@ -20,7 +22,10 @@
2022
import pyarrow.compute as pc
2123

2224
if TYPE_CHECKING:
23-
from collections.abc import Sized
25+
from collections.abc import (
26+
Callable,
27+
Sized,
28+
)
2429

2530
from pandas._typing import (
2631
Scalar,
@@ -42,6 +47,9 @@ def _convert_int_result(self, result):
4247
# Convert an integer-dtype result to the appropriate result type
4348
raise NotImplementedError
4449

50+
def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
51+
raise NotImplementedError
52+
4553
def _str_pad(
4654
self,
4755
width: int,
@@ -205,3 +213,37 @@ def _str_contains(
205213
if not isna(na): # pyright: ignore [reportGeneralTypeIssues]
206214
result = result.fill_null(na)
207215
return self._convert_bool_result(result)
216+
217+
def _str_find(self, sub: str, start: int = 0, end: int | None = None):
218+
if (
219+
pa_version_under13p0
220+
and not (start != 0 and end is not None)
221+
and not (start == 0 and end is None)
222+
):
223+
# GH#59562
224+
res_list = self._apply_elementwise(lambda val: val.find(sub, start, end))
225+
return self._convert_int_result(pa.chunked_array(res_list))
226+
227+
if (start == 0 or start is None) and end is None:
228+
result = pc.find_substring(self._pa_array, sub)
229+
else:
230+
if sub == "":
231+
# GH#56792
232+
res_list = self._apply_elementwise(
233+
lambda val: val.find(sub, start, end)
234+
)
235+
return self._convert_int_result(pa.chunked_array(res_list))
236+
if start is None:
237+
start_offset = 0
238+
start = 0
239+
elif start < 0:
240+
start_offset = pc.add(start, pc.utf8_length(self._pa_array))
241+
start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset)
242+
else:
243+
start_offset = start
244+
slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
245+
result = pc.find_substring(slices, sub)
246+
found = pc.not_equal(result, pa.scalar(-1, type=result.type))
247+
offset_result = pc.add(result, start_offset)
248+
result = pc.if_else(found, offset_result, -1)
249+
return self._convert_int_result(result)

pandas/core/arrays/arrow/array.py

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2373,29 +2373,6 @@ def _str_fullmatch(
23732373
pat = f"{pat}$"
23742374
return self._str_match(pat, case, flags, na)
23752375

2376-
def _str_find(self, sub: str, start: int = 0, end: int | None = None) -> Self:
2377-
if (start == 0 or start is None) and end is None:
2378-
result = pc.find_substring(self._pa_array, sub)
2379-
else:
2380-
if sub == "":
2381-
# GH 56792
2382-
result = self._apply_elementwise(lambda val: val.find(sub, start, end))
2383-
return type(self)(pa.chunked_array(result))
2384-
if start is None:
2385-
start_offset = 0
2386-
start = 0
2387-
elif start < 0:
2388-
start_offset = pc.add(start, pc.utf8_length(self._pa_array))
2389-
start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset)
2390-
else:
2391-
start_offset = start
2392-
slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
2393-
result = pc.find_substring(slices, sub)
2394-
found = pc.not_equal(result, pa.scalar(-1, type=result.type))
2395-
offset_result = pc.add(result, start_offset)
2396-
result = pc.if_else(found, offset_result, -1)
2397-
return type(self)(result)
2398-
23992376
def _str_join(self, sep: str) -> Self:
24002377
if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string(
24012378
self._pa_array.type

pandas/core/arrays/string_.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,9 +171,9 @@ def __init__(
171171
# a consistent NaN value (and we can use `dtype.na_value is np.nan`)
172172
na_value = np.nan
173173
elif na_value is not libmissing.NA:
174-
raise ValueError("'na_value' must be np.nan or pd.NA, got {na_value}")
174+
raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}")
175175

176-
self.storage = storage
176+
self.storage = cast(str, storage)
177177
self._na_value = na_value
178178

179179
def __repr__(self) -> str:
@@ -284,6 +284,34 @@ def construct_array_type( # type: ignore[override]
284284
else:
285285
return ArrowStringArrayNumpySemantics
286286

287+
def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
288+
storages = set()
289+
na_values = set()
290+
291+
for dtype in dtypes:
292+
if isinstance(dtype, StringDtype):
293+
storages.add(dtype.storage)
294+
na_values.add(dtype.na_value)
295+
elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "T"):
296+
continue
297+
else:
298+
return None
299+
300+
if len(storages) == 2:
301+
# if both python and pyarrow storage -> priority to pyarrow
302+
storage = "pyarrow"
303+
else:
304+
storage = next(iter(storages)) # type: ignore[assignment]
305+
306+
na_value: libmissing.NAType | float
307+
if len(na_values) == 2:
308+
# if both NaN and NA -> priority to NA
309+
na_value = libmissing.NA
310+
else:
311+
na_value = next(iter(na_values))
312+
313+
return StringDtype(storage=storage, na_value=na_value)
314+
287315
def __from_arrow__(
288316
self, array: pyarrow.Array | pyarrow.ChunkedArray
289317
) -> BaseStringArray:

pandas/core/arrays/string_arrow.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -416,18 +416,14 @@ def _str_count(self, pat: str, flags: int = 0):
416416
return self._convert_int_result(result)
417417

418418
def _str_find(self, sub: str, start: int = 0, end: int | None = None):
419-
if start != 0 and end is not None:
420-
slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
421-
result = pc.find_substring(slices, sub)
422-
not_found = pc.equal(result, -1)
423-
offset_result = pc.add(result, end - start)
424-
result = pc.if_else(not_found, result, offset_result)
425-
elif start == 0 and end is None:
426-
slices = self._pa_array
427-
result = pc.find_substring(slices, sub)
428-
else:
419+
if (
420+
pa_version_under13p0
421+
and not (start != 0 and end is not None)
422+
and not (start == 0 and end is None)
423+
):
424+
# GH#59562
429425
return super()._str_find(sub, start, end)
430-
return self._convert_int_result(result)
426+
return ArrowStringArrayMixin._str_find(self, sub, start, end)
431427

432428
def _str_get_dummies(self, sep: str = "|"):
433429
dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep)

0 commit comments

Comments
 (0)