
Commit f829533

Merge remote-tracking branch 'upstream/main' into stringmethods-get-dummies
2 parents: 151316d + fa5c255

File tree: 25 files changed (+442 lines, -62 lines)


doc/source/reference/window.rst

Lines changed: 2 additions & 0 deletions
@@ -35,6 +35,7 @@ Rolling window functions
    Rolling.skew
    Rolling.kurt
    Rolling.apply
+   Rolling.pipe
    Rolling.aggregate
    Rolling.quantile
    Rolling.sem
@@ -76,6 +77,7 @@ Expanding window functions
    Expanding.skew
    Expanding.kurt
    Expanding.apply
+   Expanding.pipe
    Expanding.aggregate
    Expanding.quantile
    Expanding.sem
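
The new entries document ``pipe`` on the windowing classes. Presumably it follows the same contract as :meth:`DataFrameGroupBy.pipe`: the callable receives the Rolling/Expanding object itself. A minimal usage sketch (data and variable names are illustrative, not from this commit):

    import pandas as pd

    df = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0]})

    # pipe hands the Rolling object to the callable, so windowed
    # aggregations compose in a single method chain.
    result = df.rolling(window=2).pipe(lambda r: r.mean() + 1)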

doc/source/whatsnew/v2.3.0.rst

Lines changed: 1 addition & 1 deletion
@@ -35,8 +35,8 @@ Other enhancements
 - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called
   when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been
   updated to work correctly with NumPy >= 2 (:issue:`57739`)
+- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`)
 - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
-

 .. ---------------------------------------------------------------------------
 .. _whatsnew_230.notable_bug_fixes:
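
A sketch of what the new string accumulations look like in practice, assuming pyarrow is installed (values illustrative):

    import pandas as pd

    s = pd.Series(["a", "b", None, "c"], dtype="string[pyarrow]")

    # cumsum on strings is a running concatenation; with the default
    # skipna=True the NA position stays NA but accumulation continues.
    s.cumsum()  # ["a", "ab", <NA>, "abc"]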

doc/source/whatsnew/v3.0.0.rst

Lines changed: 2 additions & 0 deletions
@@ -30,6 +30,7 @@ Other enhancements
 ^^^^^^^^^^^^^^^^^^
 - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`)
 - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`)
+- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
 - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
 - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)
 - :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`)
@@ -44,6 +45,7 @@ Other enhancements
 - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
 - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
 - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
+- :class:`Rolling` and :class:`Expanding` now support ``pipe`` method (:issue:`57076`)
 - :class:`Series` now supports the Arrow PyCapsule Interface for export (:issue:`59518`)
 - :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header header cells (:issue:`35384`)
 - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)

pandas/_libs/src/parser/tokenizer.c

Lines changed: 3 additions & 4 deletions
@@ -148,7 +148,7 @@ int parser_init(parser_t *self) {
   self->warn_msg = NULL;

   // token stream
-  self->stream = malloc(STREAM_INIT_SIZE * sizeof(char));
+  self->stream = malloc(STREAM_INIT_SIZE);
   if (self->stream == NULL) {
     parser_cleanup(self);
     return PARSER_OUT_OF_MEMORY;
@@ -221,9 +221,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
   char *orig_ptr = (void *)self->stream;
   TRACE(("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n",
          nbytes))
-  self->stream =
-      (char *)grow_buffer((void *)self->stream, self->stream_len,
-                          &self->stream_cap, nbytes * 2, sizeof(char), &status);
+  self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len,
+                                     &self->stream_cap, nbytes * 2, 1, &status);
   TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, "
          "self->stream_cap=%zu, status=%zu\n",
          self->stream, self->stream_len, self->stream_cap, status))
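
The rationale here (and in the objToJSON.c and period.pyx hunks below) is that the C standard defines ``sizeof(char)`` as exactly 1, so multiplying a byte count by it is redundant. A quick way to confirm the equivalent fact from Python:

    import ctypes

    # sizeof(char) is 1 by definition in C, so malloc(n * sizeof(char))
    # and malloc(n) request the same number of bytes.
    assert ctypes.sizeof(ctypes.c_char) == 1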

pandas/_libs/src/vendored/ujson/python/objToJSON.c

Lines changed: 11 additions & 11 deletions
@@ -984,7 +984,7 @@ static char *List_iterGetName(JSOBJ Py_UNUSED(obj),
 //=============================================================================
 static void Index_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
   GET_TC(tc)->index = 0;
-  GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char));
+  GET_TC(tc)->cStr = PyObject_Malloc(20);
   if (!GET_TC(tc)->cStr) {
     PyErr_NoMemory();
   }
@@ -998,10 +998,10 @@ static int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) {
   const Py_ssize_t index = GET_TC(tc)->index;
   Py_XDECREF(GET_TC(tc)->itemValue);
   if (index == 0) {
-    memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5);
+    memcpy(GET_TC(tc)->cStr, "name", 5);
     GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name");
   } else if (index == 1) {
-    memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5);
+    memcpy(GET_TC(tc)->cStr, "data", 5);
     GET_TC(tc)->itemValue = get_values(obj);
     if (!GET_TC(tc)->itemValue) {
       return 0;
@@ -1033,7 +1033,7 @@ static char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
 static void Series_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
   PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
   GET_TC(tc)->index = 0;
-  GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char));
+  GET_TC(tc)->cStr = PyObject_Malloc(20);
   enc->outputFormat = VALUES; // for contained series
   if (!GET_TC(tc)->cStr) {
     PyErr_NoMemory();
@@ -1048,13 +1048,13 @@ static int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) {
   const Py_ssize_t index = GET_TC(tc)->index;
   Py_XDECREF(GET_TC(tc)->itemValue);
   if (index == 0) {
-    memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5);
+    memcpy(GET_TC(tc)->cStr, "name", 5);
     GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name");
   } else if (index == 1) {
-    memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6);
+    memcpy(GET_TC(tc)->cStr, "index", 6);
     GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index");
   } else if (index == 2) {
-    memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5);
+    memcpy(GET_TC(tc)->cStr, "data", 5);
     GET_TC(tc)->itemValue = get_values(obj);
     if (!GET_TC(tc)->itemValue) {
       return 0;
@@ -1088,7 +1088,7 @@ static char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
 static void DataFrame_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
   PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
   GET_TC(tc)->index = 0;
-  GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char));
+  GET_TC(tc)->cStr = PyObject_Malloc(20);
   enc->outputFormat = VALUES; // for contained series & index
   if (!GET_TC(tc)->cStr) {
     PyErr_NoMemory();
@@ -1103,13 +1103,13 @@ static int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) {
   const Py_ssize_t index = GET_TC(tc)->index;
   Py_XDECREF(GET_TC(tc)->itemValue);
   if (index == 0) {
-    memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8);
+    memcpy(GET_TC(tc)->cStr, "columns", 8);
     GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns");
   } else if (index == 1) {
-    memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6);
+    memcpy(GET_TC(tc)->cStr, "index", 6);
     GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index");
   } else if (index == 2) {
-    memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5);
+    memcpy(GET_TC(tc)->cStr, "data", 5);
     Py_INCREF(obj);
     GET_TC(tc)->itemValue = obj;
   } else {

pandas/_libs/tslibs/period.pyx

Lines changed: 1 addition & 1 deletion
@@ -679,7 +679,7 @@ cdef char* c_strftime(npy_datetimestruct *dts, char *fmt):
     c_date.tm_yday = get_day_of_year(dts.year, dts.month, dts.day) - 1
     c_date.tm_isdst = -1

-    result = <char*>malloc(result_len * sizeof(char))
+    result = <char*>malloc(result_len)
     if result is NULL:
         raise MemoryError()

pandas/api/typing/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -3,6 +3,7 @@
 """

 from pandas._libs import NaTType
+from pandas._libs.lib import NoDefault
 from pandas._libs.missing import NAType

 from pandas.core.groupby import (
@@ -44,6 +45,7 @@
     "JsonReader",
     "NAType",
     "NaTType",
+    "NoDefault",
     "PeriodIndexResamplerGroupby",
     "Resampler",
     "Rolling",

pandas/conftest.py

Lines changed: 16 additions & 0 deletions
@@ -1317,6 +1317,22 @@ def nullable_string_dtype(request):
     return request.param


+@pytest.fixture(
+    params=[
+        pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
+        pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
+    ]
+)
+def pyarrow_string_dtype(request):
+    """
+    Parametrized fixture for string dtypes backed by Pyarrow.
+
+    * 'str[pyarrow]'
+    * 'string[pyarrow]'
+    """
+    return pd.StringDtype(*request.param)
+
+
 @pytest.fixture(
     params=[
         "python",

pandas/core/algorithms.py

Lines changed: 1 addition & 1 deletion
@@ -1012,7 +1012,7 @@ def mode(
         return npresult, res_mask  # type: ignore[return-value]

     try:
-        npresult = np.sort(npresult)
+        npresult = safe_sort(npresult)
     except TypeError as err:
         warnings.warn(
             f"Unable to sort modes: {err}",

pandas/core/arrays/arrow/array.py

Lines changed: 55 additions & 0 deletions
@@ -41,6 +41,7 @@
     is_list_like,
     is_numeric_dtype,
     is_scalar,
+    is_string_dtype,
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import DatetimeTZDtype
@@ -1619,6 +1620,9 @@ def _accumulate(
         ------
         NotImplementedError : subclass does not define accumulations
         """
+        if is_string_dtype(self):
+            return self._str_accumulate(name=name, skipna=skipna, **kwargs)
+
         pyarrow_name = {
             "cummax": "cumulative_max",
             "cummin": "cumulative_min",
@@ -1654,6 +1658,57 @@ def _accumulate(

         return type(self)(result)

+    def _str_accumulate(
+        self, name: str, *, skipna: bool = True, **kwargs
+    ) -> ArrowExtensionArray | ExtensionArray:
+        """
+        Accumulate implementation for strings, see `_accumulate` docstring for details.
+
+        pyarrow.compute does not implement these methods for strings.
+        """
+        if name == "cumprod":
+            msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
+            raise TypeError(msg)
+
+        # We may need to strip out trailing NA values
+        tail: pa.array | None = None
+        na_mask: pa.array | None = None
+        pa_array = self._pa_array
+        np_func = {
+            "cumsum": np.cumsum,
+            "cummin": np.minimum.accumulate,
+            "cummax": np.maximum.accumulate,
+        }[name]
+
+        if self._hasna:
+            na_mask = pc.is_null(pa_array)
+            if pc.all(na_mask) == pa.scalar(True):
+                return type(self)(pa_array)
+            if skipna:
+                if name == "cumsum":
+                    pa_array = pc.fill_null(pa_array, "")
+                else:
+                    # We can retain the running min/max by forward/backward filling.
+                    pa_array = pc.fill_null_forward(pa_array)
+                    pa_array = pc.fill_null_backward(pa_array)
+            else:
+                # When not skipping NA values, the result should be null from
+                # the first NA value onward.
+                idx = pc.index(na_mask, True).as_py()
+                tail = pa.nulls(len(pa_array) - idx, type=pa_array.type)
+                pa_array = pa_array[:idx]
+
+        # error: Cannot call function of unknown type
+        pa_result = pa.array(np_func(pa_array), type=pa_array.type)  # type: ignore[operator]
+
+        if tail is not None:
+            pa_result = pa.concat_arrays([pa_result, tail])
+        elif na_mask is not None:
+            pa_result = pc.if_else(na_mask, None, pa_result)
+
+        result = type(self)(pa_result)
+        return result
+
     def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar:
         """
         Return a pyarrow scalar result of performing the reduction operation.
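
The NA handling above is the subtle part: under ``skipna=True`` nulls are filled just enough to keep the running result going and then restored via the mask, while ``skipna=False`` truncates at the first null and pads the tail with nulls. A behavior sketch derived from that logic (assuming pyarrow is installed):

    import pandas as pd

    s = pd.Series(["b", "a", None, "c"], dtype="string[pyarrow]")

    s.cummax()              # ["b", "b", <NA>, "c"]  (running max continues past the NA)
    s.cummax(skipna=False)  # ["b", "b", <NA>, <NA>] (null from the first NA onward)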
