Skip to content

Commit c32fd1f

Browse files
authored
Merge branch 'main' into bugfix-spss-kwargs
2 parents 186b457 + b6fb905 commit c32fd1f

File tree

16 files changed

+231
-62
lines changed

16 files changed

+231
-62
lines changed

doc/source/whatsnew/v2.2.1.rst

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ Fixed regressions
1717
- Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`)
1818
- Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`)
1919
- Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`)
20+
- Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`)
2021
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`)
2122
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`)
2223
- Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`)
@@ -27,14 +28,15 @@ Fixed regressions
2728

2829
Bug fixes
2930
~~~~~~~~~
30-
-
31+
- Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`)
3132

3233
.. ---------------------------------------------------------------------------
3334
.. _whatsnew_221.other:
3435

3536
Other
3637
~~~~~
37-
-
38+
- Added the argument ``skipna`` to :meth:`DataFrameGroupBy.first`, :meth:`DataFrameGroupBy.last`, :meth:`SeriesGroupBy.first`, and :meth:`SeriesGroupBy.last`; the ``skipna=False`` behavior used to be available via :meth:`DataFrameGroupBy.nth`, but that behavior was changed in pandas 2.0.0 (:issue:`57019`)
39+
- Added the argument ``skipna`` to :meth:`Resampler.first` and :meth:`Resampler.last` (:issue:`57019`)
3840

3941
.. ---------------------------------------------------------------------------
4042
.. _whatsnew_221.contributors:

pandas/_libs/groupby.pyi

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ def group_last(
136136
result_mask: npt.NDArray[np.bool_] | None = ...,
137137
min_count: int = ..., # Py_ssize_t
138138
is_datetimelike: bool = ...,
139+
skipna: bool = ...,
139140
) -> None: ...
140141
def group_nth(
141142
out: np.ndarray, # rank_t[:, ::1]
@@ -147,6 +148,7 @@ def group_nth(
147148
min_count: int = ..., # int64_t
148149
rank: int = ..., # int64_t
149150
is_datetimelike: bool = ...,
151+
skipna: bool = ...,
150152
) -> None: ...
151153
def group_rank(
152154
out: np.ndarray, # float64_t[:, ::1]

pandas/_libs/groupby.pyx

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1428,6 +1428,7 @@ def group_last(
14281428
uint8_t[:, ::1] result_mask=None,
14291429
Py_ssize_t min_count=-1,
14301430
bint is_datetimelike=False,
1431+
bint skipna=True,
14311432
) -> None:
14321433
"""
14331434
Only aggregates on axis=0
@@ -1462,14 +1463,19 @@ def group_last(
14621463
for j in range(K):
14631464
val = values[i, j]
14641465

1465-
if uses_mask:
1466-
isna_entry = mask[i, j]
1467-
else:
1468-
isna_entry = _treat_as_na(val, is_datetimelike)
1466+
if skipna:
1467+
if uses_mask:
1468+
isna_entry = mask[i, j]
1469+
else:
1470+
isna_entry = _treat_as_na(val, is_datetimelike)
1471+
if isna_entry:
1472+
continue
14691473

1470-
if not isna_entry:
1471-
nobs[lab, j] += 1
1472-
resx[lab, j] = val
1474+
nobs[lab, j] += 1
1475+
resx[lab, j] = val
1476+
1477+
if uses_mask and not skipna:
1478+
result_mask[lab, j] = mask[i, j]
14731479

14741480
_check_below_mincount(
14751481
out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx
@@ -1490,6 +1496,7 @@ def group_nth(
14901496
int64_t min_count=-1,
14911497
int64_t rank=1,
14921498
bint is_datetimelike=False,
1499+
bint skipna=True,
14931500
) -> None:
14941501
"""
14951502
Only aggregates on axis=0
@@ -1524,15 +1531,19 @@ def group_nth(
15241531
for j in range(K):
15251532
val = values[i, j]
15261533

1527-
if uses_mask:
1528-
isna_entry = mask[i, j]
1529-
else:
1530-
isna_entry = _treat_as_na(val, is_datetimelike)
1534+
if skipna:
1535+
if uses_mask:
1536+
isna_entry = mask[i, j]
1537+
else:
1538+
isna_entry = _treat_as_na(val, is_datetimelike)
1539+
if isna_entry:
1540+
continue
15311541

1532-
if not isna_entry:
1533-
nobs[lab, j] += 1
1534-
if nobs[lab, j] == rank:
1535-
resx[lab, j] = val
1542+
nobs[lab, j] += 1
1543+
if nobs[lab, j] == rank:
1544+
resx[lab, j] = val
1545+
if uses_mask and not skipna:
1546+
result_mask[lab, j] = mask[i, j]
15361547

15371548
_check_below_mincount(
15381549
out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx

pandas/_libs/index.pyx

Lines changed: 31 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,20 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None):
9898
return indexer.view(bool)
9999

100100

101+
cdef _maybe_resize_array(ndarray values, Py_ssize_t loc, Py_ssize_t max_length):
102+
"""
103+
Resize array if loc is out of bounds.
104+
"""
105+
cdef:
106+
Py_ssize_t n = len(values)
107+
108+
if loc >= n:
109+
while loc >= n:
110+
n *= 2
111+
values = np.resize(values, min(n, max_length))
112+
return values
113+
114+
101115
# Don't populate hash tables in monotonic indexes larger than this
102116
_SIZE_CUTOFF = 1_000_000
103117

@@ -456,27 +470,18 @@ cdef class IndexEngine:
456470
# found
457471
if val in d:
458472
key = val
459-
473+
result = _maybe_resize_array(
474+
result,
475+
count + len(d[key]) - 1,
476+
max_alloc
477+
)
460478
for j in d[key]:
461-
462-
# realloc if needed
463-
if count >= n_alloc:
464-
n_alloc *= 2
465-
if n_alloc > max_alloc:
466-
n_alloc = max_alloc
467-
result = np.resize(result, n_alloc)
468-
469479
result[count] = j
470480
count += 1
471481

472482
# value not found
473483
else:
474-
475-
if count >= n_alloc:
476-
n_alloc *= 2
477-
if n_alloc > max_alloc:
478-
n_alloc = max_alloc
479-
result = np.resize(result, n_alloc)
484+
result = _maybe_resize_array(result, count, max_alloc)
480485
result[count] = -1
481486
count += 1
482487
missing[count_missing] = i
@@ -1214,37 +1219,31 @@ cdef class MaskedIndexEngine(IndexEngine):
12141219

12151220
if PySequence_GetItem(target_mask, i):
12161221
if na_pos:
1222+
result = _maybe_resize_array(
1223+
result,
1224+
count + len(na_pos) - 1,
1225+
max_alloc,
1226+
)
12171227
for na_idx in na_pos:
1218-
# realloc if needed
1219-
if count >= n_alloc:
1220-
n_alloc *= 2
1221-
if n_alloc > max_alloc:
1222-
n_alloc = max_alloc
1223-
12241228
result[count] = na_idx
12251229
count += 1
12261230
continue
12271231

12281232
elif val in d:
12291233
# found
12301234
key = val
1231-
1235+
result = _maybe_resize_array(
1236+
result,
1237+
count + len(d[key]) - 1,
1238+
max_alloc,
1239+
)
12321240
for j in d[key]:
1233-
1234-
# realloc if needed
1235-
if count >= n_alloc:
1236-
n_alloc *= 2
1237-
if n_alloc > max_alloc:
1238-
n_alloc = max_alloc
1239-
12401241
result[count] = j
12411242
count += 1
12421243
continue
12431244

12441245
# value not found
1245-
if count >= n_alloc:
1246-
n_alloc += 10_000
1247-
result = np.resize(result, n_alloc)
1246+
result = _maybe_resize_array(result, count, max_alloc)
12481247
result[count] = -1
12491248
count += 1
12501249
missing[count_missing] = i

pandas/_libs/ops.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ from pandas._libs.util cimport is_nan
2929

3030
@cython.wraparound(False)
3131
@cython.boundscheck(False)
32-
def scalar_compare(object[:] values, object val, object op) -> ndarray:
32+
def scalar_compare(ndarray[object] values, object val, object op) -> ndarray:
3333
"""
3434
Compare each element of `values` array with the scalar `val`, with
3535
the comparison operation described by `op`.

pandas/_testing/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,11 +235,18 @@
235235
+ TIMEDELTA_PYARROW_DTYPES
236236
+ BOOL_PYARROW_DTYPES
237237
)
238+
ALL_REAL_PYARROW_DTYPES_STR_REPR = (
239+
ALL_INT_PYARROW_DTYPES_STR_REPR + FLOAT_PYARROW_DTYPES_STR_REPR
240+
)
238241
else:
239242
FLOAT_PYARROW_DTYPES_STR_REPR = []
240243
ALL_INT_PYARROW_DTYPES_STR_REPR = []
241244
ALL_PYARROW_DTYPES = []
245+
ALL_REAL_PYARROW_DTYPES_STR_REPR = []
242246

247+
ALL_REAL_NULLABLE_DTYPES = (
248+
FLOAT_NUMPY_DTYPES + ALL_REAL_EXTENSION_DTYPES + ALL_REAL_PYARROW_DTYPES_STR_REPR
249+
)
243250

244251
arithmetic_dunder_methods = [
245252
"__add__",

pandas/conftest.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1703,6 +1703,38 @@ def any_numpy_dtype(request):
17031703
return request.param
17041704

17051705

1706+
@pytest.fixture(params=tm.ALL_REAL_NULLABLE_DTYPES)
1707+
def any_real_nullable_dtype(request):
1708+
"""
1709+
Parameterized fixture for all real dtypes that can hold NA.
1710+
1711+
* float
1712+
* 'float32'
1713+
* 'float64'
1714+
* 'Float32'
1715+
* 'Float64'
1716+
* 'UInt8'
1717+
* 'UInt16'
1718+
* 'UInt32'
1719+
* 'UInt64'
1720+
* 'Int8'
1721+
* 'Int16'
1722+
* 'Int32'
1723+
* 'Int64'
1724+
* 'uint8[pyarrow]'
1725+
* 'uint16[pyarrow]'
1726+
* 'uint32[pyarrow]'
1727+
* 'uint64[pyarrow]'
1728+
* 'int8[pyarrow]'
1729+
* 'int16[pyarrow]'
1730+
* 'int32[pyarrow]'
1731+
* 'int64[pyarrow]'
1732+
* 'float[pyarrow]'
1733+
* 'double[pyarrow]'
1734+
"""
1735+
return request.param
1736+
1737+
17061738
@pytest.fixture(params=tm.ALL_NUMERIC_DTYPES)
17071739
def any_numeric_dtype(request):
17081740
"""

pandas/core/common.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,8 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi
236236
values = list(values)
237237
elif isinstance(values, ABCIndex):
238238
return values._values
239+
elif isinstance(values, ABCSeries):
240+
return values._values
239241

240242
if isinstance(values, list) and dtype in [np.object_, object]:
241243
return construct_1d_object_array_from_listlike(values)

pandas/core/groupby/groupby.py

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3364,22 +3364,31 @@ def max(
33643364
)
33653365

33663366
@final
3367-
def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT:
3367+
def first(
3368+
self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True
3369+
) -> NDFrameT:
33683370
"""
3369-
Compute the first non-null entry of each column.
3371+
Compute the first entry of each column within each group.
3372+
3373+
Defaults to skipping NA elements.
33703374
33713375
Parameters
33723376
----------
33733377
numeric_only : bool, default False
33743378
Include only float, int, boolean columns.
33753379
min_count : int, default -1
33763380
The required number of valid values to perform the operation. If fewer
3377-
than ``min_count`` non-NA values are present the result will be NA.
3381+
than ``min_count`` valid values are present the result will be NA.
3382+
skipna : bool, default True
3383+
Exclude NA/null values. If an entire row/column is NA, the result
3384+
will be NA.
3385+
3386+
.. versionadded:: 2.2.1
33783387
33793388
Returns
33803389
-------
33813390
Series or DataFrame
3382-
First non-null of values within each group.
3391+
First values within each group.
33833392
33843393
See Also
33853394
--------
@@ -3431,12 +3440,17 @@ def first(x: Series):
34313440
min_count=min_count,
34323441
alias="first",
34333442
npfunc=first_compat,
3443+
skipna=skipna,
34343444
)
34353445

34363446
@final
3437-
def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT:
3447+
def last(
3448+
self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True
3449+
) -> NDFrameT:
34383450
"""
3439-
Compute the last non-null entry of each column.
3451+
Compute the last entry of each column within each group.
3452+
3453+
Defaults to skipping NA elements.
34403454
34413455
Parameters
34423456
----------
@@ -3445,12 +3459,17 @@ def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT:
34453459
everything, then use only numeric data.
34463460
min_count : int, default -1
34473461
The required number of valid values to perform the operation. If fewer
3448-
than ``min_count`` non-NA values are present the result will be NA.
3462+
than ``min_count`` valid values are present the result will be NA.
3463+
skipna : bool, default True
3464+
Exclude NA/null values. If an entire row/column is NA, the result
3465+
will be NA.
3466+
3467+
.. versionadded:: 2.2.1
34493468
34503469
Returns
34513470
-------
34523471
Series or DataFrame
3453-
Last non-null of values within each group.
3472+
Last values within each group.
34543473
34553474
See Also
34563475
--------
@@ -3490,6 +3509,7 @@ def last(x: Series):
34903509
min_count=min_count,
34913510
alias="last",
34923511
npfunc=last_compat,
3512+
skipna=skipna,
34933513
)
34943514

34953515
@final

0 commit comments

Comments
 (0)