Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`)
- Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`)
- Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`)
- Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`)

.. ---------------------------------------------------------------------------
Expand Down
63 changes: 31 additions & 32 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,20 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None):
return indexer.view(bool)


cdef _maybe_resize_array(ndarray values, loc, max_length):
    """
    Return ``values``, grown if needed so that position ``loc`` is in bounds.

    The length is doubled until it exceeds ``loc`` and is then capped at
    ``max_length`` before a single ``np.resize`` call.

    Parameters
    ----------
    values : ndarray
        Buffer that is about to be written at position ``loc``.
    loc : int
        Highest position the caller intends to write.
    max_length : int
        Upper bound on the length of the returned array.

    Returns
    -------
    ndarray
        ``values`` itself when ``loc`` is already in bounds, otherwise a new,
        longer array produced by ``np.resize``.

    Notes
    -----
    Because the new length is capped at ``max_length``, the result is still
    too short for ``loc`` whenever ``loc >= max_length``; callers must pass a
    ``max_length`` large enough for every position they write.
    """
    cdef:
        Py_ssize_t n = len(values)

    if loc < n:
        # Already large enough; hand the same array back untouched.
        return values

    # Doubling a zero length never makes progress, so start from one to
    # guarantee the loop below terminates for an empty buffer.
    if n == 0:
        n = 1

    while loc >= n:
        n *= 2

    return np.resize(values, min(n, max_length))


# Don't populate hash tables in monotonic indexes larger than this
_SIZE_CUTOFF = 1_000_000

Expand Down Expand Up @@ -456,27 +470,18 @@ cdef class IndexEngine:
# found
if val in d:
key = val

result = _maybe_resize_array(
result,
count + len(d[key]) - 1,
max_alloc
)
for j in d[key]:

# realloc if needed
if count >= n_alloc:
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc
result = np.resize(result, n_alloc)

result[count] = j
count += 1

# value not found
else:

if count >= n_alloc:
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc
result = np.resize(result, n_alloc)
result = _maybe_resize_array(result, count, max_alloc)
result[count] = -1
count += 1
missing[count_missing] = i
Expand Down Expand Up @@ -1214,37 +1219,31 @@ cdef class MaskedIndexEngine(IndexEngine):

if PySequence_GetItem(target_mask, i):
if na_pos:
result = _maybe_resize_array(
result,
count + len(na_pos) - 1,
max_alloc,
)
for na_idx in na_pos:
# realloc if needed
if count >= n_alloc:
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc

result[count] = na_idx
count += 1
continue

elif val in d:
# found
key = val

result = _maybe_resize_array(
result,
count + len(d[key]) - 1,
max_alloc,
)
for j in d[key]:

# realloc if needed
if count >= n_alloc:
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc

result[count] = j
count += 1
continue

# value not found
if count >= n_alloc:
n_alloc += 10_000
result = np.resize(result, n_alloc)
result = _maybe_resize_array(result, count, max_alloc)
result[count] = -1
count += 1
missing[count_missing] = i
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3347,3 +3347,15 @@ def test_getitem_loc_str_periodindex(self):
index = pd.period_range(start="2000", periods=20, freq="B")
series = Series(range(20), index=index)
assert series.loc["2000-01-14"] == 9

def test_loc_nonunique_masked_index(self):
# GH 57027
ids = list(range(11))
index = Index(ids * 1000, dtype="Int64")
df = DataFrame({"val": np.arange(len(index), dtype=np.intp)}, index=index)
result = df.loc[ids]
expected = DataFrame(
{"val": index.argsort(kind="stable").astype(np.intp)},
index=Index(np.array(ids).repeat(1000), dtype="Int64"),
)
tm.assert_frame_equal(result, expected)