Skip to content

Commit 9c026db

Browse files
committed
Add choice of index source internally
This commit introduces an index argument that tells merge how to choose the output index. I moved the logic that determines which index to use into _MergeOperation.__init__, and moved the logic that calculates the index to the end of the _get_join_info method. Note: I did not adjust the logic for the case of merging on both indexes (or on an index and a MultiIndex). This commit also introduces a function, merge_pick_index, so that internal uses of merge can pick where they get the index from. Specifically, join uses merge under the hood but wants the old merge index sources. Now join specifies the index source explicitly, so it won't be affected by changes to merge's index behavior. This commit should not affect behavior.
1 parent 220c18d commit 9c026db

File tree

2 files changed

+100
-52
lines changed

2 files changed

+100
-52
lines changed

pandas/core/frame.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10700,7 +10700,7 @@ def join(
1070010700
5 K1 A5 B1
1070110701
"""
1070210702
from pandas.core.reshape.concat import concat
10703-
from pandas.core.reshape.merge import merge
10703+
from pandas.core.reshape.merge import merge_pick_index
1070410704

1070510705
if isinstance(other, Series):
1070610706
if other.name is None:
@@ -10709,16 +10709,17 @@ def join(
1070910709

1071010710
if isinstance(other, DataFrame):
1071110711
if how == "cross":
10712-
return merge(
10712+
return merge_pick_index(
1071310713
self,
1071410714
other,
1071510715
how=how,
1071610716
on=on,
1071710717
suffixes=(lsuffix, rsuffix),
1071810718
sort=sort,
1071910719
validate=validate,
10720+
index="left",
1072010721
)
10721-
return merge(
10722+
return merge_pick_index(
1072210723
self,
1072310724
other,
1072410725
left_on=on,
@@ -10728,6 +10729,7 @@ def join(
1072810729
suffixes=(lsuffix, rsuffix),
1072910730
sort=sort,
1073010731
validate=validate,
10732+
index=None if on is None else "left",
1073110733
)
1073210734
else:
1073310735
if on is not None:
@@ -10763,7 +10765,7 @@ def join(
1076310765
joined = frames[0]
1076410766

1076510767
for frame in frames[1:]:
10766-
joined = merge(
10768+
joined = merge_pick_index(
1076710769
joined,
1076810770
frame,
1076910771
how=how,

pandas/core/reshape/merge.py

Lines changed: 94 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,51 @@ def merge(
347347
2 bar 7
348348
3 bar 8
349349
"""
350+
return merge_pick_index(
351+
left,
352+
right,
353+
how,
354+
on,
355+
left_on,
356+
right_on,
357+
left_index,
358+
right_index,
359+
sort,
360+
suffixes,
361+
copy,
362+
indicator,
363+
validate,
364+
)
365+
366+
367+
def merge_pick_index(
368+
left: DataFrame | Series,
369+
right: DataFrame | Series,
370+
how: MergeHow = "inner",
371+
on: IndexLabel | AnyArrayLike | None = None,
372+
left_on: IndexLabel | AnyArrayLike | None = None,
373+
right_on: IndexLabel | AnyArrayLike | None = None,
374+
left_index: bool = False,
375+
right_index: bool = False,
376+
sort: bool = False,
377+
suffixes: Suffixes = ("_x", "_y"),
378+
copy: bool | lib.NoDefault = lib.no_default,
379+
indicator: str | bool = False,
380+
validate: str | None = None,
381+
index: Literal["left", "right", "reset"] | None = None,
382+
) -> DataFrame:
383+
"""A helper function for merge that returns a specified index.
384+
385+
If index is "left" or "right" then the returned DataFrame will
386+
use the index from the left or right DataFrames respectively.
387+
388+
If index is "reset" then the DataFrame will have the default
389+
index: zero for the first row, one for the second, etc.
390+
391+
If index is None then the value will be inferred based on the
392+
merge. If merging on both indexes then None is the only accepted
393+
value.
394+
"""
350395
left_df = _validate_operand(left)
351396
left._check_copy_deprecation(copy)
352397
right_df = _validate_operand(right)
@@ -378,6 +423,7 @@ def merge(
378423
suffixes=suffixes,
379424
indicator=indicator,
380425
validate=validate,
426+
index=index,
381427
)
382428
return op.get_result()
383429

@@ -932,6 +978,7 @@ class _MergeOperation:
932978
join_names: list[Hashable]
933979
right_join_keys: list[ArrayLike]
934980
left_join_keys: list[ArrayLike]
981+
index: Literal["left", "right", "reset"] | None
935982

936983
def __init__(
937984
self,
@@ -947,6 +994,7 @@ def __init__(
947994
suffixes: Suffixes = ("_x", "_y"),
948995
indicator: str | bool = False,
949996
validate: str | None = None,
997+
index: Literal["left", "right", "reset"] | None = None,
950998
) -> None:
951999
_left = _validate_operand(left)
9521000
_right = _validate_operand(right)
@@ -964,6 +1012,29 @@ def __init__(
9641012

9651013
self.indicator = indicator
9661014

1015+
# Identify which index will be used for the output
1016+
if self.left_index and self.right_index and self.how != "asof":
1017+
if index is not None:
1018+
raise ValueError(
1019+
f'Index "{index}" is not supported for merges on both indexes.'
1020+
)
1021+
elif self.right_index:
1022+
if len(self.left) > 0:
1023+
index = "left"
1024+
else:
1025+
index = "right"
1026+
elif self.left_index:
1027+
if self.how == "asof":
1028+
index = "left"
1029+
elif len(self.right) > 0:
1030+
index = "right"
1031+
else:
1032+
index = "left"
1033+
else:
1034+
index = "reset"
1035+
1036+
self.index = index
1037+
9671038
if not is_bool(left_index):
9681039
raise ValueError(
9691040
f"left_index parameter must be of type bool, not {type(left_index)}"
@@ -1341,53 +1412,32 @@ def _get_join_info(
13411412
)
13421413

13431414
elif self.right_index and self.how == "left":
1344-
join_index, left_indexer, right_indexer = _left_join_on_index(
1415+
left_indexer, right_indexer = _left_join_on_index(
13451416
left_ax, right_ax, self.left_join_keys, sort=self.sort
13461417
)
13471418

13481419
elif self.left_index and self.how == "right":
1349-
join_index, right_indexer, left_indexer = _left_join_on_index(
1420+
right_indexer, left_indexer = _left_join_on_index(
13501421
right_ax, left_ax, self.right_join_keys, sort=self.sort
13511422
)
13521423
else:
1353-
(left_indexer, right_indexer) = self._get_join_indexers()
1424+
left_indexer, right_indexer = self._get_join_indexers()
13541425

1355-
if self.right_index:
1356-
if len(self.left) > 0:
1357-
join_index = self._create_join_index(
1358-
left_ax,
1359-
right_ax,
1360-
left_indexer,
1361-
how="right",
1362-
)
1363-
elif right_indexer is None:
1364-
join_index = right_ax.copy()
1365-
else:
1366-
join_index = right_ax.take(right_indexer)
1367-
elif self.left_index:
1368-
if self.how == "asof":
1369-
# GH#33463 asof should always behave like a left merge
1370-
join_index = self._create_join_index(
1371-
left_ax,
1372-
right_ax,
1373-
left_indexer,
1374-
how="left",
1375-
)
1376-
1377-
elif len(self.right) > 0:
1378-
join_index = self._create_join_index(
1379-
right_ax,
1380-
left_ax,
1381-
right_indexer,
1382-
how="left",
1383-
)
1384-
elif left_indexer is None:
1385-
join_index = left_ax.copy()
1386-
else:
1387-
join_index = left_ax.take(left_indexer)
1388-
else:
1389-
n = len(left_ax) if left_indexer is None else len(left_indexer)
1390-
join_index = default_index(n)
1426+
if self.index == "left":
1427+
join_index = self._create_join_index(
1428+
left_ax,
1429+
right_ax,
1430+
left_indexer,
1431+
)
1432+
elif self.index == "right":
1433+
join_index = self._create_join_index(
1434+
right_ax,
1435+
left_ax,
1436+
right_indexer,
1437+
)
1438+
elif self.index == "reset":
1439+
n = len(left_ax) if left_indexer is None else len(left_indexer)
1440+
join_index = default_index(n)
13911441

13921442
return join_index, left_indexer, right_indexer
13931443

@@ -1397,7 +1447,6 @@ def _create_join_index(
13971447
index: Index,
13981448
other_index: Index,
13991449
indexer: npt.NDArray[np.intp] | None,
1400-
how: JoinHow = "left",
14011450
) -> Index:
14021451
"""
14031452
Create a join index by rearranging one index to match another
@@ -1407,17 +1456,15 @@ def _create_join_index(
14071456
index : Index
14081457
index being rearranged
14091458
other_index : Index
1410-
used to supply values not found in index
1459+
do not fill with nulls if the other_index is a MultiIndex
14111460
indexer : np.ndarray[np.intp] or None
14121461
how to rearrange index
1413-
how : str
1414-
Replacement is only necessary if indexer based on other_index.
14151462
14161463
Returns
14171464
-------
14181465
Index
14191466
"""
1420-
if self.how in (how, "outer") and not isinstance(other_index, MultiIndex):
1467+
if not isinstance(other_index, MultiIndex):
14211468
# if final index requires values in other_index but not target
14221469
# index, indexer may hold missing (-1) values, causing Index.take
14231470
# to take the final value in target index. So, we set the last
@@ -2574,7 +2621,7 @@ def _get_no_sort_one_missing_indexer(
25742621

25752622
def _left_join_on_index(
25762623
left_ax: Index, right_ax: Index, join_keys: list[ArrayLike], sort: bool = False
2577-
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]:
2624+
) -> tuple[npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]:
25782625
if isinstance(right_ax, MultiIndex):
25792626
lkey, rkey = _get_multiindex_indexer(join_keys, right_ax, sort=sort)
25802627
else:
@@ -2593,11 +2640,10 @@ def _left_join_on_index(
25932640

25942641
if sort or len(left_ax) != len(left_indexer):
25952642
# if asked to sort or there are 1-to-many matches
2596-
join_index = left_ax.take(left_indexer)
2597-
return join_index, left_indexer, right_indexer
2643+
return left_indexer, right_indexer
25982644

25992645
# left frame preserves order & length of its index
2600-
return left_ax, None, right_indexer
2646+
return None, right_indexer
26012647

26022648

26032649
def _factorize_keys(

0 commit comments

Comments
 (0)