Skip to content

Commit 0af06a2

Browse files
committed
Switch merge over to use the proper index
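This commit changes which index the merge result takes when merging on an index, fixing the bug covered by the new `test_merge_index` regression test (GH 57291). It also updates the many tests that asserted the old result indexes.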
1 parent 9c026db commit 0af06a2

File tree

3 files changed

+65
-89
lines changed

3 files changed

+65
-89
lines changed

pandas/core/reshape/merge.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1018,18 +1018,12 @@ def __init__(
10181018
raise ValueError(
10191019
f'Index "{index}" is not supported for merges on both indexes.'
10201020
)
1021+
elif index is not None:
1022+
pass
10211023
elif self.right_index:
1022-
if len(self.left) > 0:
1023-
index = "left"
1024-
else:
1025-
index = "right"
1024+
index = "right"
10261025
elif self.left_index:
1027-
if self.how == "asof":
1028-
index = "left"
1029-
elif len(self.right) > 0:
1030-
index = "right"
1031-
else:
1032-
index = "left"
1026+
index = "left"
10331027
else:
10341028
index = "reset"
10351029

@@ -2172,6 +2166,7 @@ def __init__(
21722166
how=how,
21732167
suffixes=suffixes,
21742168
sort=True, # factorize sorts
2169+
index="left" if left_index or right_index else "reset",
21752170
)
21762171

21772172
def get_result(self) -> DataFrame:

pandas/tests/reshape/merge/test_join.py

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -436,24 +436,7 @@ def test_join_inner_multiindex(self, lexsorted_two_level_string_multiindex):
436436
sort=False,
437437
)
438438

439-
expected2 = merge(
440-
to_join,
441-
data,
442-
right_on=["key1", "key2"],
443-
left_index=True,
444-
how="inner",
445-
sort=False,
446-
)
447-
tm.assert_frame_equal(joined, expected2.reindex_like(joined))
448-
449-
expected2 = merge(
450-
to_join,
451-
data,
452-
right_on=["key1", "key2"],
453-
left_index=True,
454-
how="inner",
455-
sort=False,
456-
)
439+
tm.assert_frame_equal(joined.reset_index(drop=True), expected[joined.columns])
457440

458441
expected = expected.drop(["first", "second"], axis=1)
459442
expected.index = joined.index

pandas/tests/reshape/merge/test_merge.py

Lines changed: 59 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -162,21 +162,28 @@ def test_merge_index_singlekey_inner(self):
162162
{
163163
"key": ["a", "b", "c", "d", "e", "e", "a"],
164164
"v1": np.random.default_rng(2).standard_normal(7),
165+
"left_index": range(7),
165166
}
166167
)
168+
167169
right = DataFrame(
168170
{"v2": np.random.default_rng(2).standard_normal(4)},
169171
index=["d", "b", "c", "a"],
170172
)
171173

172174
# inner join
173175
result = merge(left, right, left_on="key", right_index=True, how="inner")
174-
expected = left.join(right, on="key").loc[result.index]
175-
tm.assert_frame_equal(result, expected)
176+
expected = left.join(right, on="key").loc[result["left_index"]]
177+
tm.assert_frame_equal(
178+
result.reset_index(drop=True), expected.reset_index(drop=True)
179+
)
176180

177181
result = merge(right, left, right_on="key", left_index=True, how="inner")
178-
expected = left.join(right, on="key").loc[result.index]
179-
tm.assert_frame_equal(result, expected.loc[:, result.columns])
182+
expected = left.join(right, on="key").loc[result["left_index"]]
183+
tm.assert_frame_equal(
184+
result.reset_index(drop=True),
185+
expected.loc[:, result.columns].reset_index(drop=True),
186+
)
180187

181188
def test_merge_misspecified(self, df, df2, left):
182189
right = DataFrame(
@@ -349,8 +356,9 @@ def test_handle_join_key_pass_array(self):
349356
right = DataFrame({"rvalue": np.arange(6)})
350357

351358
key = np.array([0, 1, 1, 2, 2, 3], dtype=np.int64)
359+
index = np.array([0, 1, 1, 2, 2, np.nan], dtype=np.float64)
352360
merged = merge(left, right, left_index=True, right_on=key, how="outer")
353-
tm.assert_series_equal(merged["key_0"], Series(key, name="key_0"))
361+
tm.assert_series_equal(merged["key_0"], Series(key, index=index, name="key_0"))
354362

355363
def test_no_overlap_more_informative_error(self):
356364
dt = datetime.now()
@@ -453,6 +461,9 @@ def test_merge_left_empty_right_notempty(self):
453461
)
454462
exp_in = exp_out[0:0] # make empty DataFrame keeping dtype
455463

464+
exp_nan = exp_out.copy()
465+
exp_nan.index = [np.nan] * 3
466+
456467
def check1(exp, kwarg):
457468
result = merge(left, right, how="inner", **kwarg)
458469
tm.assert_frame_equal(result, exp)
@@ -465,12 +476,13 @@ def check2(exp, kwarg):
465476
result = merge(left, right, how="outer", **kwarg)
466477
tm.assert_frame_equal(result, exp)
467478

468-
for kwarg in [
469-
{"left_index": True, "right_index": True},
470-
{"left_index": True, "right_on": "x"},
471-
]:
472-
check1(exp_in, kwarg)
473-
check2(exp_out, kwarg)
479+
kwarg = {"left_index": True, "right_on": "x"}
480+
check1(exp_in, kwarg)
481+
check2(exp_nan, kwarg)
482+
483+
kwarg = {"left_index": True, "right_index": True}
484+
check1(exp_in, kwarg)
485+
check2(exp_out, kwarg)
474486

475487
kwarg = {"left_on": "a", "right_index": True}
476488
check1(exp_in, kwarg)
@@ -762,6 +774,7 @@ def test_other_datetime_unit(self, unit):
762774
"days": days,
763775
},
764776
columns=["entity_id", "days"],
777+
index=[101, 102],
765778
)
766779
assert exp["days"].dtype == exp_dtype
767780
tm.assert_frame_equal(result, exp)
@@ -789,6 +802,7 @@ def test_other_timedelta_unit(self, unit):
789802
exp = DataFrame(
790803
{"entity_id": [101, 102], "days": np.array(["nat", "nat"], dtype=dtype)},
791804
columns=["entity_id", "days"],
805+
index=[101, 102],
792806
)
793807
tm.assert_frame_equal(result, exp)
794808

@@ -1190,7 +1204,7 @@ def test_validation(self):
11901204
"c": ["meow", "bark", "um... weasel noise?", "nay"],
11911205
},
11921206
columns=["b", "a", "c"],
1193-
index=range(4),
1207+
index=Index(["a", "b", "c", "d"], name="a"),
11941208
)
11951209

11961210
left_index_reset = left.set_index("a")
@@ -1331,48 +1345,17 @@ def test_merge_two_empty_df_no_division_error(self):
13311345

13321346
@pytest.mark.parametrize("how", ["right", "outer"])
13331347
@pytest.mark.parametrize(
1334-
"index,expected_index",
1348+
"index",
13351349
[
1336-
(
1337-
CategoricalIndex([1, 2, 4]),
1338-
CategoricalIndex([1, 2, 4, None, None, None]),
1339-
),
1340-
(
1341-
DatetimeIndex(
1342-
["2001-01-01", "2002-02-02", "2003-03-03"], dtype="M8[ns]"
1343-
),
1344-
DatetimeIndex(
1345-
["2001-01-01", "2002-02-02", "2003-03-03", pd.NaT, pd.NaT, pd.NaT],
1346-
dtype="M8[ns]",
1347-
),
1348-
),
1349-
*[
1350-
(
1351-
Index([1, 2, 3], dtype=dtyp),
1352-
Index([1, 2, 3, None, None, None], dtype=np.float64),
1353-
)
1354-
for dtyp in tm.ALL_REAL_NUMPY_DTYPES
1355-
],
1356-
(
1357-
IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)]),
1358-
IntervalIndex.from_tuples(
1359-
[(1, 2), (2, 3), (3, 4), np.nan, np.nan, np.nan]
1360-
),
1361-
),
1362-
(
1363-
PeriodIndex(["2001-01-01", "2001-01-02", "2001-01-03"], freq="D"),
1364-
PeriodIndex(
1365-
["2001-01-01", "2001-01-02", "2001-01-03", pd.NaT, pd.NaT, pd.NaT],
1366-
freq="D",
1367-
),
1368-
),
1369-
(
1370-
TimedeltaIndex(["1D", "2D", "3D"]),
1371-
TimedeltaIndex(["1D", "2D", "3D", pd.NaT, pd.NaT, pd.NaT]),
1372-
),
1350+
CategoricalIndex([1, 2, 4]),
1351+
DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"], dtype="M8[ns]"),
1352+
*[Index([1, 2, 3], dtype=dtyp) for dtyp in tm.ALL_REAL_NUMPY_DTYPES],
1353+
IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)]),
1354+
PeriodIndex(["2001-01-01", "2001-01-02", "2001-01-03"], freq="D"),
1355+
TimedeltaIndex(["1D", "2D", "3D"]),
13731356
],
13741357
)
1375-
def test_merge_on_index_with_more_values(self, how, index, expected_index):
1358+
def test_merge_on_index_with_more_values(self, how, index):
13761359
# GH 24212
13771360
# pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that
13781361
# -1 is interpreted as a missing value instead of the last element
@@ -1390,20 +1373,17 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index):
13901373
],
13911374
columns=["a", "key", "b"],
13921375
)
1393-
expected.set_index(expected_index, inplace=True)
13941376
tm.assert_frame_equal(result, expected)
13951377

13961378
def test_merge_right_index_right(self):
1397-
# Note: the expected output here is probably incorrect.
1398-
# See https://github.com/pandas-dev/pandas/issues/17257 for more.
1399-
# We include this as a regression test for GH-24897.
1379+
# Regression test for GH-24897.
14001380
left = DataFrame({"a": [1, 2, 3], "key": [0, 1, 1]})
14011381
right = DataFrame({"b": [1, 2, 3]})
14021382

14031383
expected = DataFrame(
14041384
{"a": [1, 2, 3, None], "key": [0, 1, 1, 2], "b": [1, 2, 2, 3]},
14051385
columns=["a", "key", "b"],
1406-
index=[0, 1, 2, np.nan],
1386+
index=[0, 1, 1, 2],
14071387
)
14081388
result = left.merge(right, left_on="key", right_index=True, how="right")
14091389
tm.assert_frame_equal(result, expected)
@@ -1436,7 +1416,7 @@ def test_merge_take_missing_values_from_index_of_other_dtype(self):
14361416
"key": Categorical(["a", "a", "b", "c"]),
14371417
"b": [1, 1, 2, 3],
14381418
},
1439-
index=[0, 1, 2, np.nan],
1419+
index=Categorical(["a", "a", "b", "c"], categories=list("abc")),
14401420
)
14411421
expected = expected.reindex(columns=["a", "key", "b"])
14421422
tm.assert_frame_equal(result, expected)
@@ -2661,7 +2641,8 @@ def test_merge_right_left_index():
26612641
"z_x": ["foo", "foo"],
26622642
"x_y": [1, 1],
26632643
"z_y": ["foo", "foo"],
2664-
}
2644+
},
2645+
index=[1, 1],
26652646
)
26662647
tm.assert_frame_equal(result, expected)
26672648

@@ -2670,7 +2651,7 @@ def test_merge_result_empty_index_and_on():
26702651
# GH#33814
26712652
df1 = DataFrame({"a": [1], "b": [2]}).set_index(["a", "b"])
26722653
df2 = DataFrame({"b": [1]}).set_index(["b"])
2673-
expected = DataFrame({"a": [], "b": []}, dtype=np.int64).set_index(["a", "b"])
2654+
expected = DataFrame({"b": []}, dtype=np.int64).set_index(["b"])
26742655
result = merge(df1, df2, left_on=["b"], right_index=True)
26752656
tm.assert_frame_equal(result, expected)
26762657

@@ -2850,7 +2831,9 @@ def test_merge_multiindex_single_level():
28502831
data={"b": [100]},
28512832
index=MultiIndex.from_tuples([("A",), ("C",)], names=["col"]),
28522833
)
2853-
expected = DataFrame({"col": ["A", "B"], "b": [100, np.nan]})
2834+
expected = DataFrame(
2835+
{"col": ["A", "B"], "b": [100, np.nan]}, index=Index([("A",), np.nan])
2836+
)
28542837

28552838
result = df.merge(df2, left_on=["col"], right_index=True, how="left")
28562839
tm.assert_frame_equal(result, expected)
@@ -2957,14 +2940,20 @@ def test_merge_ea_int_and_float_numpy():
29572940
tm.assert_frame_equal(result, expected.astype("float64"))
29582941

29592942

2943+
from pandas.core.dtypes.missing import na_value_for_dtype
2944+
2945+
29602946
def test_merge_arrow_string_index(any_string_dtype):
29612947
# GH#54894
29622948
pytest.importorskip("pyarrow")
29632949
left = DataFrame({"a": ["a", "b"]}, dtype=any_string_dtype)
29642950
right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype=any_string_dtype))
29652951
result = left.merge(right, left_on="a", right_index=True, how="left")
29662952
expected = DataFrame(
2967-
{"a": Series(["a", "b"], dtype=any_string_dtype), "b": [1, np.nan]}
2953+
{"a": Series(["a", "b"], dtype=any_string_dtype), "b": [1.0, np.nan]},
2954+
)
2955+
expected.index = Index(["a"], dtype=any_string_dtype).append(
2956+
Index([na_value_for_dtype(any_string_dtype)])
29682957
)
29692958
tm.assert_frame_equal(result, expected)
29702959

@@ -3022,3 +3011,12 @@ def test_merge_on_all_nan_column():
30223011
{"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6], "zz": [4, 5, 6]}
30233012
)
30243013
tm.assert_frame_equal(result, expected)
3014+
3015+
3016+
def test_merge_index():
3017+
# GH 57291
3018+
dfa = DataFrame(range(10), columns=["a"])
3019+
dfb = DataFrame({"b": range(5), "key": [5 + x for x in range(5)]})
3020+
3021+
result = dfa.merge(dfb, left_index=True, right_on="key", how="left")
3022+
tm.assert_index_equal(result.index, dfa.index)

0 commit comments

Comments
 (0)