diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index da0d85b7bb529..1d6f401b26e4e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -664,6 +664,7 @@ Reshaping - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) - Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`) - Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`) +- Bug in :meth:`DataFrame.merge` where merging on an index could return a dataframe with the wrong index. (:issue:`57291`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b84fb33af26e5..41da5d041c38c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10700,7 +10700,7 @@ def join( 5 K1 A5 B1 """ from pandas.core.reshape.concat import concat - from pandas.core.reshape.merge import merge + from pandas.core.reshape.merge import merge_pick_index if isinstance(other, Series): if other.name is None: @@ -10709,7 +10709,7 @@ def join( if isinstance(other, DataFrame): if how == "cross": - return merge( + return merge_pick_index( self, other, how=how, @@ -10717,8 +10717,9 @@ def join( suffixes=(lsuffix, rsuffix), sort=sort, validate=validate, + index="left", ) - return merge( + return merge_pick_index( self, other, left_on=on, @@ -10728,6 +10729,7 @@ def join( suffixes=(lsuffix, rsuffix), sort=sort, validate=validate, + index=None if on is None else "left", ) else: if on is not None: @@ -10763,7 +10765,7 @@ def join( joined = frames[0] for frame in frames[1:]: - joined = merge( + joined = merge_pick_index( joined, frame, how=how, diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 07e8fa4841c04..563f61e4d5662 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -347,6 +347,51 @@ def merge( 2 bar 7 3 bar 8 """ + return merge_pick_index( + left, + right, + how, + on, + left_on, + right_on, + left_index, + right_index, + sort, + suffixes, + copy, + indicator, + validate, + ) + + +def merge_pick_index( + left: DataFrame | Series, + right: DataFrame | Series, + how: MergeHow = "inner", + on: IndexLabel | AnyArrayLike | None = None, + left_on: IndexLabel | AnyArrayLike | None = None, + right_on: IndexLabel | AnyArrayLike | None = None, + left_index: bool = False, + right_index: bool = False, + sort: bool = False, + suffixes: Suffixes = ("_x", "_y"), + copy: bool | lib.NoDefault = lib.no_default, + indicator: str | bool = False, + validate: str | None = None, + index: Literal["left", "right", "reset"] | None = None, +) -> DataFrame: + """A helper function for merge that returns a specified index. + + If index is "left" or "right" then the returned DataFrame will + use the index from the left or right DataFrames respectively. + + If index is "reset" then the DataFrame will have the default + index: zero for the first row, one for the second, etc. + + If index is None then the value will be inferred based on the + merge. If merging on both indexes then None is the only accepted + value. + """ left_df = _validate_operand(left) left._check_copy_deprecation(copy) right_df = _validate_operand(right) @@ -378,6 +423,7 @@ def merge( suffixes=suffixes, indicator=indicator, validate=validate, + index=index, ) return op.get_result() @@ -932,6 +978,7 @@ class _MergeOperation: join_names: list[Hashable] right_join_keys: list[ArrayLike] left_join_keys: list[ArrayLike] + index: Literal["left", "right", "reset"] | None def __init__( self, @@ -947,6 +994,7 @@ def __init__( suffixes: Suffixes = ("_x", "_y"), indicator: str | bool = False, validate: str | None = None, + index: Literal["left", "right", "reset"] | None = None, ) -> None: _left = _validate_operand(left) _right = _validate_operand(right) @@ -964,6 +1012,23 @@ def __init__( self.indicator = indicator + # Identify which index will be used for the output + if self.left_index and self.right_index and self.how != "asof": + if index is not None: + raise ValueError( + f'Index "{index}" is not supported for merges on both indexes.' + ) + elif index is not None: + pass + elif self.right_index: + index = "right" + elif self.left_index: + index = "left" + else: + index = "reset" + + self.index = index + if not is_bool(left_index): raise ValueError( f"left_index parameter must be of type bool, not {type(left_index)}" @@ -1341,53 +1406,32 @@ def _get_join_info( ) elif self.right_index and self.how == "left": - join_index, left_indexer, right_indexer = _left_join_on_index( + left_indexer, right_indexer = _left_join_on_index( left_ax, right_ax, self.left_join_keys, sort=self.sort ) elif self.left_index and self.how == "right": - join_index, right_indexer, left_indexer = _left_join_on_index( + right_indexer, left_indexer = _left_join_on_index( right_ax, left_ax, self.right_join_keys, sort=self.sort ) else: - (left_indexer, right_indexer) = self._get_join_indexers() + left_indexer, right_indexer = self._get_join_indexers() - if self.right_index: - if len(self.left) > 0: - join_index = self._create_join_index( - left_ax, - right_ax, - left_indexer, - how="right", - ) - elif right_indexer is None: - join_index = right_ax.copy() - else: - join_index = right_ax.take(right_indexer) - elif self.left_index: - if self.how == "asof": - # GH#33463 asof should always behave like a left merge - join_index = self._create_join_index( - left_ax, - right_ax, - left_indexer, - how="left", - ) - - elif len(self.right) > 0: - join_index = self._create_join_index( - right_ax, - left_ax, - right_indexer, - how="left", - ) - elif left_indexer is None: - join_index = left_ax.copy() - else: - join_index = left_ax.take(left_indexer) - else: - n = len(left_ax) if left_indexer is None else len(left_indexer) - join_index = default_index(n) + if self.index == "left": + join_index = self._create_join_index( + left_ax, + right_ax, + left_indexer, + ) + elif self.index == "right": + join_index = self._create_join_index( + right_ax, + left_ax, + right_indexer, + ) + elif self.index == "reset": + n = len(left_ax) if left_indexer is None else len(left_indexer) + join_index = default_index(n) return join_index, left_indexer, right_indexer @@ -1397,7 +1441,6 @@ def _create_join_index( index: Index, other_index: Index, indexer: npt.NDArray[np.intp] | None, - how: JoinHow = "left", ) -> Index: """ Create a join index by rearranging one index to match another @@ -1407,17 +1450,15 @@ def _create_join_index( index : Index index being rearranged other_index : Index - used to supply values not found in index + do not fill with nulls if the other_index is a MultiIndex indexer : np.ndarray[np.intp] or None how to rearrange index - how : str - Replacement is only necessary if indexer based on other_index. Returns ------- Index """ - if self.how in (how, "outer") and not isinstance(other_index, MultiIndex): + if not isinstance(other_index, MultiIndex): # if final index requires values in other_index but not target # index, indexer may hold missing (-1) values, causing Index.take # to take the final value in target index. So, we set the last @@ -2125,6 +2166,7 @@ def __init__( how=how, suffixes=suffixes, sort=True, # factorize sorts + index="left" if left_index or right_index else "reset", ) def get_result(self) -> DataFrame: @@ -2574,7 +2616,7 @@ def _get_no_sort_one_missing_indexer( def _left_join_on_index( left_ax: Index, right_ax: Index, join_keys: list[ArrayLike], sort: bool = False -) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]: +) -> tuple[npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]: if isinstance(right_ax, MultiIndex): lkey, rkey = _get_multiindex_indexer(join_keys, right_ax, sort=sort) else: @@ -2593,11 +2635,10 @@ def _left_join_on_index( if sort or len(left_ax) != len(left_indexer): # if asked to sort or there are 1-to-many matches - join_index = left_ax.take(left_indexer) - return join_index, left_indexer, right_indexer + return left_indexer, right_indexer # left frame preserves order & length of its index - return left_ax, None, right_indexer + return None, right_indexer def _factorize_keys( diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 0f743332acbbe..1afeac051d66c 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -436,24 +436,7 @@ def test_join_inner_multiindex(self, lexsorted_two_level_string_multiindex): sort=False, ) - expected2 = merge( - to_join, - data, - right_on=["key1", "key2"], - left_index=True, - how="inner", - sort=False, - ) - tm.assert_frame_equal(joined, expected2.reindex_like(joined)) - - expected2 = merge( - to_join, - data, - right_on=["key1", "key2"], - left_index=True, - how="inner", - sort=False, - ) + tm.assert_frame_equal(joined.reset_index(drop=True), expected[joined.columns]) expected = expected.drop(["first", "second"], axis=1) expected.index = joined.index diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d4766242b8460..d8cad84c52d5b 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -162,8 +162,10 @@ def test_merge_index_singlekey_inner(self): { "key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.default_rng(2).standard_normal(7), + "left_index": range(7), } ) + right = DataFrame( {"v2": np.random.default_rng(2).standard_normal(4)}, index=["d", "b", "c", "a"], @@ -171,12 +173,17 @@ def test_merge_index_singlekey_inner(self): # inner join result = merge(left, right, left_on="key", right_index=True, how="inner") - expected = left.join(right, on="key").loc[result.index] - tm.assert_frame_equal(result, expected) + expected = left.join(right, on="key").loc[result["left_index"]] + tm.assert_frame_equal( + result.reset_index(drop=True), expected.reset_index(drop=True) + ) result = merge(right, left, right_on="key", left_index=True, how="inner") - expected = left.join(right, on="key").loc[result.index] - tm.assert_frame_equal(result, expected.loc[:, result.columns]) + expected = left.join(right, on="key").loc[result["left_index"]] + tm.assert_frame_equal( + result.reset_index(drop=True), + expected.loc[:, result.columns].reset_index(drop=True), + ) def test_merge_misspecified(self, df, df2, left): right = DataFrame( @@ -349,8 +356,9 @@ def test_handle_join_key_pass_array(self): right = DataFrame({"rvalue": np.arange(6)}) key = np.array([0, 1, 1, 2, 2, 3], dtype=np.int64) + index = np.array([0, 1, 1, 2, 2, np.nan], dtype=np.float64) merged = merge(left, right, left_index=True, right_on=key, how="outer") - tm.assert_series_equal(merged["key_0"], Series(key, name="key_0")) + tm.assert_series_equal(merged["key_0"], Series(key, index=index, name="key_0")) def test_no_overlap_more_informative_error(self): dt = datetime.now() @@ -453,6 +461,9 @@ def test_merge_left_empty_right_notempty(self): ) exp_in = exp_out[0:0] # make empty DataFrame keeping dtype + exp_nan = exp_out.copy() + exp_nan.index = [np.nan] * 3 + def check1(exp, kwarg): result = merge(left, right, how="inner", **kwarg) tm.assert_frame_equal(result, exp) @@ -465,12 +476,13 @@ def check2(exp, kwarg): result = merge(left, right, how="outer", **kwarg) tm.assert_frame_equal(result, exp) - for kwarg in [ - {"left_index": True, "right_index": True}, - {"left_index": True, "right_on": "x"}, - ]: - check1(exp_in, kwarg) - check2(exp_out, kwarg) + kwarg = {"left_index": True, "right_on": "x"} + check1(exp_in, kwarg) + check2(exp_nan, kwarg) + + kwarg = {"left_index": True, "right_index": True} + check1(exp_in, kwarg) + check2(exp_out, kwarg) kwarg = {"left_on": "a", "right_index": True} check1(exp_in, kwarg) @@ -762,6 +774,7 @@ def test_other_datetime_unit(self, unit): "days": days, }, columns=["entity_id", "days"], + index=[101, 102], ) assert exp["days"].dtype == exp_dtype tm.assert_frame_equal(result, exp) @@ -789,6 +802,7 @@ def test_other_timedelta_unit(self, unit): exp = DataFrame( {"entity_id": [101, 102], "days": np.array(["nat", "nat"], dtype=dtype)}, columns=["entity_id", "days"], + index=[101, 102], ) tm.assert_frame_equal(result, exp) @@ -1190,7 +1204,7 @@ def test_validation(self): "c": ["meow", "bark", "um... weasel noise?", "nay"], }, columns=["b", "a", "c"], - index=range(4), + index=Index(["a", "b", "c", "d"], name="a"), ) left_index_reset = left.set_index("a") @@ -1331,48 +1345,17 @@ def test_merge_two_empty_df_no_division_error(self): @pytest.mark.parametrize("how", ["right", "outer"]) @pytest.mark.parametrize( - "index,expected_index", + "index", [ - ( - CategoricalIndex([1, 2, 4]), - CategoricalIndex([1, 2, 4, None, None, None]), - ), - ( - DatetimeIndex( - ["2001-01-01", "2002-02-02", "2003-03-03"], dtype="M8[ns]" - ), - DatetimeIndex( - ["2001-01-01", "2002-02-02", "2003-03-03", pd.NaT, pd.NaT, pd.NaT], - dtype="M8[ns]", - ), - ), - *[ - ( - Index([1, 2, 3], dtype=dtyp), - Index([1, 2, 3, None, None, None], dtype=np.float64), - ) - for dtyp in tm.ALL_REAL_NUMPY_DTYPES - ], - ( - IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)]), - IntervalIndex.from_tuples( - [(1, 2), (2, 3), (3, 4), np.nan, np.nan, np.nan] - ), - ), - ( - PeriodIndex(["2001-01-01", "2001-01-02", "2001-01-03"], freq="D"), - PeriodIndex( - ["2001-01-01", "2001-01-02", "2001-01-03", pd.NaT, pd.NaT, pd.NaT], - freq="D", - ), - ), - ( - TimedeltaIndex(["1D", "2D", "3D"]), - TimedeltaIndex(["1D", "2D", "3D", pd.NaT, pd.NaT, pd.NaT]), - ), + CategoricalIndex([1, 2, 4]), + DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"], dtype="M8[ns]"), + *[Index([1, 2, 3], dtype=dtyp) for dtyp in tm.ALL_REAL_NUMPY_DTYPES], + IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)]), + PeriodIndex(["2001-01-01", "2001-01-02", "2001-01-03"], freq="D"), + TimedeltaIndex(["1D", "2D", "3D"]), ], ) - def test_merge_on_index_with_more_values(self, how, index, expected_index): + def test_merge_on_index_with_more_values(self, how, index): # GH 24212 # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that # -1 is interpreted as a missing value instead of the last element @@ -1390,20 +1373,17 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index): ], columns=["a", "key", "b"], ) - expected.set_index(expected_index, inplace=True) tm.assert_frame_equal(result, expected) def test_merge_right_index_right(self): - # Note: the expected output here is probably incorrect. - # See https://github.com/pandas-dev/pandas/issues/17257 for more. - # We include this as a regression test for GH-24897. + # Regression test for GH-24897. left = DataFrame({"a": [1, 2, 3], "key": [0, 1, 1]}) right = DataFrame({"b": [1, 2, 3]}) expected = DataFrame( {"a": [1, 2, 3, None], "key": [0, 1, 1, 2], "b": [1, 2, 2, 3]}, columns=["a", "key", "b"], - index=[0, 1, 2, np.nan], + index=[0, 1, 1, 2], ) result = left.merge(right, left_on="key", right_index=True, how="right") tm.assert_frame_equal(result, expected) @@ -1436,7 +1416,7 @@ def test_merge_take_missing_values_from_index_of_other_dtype(self): "key": Categorical(["a", "a", "b", "c"]), "b": [1, 1, 2, 3], }, - index=[0, 1, 2, np.nan], + index=Categorical(["a", "a", "b", "c"], categories=list("abc")), ) expected = expected.reindex(columns=["a", "key", "b"]) tm.assert_frame_equal(result, expected) @@ -2661,7 +2641,8 @@ def test_merge_right_left_index(): "z_x": ["foo", "foo"], "x_y": [1, 1], "z_y": ["foo", "foo"], - } + }, + index=[1, 1], ) tm.assert_frame_equal(result, expected) @@ -2670,7 +2651,7 @@ def test_merge_result_empty_index_and_on(): # GH#33814 df1 = DataFrame({"a": [1], "b": [2]}).set_index(["a", "b"]) df2 = DataFrame({"b": [1]}).set_index(["b"]) - expected = DataFrame({"a": [], "b": []}, dtype=np.int64).set_index(["a", "b"]) + expected = DataFrame({"b": []}, dtype=np.int64).set_index(["b"]) result = merge(df1, df2, left_on=["b"], right_index=True) tm.assert_frame_equal(result, expected) @@ -2850,7 +2831,9 @@ def test_merge_multiindex_single_level(): data={"b": [100]}, index=MultiIndex.from_tuples([("A",), ("C",)], names=["col"]), ) - expected = DataFrame({"col": ["A", "B"], "b": [100, np.nan]}) + expected = DataFrame( + {"col": ["A", "B"], "b": [100, np.nan]}, index=Index([("A",), np.nan]) + ) result = df.merge(df2, left_on=["col"], right_index=True, how="left") tm.assert_frame_equal(result, expected) @@ -2957,6 +2940,9 @@ def test_merge_ea_int_and_float_numpy(): tm.assert_frame_equal(result, expected.astype("float64")) +from pandas.core.dtypes.missing import na_value_for_dtype + + def test_merge_arrow_string_index(any_string_dtype): # GH#54894 pytest.importorskip("pyarrow") @@ -2964,7 +2950,10 @@ def test_merge_arrow_string_index(any_string_dtype): right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype=any_string_dtype)) result = left.merge(right, left_on="a", right_index=True, how="left") expected = DataFrame( - {"a": Series(["a", "b"], dtype=any_string_dtype), "b": [1, np.nan]} + {"a": Series(["a", "b"], dtype=any_string_dtype), "b": [1.0, np.nan]}, + ) + expected.index = Index(["a"], dtype=any_string_dtype).append( + Index([na_value_for_dtype(any_string_dtype)]) ) tm.assert_frame_equal(result, expected) @@ -3022,3 +3011,12 @@ def test_merge_on_all_nan_column(): {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6], "zz": [4, 5, 6]} ) tm.assert_frame_equal(result, expected) + + +def test_merge_index(): + # GH 57291 + dfa = DataFrame(range(10), columns=["a"]) + dfb = DataFrame({"b": range(5), "key": [5 + x for x in range(5)]}) + + result = dfa.merge(dfb, left_index=True, right_on="key", how="left") + tm.assert_index_equal(result.index, dfa.index)