From 9c026dbf7bce0e4955f6e6cc7977d7133a33ee54 Mon Sep 17 00:00:00 2001 From: Rob <124158982+rob-sil@users.noreply.github.com> Date: Sun, 11 Aug 2024 19:40:53 -0700 Subject: [PATCH 1/3] Add choice of index source internally This commit introduces an index argument that tells merging how to choose the output index. I moved the logic determining which index to use into _MergeOperation.__init__ while moving the logic for calculating the index to the end of the _get_join_info method. Note: I did not adjust the logic in the case when merging on both indexes (or an index and a multiindex). This commit also introduces a function merge_pick_index so that internal uses of merge can pick where they get the index from. Specifically, join uses merge under the hood, but wants the old merge index sources. Now, join specifies the index source so it wont be affected by changing merge's index behavior. This commit should not affect behavior. --- pandas/core/frame.py | 10 ++- pandas/core/reshape/merge.py | 142 +++++++++++++++++++++++------------ 2 files changed, 100 insertions(+), 52 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b84fb33af26e5..41da5d041c38c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10700,7 +10700,7 @@ def join( 5 K1 A5 B1 """ from pandas.core.reshape.concat import concat - from pandas.core.reshape.merge import merge + from pandas.core.reshape.merge import merge_pick_index if isinstance(other, Series): if other.name is None: @@ -10709,7 +10709,7 @@ def join( if isinstance(other, DataFrame): if how == "cross": - return merge( + return merge_pick_index( self, other, how=how, @@ -10717,8 +10717,9 @@ def join( suffixes=(lsuffix, rsuffix), sort=sort, validate=validate, + index="left", ) - return merge( + return merge_pick_index( self, other, left_on=on, @@ -10728,6 +10729,7 @@ def join( suffixes=(lsuffix, rsuffix), sort=sort, validate=validate, + index=None if on is None else "left", ) else: if on is not None: @@ -10763,7 +10765,7 @@ def join( joined = frames[0] for frame in frames[1:]: - joined = merge( + joined = merge_pick_index( joined, frame, how=how, diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 07e8fa4841c04..8ddcb514b9574 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -347,6 +347,51 @@ def merge( 2 bar 7 3 bar 8 """ + return merge_pick_index( + left, + right, + how, + on, + left_on, + right_on, + left_index, + right_index, + sort, + suffixes, + copy, + indicator, + validate, + ) + + +def merge_pick_index( + left: DataFrame | Series, + right: DataFrame | Series, + how: MergeHow = "inner", + on: IndexLabel | AnyArrayLike | None = None, + left_on: IndexLabel | AnyArrayLike | None = None, + right_on: IndexLabel | AnyArrayLike | None = None, + left_index: bool = False, + right_index: bool = False, + sort: bool = False, + suffixes: Suffixes = ("_x", "_y"), + copy: bool | lib.NoDefault = lib.no_default, + indicator: str | bool = False, + validate: str | None = None, + index: Literal["left", "right", "reset"] | None = None, +) -> DataFrame: + """A helper function for merge that returns a specified index. + + If index is "left" or "right" then the returned DataFrame will + use the index from the left or right DataFrames respectively. + + If index is "reset" then the DataFrame will have the default + index: zero for the first row, one for the second, etc. + + If index is None then the value will be inferred based on the + merge. If merging on both indexes then None is the only accepted + value. + """ left_df = _validate_operand(left) left._check_copy_deprecation(copy) right_df = _validate_operand(right) @@ -378,6 +423,7 @@ def merge( suffixes=suffixes, indicator=indicator, validate=validate, + index=index, ) return op.get_result() @@ -932,6 +978,7 @@ class _MergeOperation: join_names: list[Hashable] right_join_keys: list[ArrayLike] left_join_keys: list[ArrayLike] + index: Literal["left", "right", "reset"] | None def __init__( self, @@ -947,6 +994,7 @@ def __init__( suffixes: Suffixes = ("_x", "_y"), indicator: str | bool = False, validate: str | None = None, + index: Literal["left", "right", "reset"] | None = None, ) -> None: _left = _validate_operand(left) _right = _validate_operand(right) @@ -964,6 +1012,29 @@ def __init__( self.indicator = indicator + # Identify which index will be used for the output + if self.left_index and self.right_index and self.how != "asof": + if index is not None: + raise ValueError( + f'Index "{index}" is not supported for merges on both indexes.' + ) + elif self.right_index: + if len(self.left) > 0: + index = "left" + else: + index = "right" + elif self.left_index: + if self.how == "asof": + index = "left" + elif len(self.right) > 0: + index = "right" + else: + index = "left" + else: + index = "reset" + + self.index = index + if not is_bool(left_index): raise ValueError( f"left_index parameter must be of type bool, not {type(left_index)}" @@ -1341,53 +1412,32 @@ def _get_join_info( ) elif self.right_index and self.how == "left": - join_index, left_indexer, right_indexer = _left_join_on_index( + left_indexer, right_indexer = _left_join_on_index( left_ax, right_ax, self.left_join_keys, sort=self.sort ) elif self.left_index and self.how == "right": - join_index, right_indexer, left_indexer = _left_join_on_index( + right_indexer, left_indexer = _left_join_on_index( right_ax, left_ax, self.right_join_keys, sort=self.sort ) else: - (left_indexer, right_indexer) = self._get_join_indexers() + left_indexer, right_indexer = self._get_join_indexers() - if self.right_index: - if len(self.left) > 0: - join_index = self._create_join_index( - left_ax, - right_ax, - left_indexer, - how="right", - ) - elif right_indexer is None: - join_index = right_ax.copy() - else: - join_index = right_ax.take(right_indexer) - elif self.left_index: - if self.how == "asof": - # GH#33463 asof should always behave like a left merge - join_index = self._create_join_index( - left_ax, - right_ax, - left_indexer, - how="left", - ) - - elif len(self.right) > 0: - join_index = self._create_join_index( - right_ax, - left_ax, - right_indexer, - how="left", - ) - elif left_indexer is None: - join_index = left_ax.copy() - else: - join_index = left_ax.take(left_indexer) - else: - n = len(left_ax) if left_indexer is None else len(left_indexer) - join_index = default_index(n) + if self.index == "left": + join_index = self._create_join_index( + left_ax, + right_ax, + left_indexer, + ) + elif self.index == "right": + join_index = self._create_join_index( + right_ax, + left_ax, + right_indexer, + ) + elif self.index == "reset": + n = len(left_ax) if left_indexer is None else len(left_indexer) + join_index = default_index(n) return join_index, left_indexer, right_indexer @@ -1397,7 +1447,6 @@ def _create_join_index( index: Index, other_index: Index, indexer: npt.NDArray[np.intp] | None, - how: JoinHow = "left", ) -> Index: """ Create a join index by rearranging one index to match another @@ -1407,17 +1456,15 @@ def _create_join_index( index : Index index being rearranged other_index : Index - used to supply values not found in index + do not fill with nulls if the other_index is a MultiIndex indexer : np.ndarray[np.intp] or None how to rearrange index - how : str - Replacement is only necessary if indexer based on other_index. Returns ------- Index """ - if self.how in (how, "outer") and not isinstance(other_index, MultiIndex): + if not isinstance(other_index, MultiIndex): # if final index requires values in other_index but not target # index, indexer may hold missing (-1) values, causing Index.take # to take the final value in target index. So, we set the last @@ -2574,7 +2621,7 @@ def _get_no_sort_one_missing_indexer( def _left_join_on_index( left_ax: Index, right_ax: Index, join_keys: list[ArrayLike], sort: bool = False -) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]: +) -> tuple[npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]: if isinstance(right_ax, MultiIndex): lkey, rkey = _get_multiindex_indexer(join_keys, right_ax, sort=sort) else: @@ -2593,11 +2640,10 @@ def _left_join_on_index( if sort or len(left_ax) != len(left_indexer): # if asked to sort or there are 1-to-many matches - join_index = left_ax.take(left_indexer) - return join_index, left_indexer, right_indexer + return left_indexer, right_indexer # left frame preserves order & length of its index - return left_ax, None, right_indexer + return None, right_indexer def _factorize_keys( From 0af06a2d469bab04377d1775ac412cb7d4004ae4 Mon Sep 17 00:00:00 2001 From: Rob <124158982+rob-sil@users.noreply.github.com> Date: Wed, 28 Aug 2024 13:33:07 -0500 Subject: [PATCH 2/3] Switch merge over to use the proper index This commit changes the index source to fix the bug. It also corrects many tests which tested for the old indexes. --- pandas/core/reshape/merge.py | 15 +-- pandas/tests/reshape/merge/test_join.py | 19 +--- pandas/tests/reshape/merge/test_merge.py | 120 +++++++++++------------ 3 files changed, 65 insertions(+), 89 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8ddcb514b9574..563f61e4d5662 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1018,18 +1018,12 @@ def __init__( raise ValueError( f'Index "{index}" is not supported for merges on both indexes.' ) + elif index is not None: + pass elif self.right_index: - if len(self.left) > 0: - index = "left" - else: - index = "right" + index = "right" elif self.left_index: - if self.how == "asof": - index = "left" - elif len(self.right) > 0: - index = "right" - else: - index = "left" + index = "left" else: index = "reset" @@ -2172,6 +2166,7 @@ def __init__( how=how, suffixes=suffixes, sort=True, # factorize sorts + index="left" if left_index or right_index else "reset", ) def get_result(self) -> DataFrame: diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 0f743332acbbe..1afeac051d66c 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -436,24 +436,7 @@ def test_join_inner_multiindex(self, lexsorted_two_level_string_multiindex): sort=False, ) - expected2 = merge( - to_join, - data, - right_on=["key1", "key2"], - left_index=True, - how="inner", - sort=False, - ) - tm.assert_frame_equal(joined, expected2.reindex_like(joined)) - - expected2 = merge( - to_join, - data, - right_on=["key1", "key2"], - left_index=True, - how="inner", - sort=False, - ) + tm.assert_frame_equal(joined.reset_index(drop=True), expected[joined.columns]) expected = expected.drop(["first", "second"], axis=1) expected.index = joined.index diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d4766242b8460..d8cad84c52d5b 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -162,8 +162,10 @@ def test_merge_index_singlekey_inner(self): { "key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.default_rng(2).standard_normal(7), + "left_index": range(7), } ) + right = DataFrame( {"v2": np.random.default_rng(2).standard_normal(4)}, index=["d", "b", "c", "a"], @@ -171,12 +173,17 @@ def test_merge_index_singlekey_inner(self): # inner join result = merge(left, right, left_on="key", right_index=True, how="inner") - expected = left.join(right, on="key").loc[result.index] - tm.assert_frame_equal(result, expected) + expected = left.join(right, on="key").loc[result["left_index"]] + tm.assert_frame_equal( + result.reset_index(drop=True), expected.reset_index(drop=True) + ) result = merge(right, left, right_on="key", left_index=True, how="inner") - expected = left.join(right, on="key").loc[result.index] - tm.assert_frame_equal(result, expected.loc[:, result.columns]) + expected = left.join(right, on="key").loc[result["left_index"]] + tm.assert_frame_equal( + result.reset_index(drop=True), + expected.loc[:, result.columns].reset_index(drop=True), + ) def test_merge_misspecified(self, df, df2, left): right = DataFrame( @@ -349,8 +356,9 @@ def test_handle_join_key_pass_array(self): right = DataFrame({"rvalue": np.arange(6)}) key = np.array([0, 1, 1, 2, 2, 3], dtype=np.int64) + index = np.array([0, 1, 1, 2, 2, np.nan], dtype=np.float64) merged = merge(left, right, left_index=True, right_on=key, how="outer") - tm.assert_series_equal(merged["key_0"], Series(key, name="key_0")) + tm.assert_series_equal(merged["key_0"], Series(key, index=index, name="key_0")) def test_no_overlap_more_informative_error(self): dt = datetime.now() @@ -453,6 +461,9 @@ def test_merge_left_empty_right_notempty(self): ) exp_in = exp_out[0:0] # make empty DataFrame keeping dtype + exp_nan = exp_out.copy() + exp_nan.index = [np.nan] * 3 + def check1(exp, kwarg): result = merge(left, right, how="inner", **kwarg) tm.assert_frame_equal(result, exp) @@ -465,12 +476,13 @@ def check2(exp, kwarg): result = merge(left, right, how="outer", **kwarg) tm.assert_frame_equal(result, exp) - for kwarg in [ - {"left_index": True, "right_index": True}, - {"left_index": True, "right_on": "x"}, - ]: - check1(exp_in, kwarg) - check2(exp_out, kwarg) + kwarg = {"left_index": True, "right_on": "x"} + check1(exp_in, kwarg) + check2(exp_nan, kwarg) + + kwarg = {"left_index": True, "right_index": True} + check1(exp_in, kwarg) + check2(exp_out, kwarg) kwarg = {"left_on": "a", "right_index": True} check1(exp_in, kwarg) @@ -762,6 +774,7 @@ def test_other_datetime_unit(self, unit): "days": days, }, columns=["entity_id", "days"], + index=[101, 102], ) assert exp["days"].dtype == exp_dtype tm.assert_frame_equal(result, exp) @@ -789,6 +802,7 @@ def test_other_timedelta_unit(self, unit): exp = DataFrame( {"entity_id": [101, 102], "days": np.array(["nat", "nat"], dtype=dtype)}, columns=["entity_id", "days"], + index=[101, 102], ) tm.assert_frame_equal(result, exp) @@ -1190,7 +1204,7 @@ def test_validation(self): "c": ["meow", "bark", "um... weasel noise?", "nay"], }, columns=["b", "a", "c"], - index=range(4), + index=Index(["a", "b", "c", "d"], name="a"), ) left_index_reset = left.set_index("a") @@ -1331,48 +1345,17 @@ def test_merge_two_empty_df_no_division_error(self): @pytest.mark.parametrize("how", ["right", "outer"]) @pytest.mark.parametrize( - "index,expected_index", + "index", [ - ( - CategoricalIndex([1, 2, 4]), - CategoricalIndex([1, 2, 4, None, None, None]), - ), - ( - DatetimeIndex( - ["2001-01-01", "2002-02-02", "2003-03-03"], dtype="M8[ns]" - ), - DatetimeIndex( - ["2001-01-01", "2002-02-02", "2003-03-03", pd.NaT, pd.NaT, pd.NaT], - dtype="M8[ns]", - ), - ), - *[ - ( - Index([1, 2, 3], dtype=dtyp), - Index([1, 2, 3, None, None, None], dtype=np.float64), - ) - for dtyp in tm.ALL_REAL_NUMPY_DTYPES - ], - ( - IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)]), - IntervalIndex.from_tuples( - [(1, 2), (2, 3), (3, 4), np.nan, np.nan, np.nan] - ), - ), - ( - PeriodIndex(["2001-01-01", "2001-01-02", "2001-01-03"], freq="D"), - PeriodIndex( - ["2001-01-01", "2001-01-02", "2001-01-03", pd.NaT, pd.NaT, pd.NaT], - freq="D", - ), - ), - ( - TimedeltaIndex(["1D", "2D", "3D"]), - TimedeltaIndex(["1D", "2D", "3D", pd.NaT, pd.NaT, pd.NaT]), - ), + CategoricalIndex([1, 2, 4]), + DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"], dtype="M8[ns]"), + *[Index([1, 2, 3], dtype=dtyp) for dtyp in tm.ALL_REAL_NUMPY_DTYPES], + IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)]), + PeriodIndex(["2001-01-01", "2001-01-02", "2001-01-03"], freq="D"), + TimedeltaIndex(["1D", "2D", "3D"]), ], ) - def test_merge_on_index_with_more_values(self, how, index, expected_index): + def test_merge_on_index_with_more_values(self, how, index): # GH 24212 # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that # -1 is interpreted as a missing value instead of the last element @@ -1390,20 +1373,17 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index): ], columns=["a", "key", "b"], ) - expected.set_index(expected_index, inplace=True) tm.assert_frame_equal(result, expected) def test_merge_right_index_right(self): - # Note: the expected output here is probably incorrect. - # See https://github.com/pandas-dev/pandas/issues/17257 for more. - # We include this as a regression test for GH-24897. + # Regression test for GH-24897. left = DataFrame({"a": [1, 2, 3], "key": [0, 1, 1]}) right = DataFrame({"b": [1, 2, 3]}) expected = DataFrame( {"a": [1, 2, 3, None], "key": [0, 1, 1, 2], "b": [1, 2, 2, 3]}, columns=["a", "key", "b"], - index=[0, 1, 2, np.nan], + index=[0, 1, 1, 2], ) result = left.merge(right, left_on="key", right_index=True, how="right") tm.assert_frame_equal(result, expected) @@ -1436,7 +1416,7 @@ def test_merge_take_missing_values_from_index_of_other_dtype(self): "key": Categorical(["a", "a", "b", "c"]), "b": [1, 1, 2, 3], }, - index=[0, 1, 2, np.nan], + index=Categorical(["a", "a", "b", "c"], categories=list("abc")), ) expected = expected.reindex(columns=["a", "key", "b"]) tm.assert_frame_equal(result, expected) @@ -2661,7 +2641,8 @@ def test_merge_right_left_index(): "z_x": ["foo", "foo"], "x_y": [1, 1], "z_y": ["foo", "foo"], - } + }, + index=[1, 1], ) tm.assert_frame_equal(result, expected) @@ -2670,7 +2651,7 @@ def test_merge_result_empty_index_and_on(): # GH#33814 df1 = DataFrame({"a": [1], "b": [2]}).set_index(["a", "b"]) df2 = DataFrame({"b": [1]}).set_index(["b"]) - expected = DataFrame({"a": [], "b": []}, dtype=np.int64).set_index(["a", "b"]) + expected = DataFrame({"b": []}, dtype=np.int64).set_index(["b"]) result = merge(df1, df2, left_on=["b"], right_index=True) tm.assert_frame_equal(result, expected) @@ -2850,7 +2831,9 @@ def test_merge_multiindex_single_level(): data={"b": [100]}, index=MultiIndex.from_tuples([("A",), ("C",)], names=["col"]), ) - expected = DataFrame({"col": ["A", "B"], "b": [100, np.nan]}) + expected = DataFrame( + {"col": ["A", "B"], "b": [100, np.nan]}, index=Index([("A",), np.nan]) + ) result = df.merge(df2, left_on=["col"], right_index=True, how="left") tm.assert_frame_equal(result, expected) @@ -2957,6 +2940,9 @@ def test_merge_ea_int_and_float_numpy(): tm.assert_frame_equal(result, expected.astype("float64")) +from pandas.core.dtypes.missing import na_value_for_dtype + + def test_merge_arrow_string_index(any_string_dtype): # GH#54894 pytest.importorskip("pyarrow") @@ -2964,7 +2950,10 @@ def test_merge_arrow_string_index(any_string_dtype): right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype=any_string_dtype)) result = left.merge(right, left_on="a", right_index=True, how="left") expected = DataFrame( - {"a": Series(["a", "b"], dtype=any_string_dtype), "b": [1, np.nan]} + {"a": Series(["a", "b"], dtype=any_string_dtype), "b": [1.0, np.nan]}, + ) + expected.index = Index(["a"], dtype=any_string_dtype).append( + Index([na_value_for_dtype(any_string_dtype)]) ) tm.assert_frame_equal(result, expected) @@ -3022,3 +3011,12 @@ def test_merge_on_all_nan_column(): {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6], "zz": [4, 5, 6]} ) tm.assert_frame_equal(result, expected) + + +def test_merge_index(): + # GH 57291 + dfa = DataFrame(range(10), columns=["a"]) + dfb = DataFrame({"b": range(5), "key": [5 + x for x in range(5)]}) + + result = dfa.merge(dfb, left_index=True, right_on="key", how="left") + tm.assert_index_equal(result.index, dfa.index) From 8e9928214694051ecc425236542f7b6c2344d75b Mon Sep 17 00:00:00 2001 From: Rob <124158982+rob-sil@users.noreply.github.com> Date: Wed, 28 Aug 2024 14:30:56 -0500 Subject: [PATCH 3/3] Update whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index da0d85b7bb529..1d6f401b26e4e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -664,6 +664,7 @@ Reshaping - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) - Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`) - Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`) +- Bug in :meth:`DataFrame.merge` where merging on an index could return a dataframe with the wrong index. (:issue:`57291`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`)