Propagate metadata through merge() and merge_asof() via __finalize__

The result of ``merge`` and ``merge_asof`` is finalized against the left
operand so that its metadata (attrs, flags) carries over to the result.
This happens once, in ``pandas/core/reshape/merge.py``. ``DataFrame.merge``
delegates to that function, so it is left untouched; finalizing there as
well would only repeat the same work on the same result.

Review notes:
* The hunks for ``merge`` assume the non-cross path lives in the ``else:``
  branch of ``if how == "cross":`` so that both branches fall through to a
  single finalize call. Confirm against the target file before applying.
* Indentation and blank context lines were restored from a whitespace-
  collapsed copy of this patch. Verify with ``git apply --check``.
* ``index`` header lines are omitted; the blob hashes must be regenerated.

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -370,7 +370,7 @@ def merge(
     left._check_copy_deprecation(copy)
     right_df = _validate_operand(right)
     if how == "cross":
-        return _cross_merge(
+        result = _cross_merge(
             left_df,
             right_df,
             on=on,
@@ -398,7 +398,11 @@ def merge(
             indicator=indicator,
             validate=validate,
         )
-        return op.get_result()
+        result = op.get_result()
+
+    # Single exit point: the cross and non-cross paths both propagate
+    # metadata from the left operand in exactly the same way.
+    return result.__finalize__(left, method="merge")
 
 
 def _cross_merge(
@@ -927,7 +931,8 @@ def merge_asof(
         allow_exact_matches=allow_exact_matches,
         direction=direction,
     )
-    return op.get_result()
+    result = op.get_result()
+    return result.__finalize__(left, method="merge_asof")
 
 
 # TODO: transformations??
@@ -1143,7 +1148,9 @@ def get_result(self) -> DataFrame:
 
         self._maybe_restore_index_levels(result)
 
-        return result.__finalize__(self, method="merge")
+        # Metadata propagation is left to the public entry points (``merge``,
+        # ``merge_asof``), which finalize against the left operand.
+        return result
 
     @final
     @cache_readonly
diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py
--- a/pandas/tests/generic/test_finalize.py
+++ b/pandas/tests/generic/test_finalize.py
@@ -13,7 +13,7 @@
 # TODO:
 # * Binary methods (mul, div, etc.)
 # * Binary outputs (align, etc.)
-# * top-level methods (concat, merge, get_dummies, etc.)
+# * top-level methods (concat, get_dummies, etc.)
 # * window
 # * cumulative reductions
 
@@ -154,7 +154,6 @@
             frame_data,
             operator.methodcaller("merge", pd.DataFrame({"A": [1]})),
         ),
-        marks=not_implemented_mark,
     ),
     (pd.DataFrame, frame_data, operator.methodcaller("round", 2)),
     (pd.DataFrame, frame_data, operator.methodcaller("corr")),
@@ -675,3 +674,79 @@ def test_finalize_frame_series_name():
     df = pd.DataFrame({"name": [1, 2]})
     result = pd.Series([1, 2]).__finalize__(df)
     assert result.name is None
+
+
+def test_merge_finalize():
+    """Test that DataFrame.merge calls __finalize__."""
+    df1 = pd.DataFrame({"key": [1, 2, 3], "A": [1, 2, 3]})
+    df2 = pd.DataFrame({"key": [1, 2, 4], "B": [4, 5, 6]})
+
+    df1.attrs["source"] = "left"
+    df1.attrs["version"] = "1.0"
+
+    for how in ["inner", "outer", "left", "right"]:
+        result = df1.merge(df2, on="key", how=how)
+
+        # attrs come from the left operand regardless of the join type
+        assert result.attrs["source"] == "left"
+        assert result.attrs["version"] == "1.0"
+
+
+def test_merge_cross_finalize():
+    """Test that a cross merge, which has its own branch, calls __finalize__."""
+    df1 = pd.DataFrame({"A": [1, 2]})
+    df2 = pd.DataFrame({"B": [3, 4]})
+
+    df1.attrs["source"] = "left"
+
+    result = pd.merge(df1, df2, how="cross")
+
+    assert result.attrs["source"] == "left"
+
+
+def test_merge_asof_finalize():
+    """Test that merge_asof calls __finalize__."""
+    df1 = pd.DataFrame({"time": [1, 2, 3], "A": [1, 2, 3]})
+    df2 = pd.DataFrame({"time": [1, 2, 4], "B": [4, 5, 6]})
+
+    df1.attrs["source"] = "quotes"
+
+    result = pd.merge_asof(df1, df2, on="time")
+
+    assert result.attrs["source"] == "quotes"
+
+
+def test_merge_index_finalize():
+    """Test that index-based merge calls __finalize__."""
+    df1 = pd.DataFrame({"A": [1, 2]}, index=[1, 2])
+    df2 = pd.DataFrame({"B": [3, 4]}, index=[1, 2])
+
+    df1.attrs["index_merge"] = True
+
+    result = df1.merge(df2, left_index=True, right_index=True)
+
+    assert result.attrs["index_merge"] is True
+
+
+def test_merge_suffixes_finalize():
+    """Test merge with suffixes calls __finalize__."""
+    df1 = pd.DataFrame({"key": [1, 2], "value": [1, 2]})
+    df2 = pd.DataFrame({"key": [1, 2], "value": [3, 4]})
+
+    df1.attrs["has_suffixes"] = True
+
+    result = df1.merge(df2, on="key", suffixes=("_left", "_right"))
+
+    assert result.attrs["has_suffixes"] is True
+
+
+def test_merge_series_finalize():
+    """Test that merging with a Series calls __finalize__."""
+    df = pd.DataFrame({"key": [1, 2, 3], "A": [1, 2, 3]})
+    s = pd.Series([4, 5, 6], index=[1, 2, 3], name="B")
+
+    df.attrs["merged_with_series"] = True
+
+    result = df.merge(s, left_on="key", right_index=True)
+
+    assert result.attrs["merged_with_series"] is True