diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 0154087b18399..50fc7c739d36f 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -6102,10 +6102,15 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
         """
         Propagate metadata from other to self.
 
+        This is the default implementation. Subclasses may override this method to
+        implement their own metadata handling.
+
         Parameters
         ----------
         other : the object from which to get the attributes that we are going
-            to propagate
+            to propagate. If ``other`` has an ``input_objs`` attribute, then
+            this attribute must contain an iterable of objects, each with an
+            ``attrs`` attribute.
         method : str, optional
             A passed method name providing context on where ``__finalize__``
             was called.
@@ -6114,6 +6119,12 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
 
                The value passed as `method` are not currently considered
                stable across pandas releases.
+
+        Notes
+        -----
+        In case ``other`` has an ``input_objs`` attribute, this method only
+        propagates its metadata if each object in ``input_objs`` has the exact
+        same metadata as the others.
         """
         if isinstance(other, NDFrame):
             if other.attrs:
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index f40a4b5d60ecd..604181214ad44 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -1129,16 +1129,15 @@ def _reindex_and_concat(
         return result
 
     def get_result(self) -> DataFrame:
+        """
+        Execute the merge.
+        """
         if self.indicator:
             self.left, self.right = self._indicator_pre_merge(self.left, self.right)
 
         join_index, left_indexer, right_indexer = self._get_join_info()
 
         result = self._reindex_and_concat(join_index, left_indexer, right_indexer)
-        result = result.__finalize__(
-            types.SimpleNamespace(input_objs=[self.left, self.right]),
-            method=self._merge_type,
-        )
 
         if self.indicator:
             result = self._indicator_post_merge(result)
@@ -1167,6 +1166,13 @@ def _indicator_name(self) -> str | None:
     def _indicator_pre_merge(
         self, left: DataFrame, right: DataFrame
     ) -> tuple[DataFrame, DataFrame]:
+        """
+        Add one indicator column to each of the left and right inputs.
+
+        These columns are used to produce another column in the output of the
+        merge, indicating for each row of the output whether it was produced
+        using the left, right or both inputs.
+        """
         columns = left.columns.union(right.columns)
 
         for i in ["_left_indicator", "_right_indicator"]:
@@ -1193,6 +1199,12 @@ def _indicator_pre_merge(
 
     @final
     def _indicator_post_merge(self, result: DataFrame) -> DataFrame:
+        """
+        Add an indicator column to the merge result.
+
+        This column indicates for each row of the output whether it was produced using
+        the left, right or both inputs.
+        """
         result["_left_indicator"] = result["_left_indicator"].fillna(0)
         result["_right_indicator"] = result["_right_indicator"].fillna(0)
 
diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py
index 4b841b54c488b..641d9518adb9a 100644
--- a/pandas/tests/generic/test_finalize.py
+++ b/pandas/tests/generic/test_finalize.py
@@ -2,12 +2,15 @@
 An exhaustive list of pandas methods exercising NDFrame.__finalize__.
 """
 
+from copy import deepcopy
 import operator
 import re
 
 import numpy as np
 import pytest
 
+from pandas._typing import MergeHow
+
 import pandas as pd
 
 # TODO:
@@ -148,14 +151,6 @@
         operator.methodcaller("melt", id_vars=["A"], value_vars=["B"]),
     ),
     (pd.DataFrame, frame_data, operator.methodcaller("map", lambda x: x)),
-    pytest.param(
-        (
-            pd.DataFrame,
-            frame_data,
-            operator.methodcaller("merge", pd.DataFrame({"A": [1]})),
-        ),
-        marks=not_implemented_mark,
-    ),
     (pd.DataFrame, frame_data, operator.methodcaller("round", 2)),
     (pd.DataFrame, frame_data, operator.methodcaller("corr")),
     pytest.param(
@@ -675,3 +670,122 @@ def test_finalize_frame_series_name():
     df = pd.DataFrame({"name": [1, 2]})
     result = pd.Series([1, 2]).__finalize__(df)
     assert result.name is None
+
+
+# ----------------------------------------------------------------------------
+# Merge
+
+
+@pytest.mark.parametrize(
+    ["allow_on_left", "allow_on_right"],
+    [(False, False), (False, True), (True, False), (True, True)],
+)
+@pytest.mark.parametrize(
+    "how",
+    [
+        "left",
+        "right",
+        "inner",
+        "outer",
+        "left_anti",
+        "right_anti",
+        "cross",
+    ],
+)
+def test_merge_correctly_sets_duplication_allowance_flag(
+    how: MergeHow,
+    allow_on_left: bool,
+    allow_on_right: bool,
+):
+    left = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_on_left)
+    right = pd.DataFrame({"test": [1]}).set_flags(
+        allows_duplicate_labels=allow_on_right,
+    )
+
+    if how != "cross":
+        result = left.merge(right, how=how, on="test")
+    else:
+        result = left.merge(right, how=how)
+
+    expected_duplication_allowance = allow_on_left and allow_on_right
+    assert result.flags.allows_duplicate_labels == expected_duplication_allowance
+
+
+@pytest.mark.parametrize(
+    ["allow_on_left", "allow_on_right"],
+    [(False, False), (False, True), (True, False), (True, True)],
+)
+def test_merge_asof_correctly_sets_duplication_allowance_flag(
+    allow_on_left: bool,
+    allow_on_right: bool,
+):
+    left = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_on_left)
+    right = pd.DataFrame({"test": [1]}).set_flags(
+        allows_duplicate_labels=allow_on_right,
+    )
+
+    result = pd.merge_asof(left, right)
+
+    expected_duplication_allowance = allow_on_left and allow_on_right
+    assert result.flags.allows_duplicate_labels == expected_duplication_allowance
+
+
+def test_merge_propagates_metadata_from_equal_input_metadata():
+    metadata = {"a": [1, 2]}
+    left = pd.DataFrame({"test": [1]})
+    left.attrs = metadata
+    right = pd.DataFrame({"test": [1]})
+    right.attrs = deepcopy(metadata)
+
+    result = left.merge(right, how="inner", on="test")
+
+    assert result.attrs == metadata
+
+    # Verify that merge deep-copies the attr dictionary.
+    assert result.attrs is not left.attrs
+    assert result.attrs is not right.attrs
+    assert result.attrs["a"] is not left.attrs["a"]
+    assert result.attrs["a"] is not right.attrs["a"]
+
+
+def test_merge_does_not_propagate_metadata_from_unequal_input_metadata():
+    left = pd.DataFrame({"test": [1]})
+    left.attrs = {"a": 2}
+    right = pd.DataFrame({"test": [1]})
+    right.attrs = {"b": 3}
+
+    result = left.merge(right, how="inner", on="test")
+
+    assert result.attrs == {}
+
+
+@pytest.mark.parametrize(
+    ["left_has_metadata", "right_has_metadata", "expected"],
+    [
+        (False, True, {}),
+        (True, False, {}),
+        (False, False, {}),
+    ],
+    ids=["left-empty", "right-empty", "both-empty"],
+)
+def test_merge_does_not_propagate_metadata_if_one_input_has_no_metadata(
+    left_has_metadata: bool,
+    right_has_metadata: bool,
+    expected: dict,
+):
+    left = pd.DataFrame({"test": [1]})
+    right = pd.DataFrame({"test": [1]})
+
+    if left_has_metadata:
+        left.attrs = {"a": [1, 2]}
+    else:
+        left.attrs = {}
+
+    if right_has_metadata:
+        right.attrs = {"a": [1, 2]}
+    else:
+        right.attrs = {}
+
+    result = left.merge(right, how="inner", on="test")
+
+    assert result.attrs == expected
diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py
index e927c17eceb76..c2d24cceeab0c 100644
--- a/pandas/tests/generic/test_frame.py
+++ b/pandas/tests/generic/test_frame.py
@@ -1,5 +1,6 @@
 from copy import deepcopy
 from operator import methodcaller
+from typing import Literal
 
 import numpy as np
 import pytest
@@ -77,7 +78,12 @@ def test_metadata_propagation_indiv(self, monkeypatch):
 
         # merging with override
         # GH 6923
-        def finalize(self, other, method=None, **kwargs):
+        def finalize(
+            self: DataFrame,
+            other: DataFrame,
+            method: Literal["merge", "concat"] | None = None,
+            **kwargs,
+        ):
             for name in self._metadata:
                 if method == "merge":
                     left, right = other.input_objs