Skip to content
Open
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
742a268
Activated test for metadata of merge operation.
aijams Sep 4, 2025
a12bdbd
Fixed error in test for merge result flags.
aijams Sep 4, 2025
d22b90a
Tested that merge collects metadata from only its left argument.
aijams Sep 4, 2025
41b3571
Added test to check whether merge_asof correctly sets the allow dupli…
aijams Sep 4, 2025
58b3c2a
Added bug fix description to documentation.
aijams Sep 5, 2025
11ebac8
Added test to check metadata handling for pandas.merge.
aijams Sep 9, 2025
24f4c8d
Added identifiers to test cases for pandas.merge.
aijams Sep 9, 2025
4381364
Modified tests for __finalize__ to respect documented merge behavior.
aijams Sep 11, 2025
bba9a13
Added type annotations to test method parameters.
aijams Sep 17, 2025
da1b0f4
Merge remote-tracking branch 'upstream/main' into aijams-dataframe-me…
aijams Sep 17, 2025
0d52fff
Fixed type issue for test_finalize. Added a little documentation.
aijams Sep 18, 2025
9c7b9ed
Merge remote-tracking branch 'upstream/main' into aijams-dataframe-me…
aijams Sep 18, 2025
9f68134
Ran ruff formatter to correct some issues.
aijams Sep 18, 2025
ff1aba5
Fixed some cosmetic issues with pre-commit hooks.
aijams Sep 19, 2025
8ece859
Merge remote-tracking branch 'upstream/main' into aijams-dataframe-me…
aijams Sep 19, 2025
6d216fe
Merge remote-tracking branch 'upstream/main' into aijams-dataframe-me…
aijams Oct 6, 2025
9b51d3e
Removed docstrings on tests.
aijams Oct 6, 2025
f2abf1f
Resolved several issues in test_finalize.
aijams Oct 7, 2025
eca7671
Merge remote-tracking branch 'upstream/main' into aijams-dataframe-me…
aijams Oct 7, 2025
dddc031
Fixed a couple nitpicks.
aijams Oct 7, 2025
7304a48
Merge remote-tracking branch 'upstream/main' into aijams-dataframe-me…
aijams Oct 8, 2025
15adcd7
Reformatted argument lists as required by ruff.
aijams Oct 8, 2025
1a8602d
Added note and removed potentially confusing docs from __fianlize__.
aijams Oct 8, 2025
c5f31ac
Merge remote-tracking branch 'upstream/main' into aijams-dataframe-me…
aijams Oct 9, 2025
d9b52f0
Removed trailing commas.
aijams Oct 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1094,6 +1094,7 @@ Reshaping
- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`)
- Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`)
- Bug in :func:`melt` where calling with duplicate column names in ``id_vars`` raised a misleading ``AttributeError`` (:issue:`61475`)
- Bug in :meth:`DataFrame.merge` where the result of a merge does not contain any metadata or flag information from the inputs to the merge. (:issue:`28283`)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As mentioned, remove the note

- Bug in :meth:`DataFrame.merge` where user-provided suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. (:issue:`61402`)
- Bug in :meth:`DataFrame.merge` with :class:`CategoricalDtype` columns incorrectly raising ``RecursionError`` (:issue:`56376`)
- Bug in :meth:`DataFrame.merge` with a ``float32`` index incorrectly casting the index to ``float64`` (:issue:`41626`)
Expand Down
8 changes: 7 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6096,10 +6096,16 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
"""
Propagate metadata from other to self.

This is the default implementation. Subclasses may override this method to
implement their own metadata handling.

Parameters
----------
other : the object from which to get the attributes that we are going
to propagate
to propagate. If ``other`` has an ``input_objs`` attribute, then
this attribute must contain an iterable of objects, each with an
``attrs`` attribute, in which case, each such ``attrs`` instance
must be a dictionary that is equal to all of the others.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By saying "must be a dictionary that is equal to all of the others", it sounds like that is a requirement for calling this function but that is not the case. Rather, it just won't propagate when this requirement is not met.

method : str, optional
A passed method name providing context on where ``__finalize__``
was called.
Expand Down
21 changes: 21 additions & 0 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1129,12 +1129,17 @@ def _reindex_and_concat(
return result

def get_result(self) -> DataFrame:
"""
Execute the merge.
"""
if self.indicator:
self.left, self.right = self._indicator_pre_merge(self.left, self.right)

join_index, left_indexer, right_indexer = self._get_join_info()

result = self._reindex_and_concat(join_index, left_indexer, right_indexer)

# Is this call to __finalize__ really necessary?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I believe this can be removed.

result = result.__finalize__(
types.SimpleNamespace(input_objs=[self.left, self.right]),
method=self._merge_type,
Expand All @@ -1147,6 +1152,8 @@ def get_result(self) -> DataFrame:

self._maybe_restore_index_levels(result)

# __finalize is responsible for copying the metadata from the inputs to merge
# to the result.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you remove this comment, it does not add any information that isn't already documented in __finalize__ itself.

return result.__finalize__(
types.SimpleNamespace(input_objs=[self.left, self.right]), method="merge"
)
Expand All @@ -1167,6 +1174,14 @@ def _indicator_name(self) -> str | None:
def _indicator_pre_merge(
self, left: DataFrame, right: DataFrame
) -> tuple[DataFrame, DataFrame]:
"""
Add one indicator column to each of the left and right inputs to a
merge operation.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: short summary should be 1 line. I think you can just remove to a merge operation


These columns are used to produce another column in the output of the
merge, indicating for each row of the output whether it was produced
using the left, right or both inputs.
"""
columns = left.columns.union(right.columns)

for i in ["_left_indicator", "_right_indicator"]:
Expand All @@ -1193,6 +1208,12 @@ def _indicator_pre_merge(

@final
def _indicator_post_merge(self, result: DataFrame) -> DataFrame:
"""
Add an indicator column to the merge result.

This column indicates for each row of the output whether it was produced using
the left, right or both inputs.
"""
result["_left_indicator"] = result["_left_indicator"].fillna(0)
result["_right_indicator"] = result["_right_indicator"].fillna(0)

Expand Down
177 changes: 161 additions & 16 deletions pandas/tests/generic/test_finalize.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
"""
An exhaustive list of pandas methods exercising NDFrame.__finalize__.
"""
"""An exhaustive list of pandas methods exercising NDFrame.__finalize__."""
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


import operator
import re

import numpy as np
import pytest

from pandas._typing import MergeHow

import pandas as pd

# TODO:
Expand Down Expand Up @@ -148,14 +148,6 @@
operator.methodcaller("melt", id_vars=["A"], value_vars=["B"]),
),
(pd.DataFrame, frame_data, operator.methodcaller("map", lambda x: x)),
pytest.param(
(
pd.DataFrame,
frame_data,
operator.methodcaller("merge", pd.DataFrame({"A": [1]})),
),
marks=not_implemented_mark,
),
(pd.DataFrame, frame_data, operator.methodcaller("round", 2)),
(pd.DataFrame, frame_data, operator.methodcaller("corr")),
pytest.param(
Expand Down Expand Up @@ -371,8 +363,7 @@ def idfn(x):
m = xpr.search(str(x))
if m:
return m.group(1)
else:
return str(x)
Comment on lines 374 to 375
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like just a style preference? Can you revert this change.

return str(x)


@pytest.mark.parametrize("ndframe_method", _all_methods, ids=lambda x: idfn(x[-1]))
Expand Down Expand Up @@ -586,7 +577,8 @@ def test_datetime_property(attr):


@pytest.mark.parametrize(
"attr", ["days", "seconds", "microseconds", "nanoseconds", "components"]
"attr",
["days", "seconds", "microseconds", "nanoseconds", "components"],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you revert

)
def test_timedelta_property(attr):
s = pd.Series(pd.timedelta_range("2000", periods=4))
Expand Down Expand Up @@ -630,7 +622,8 @@ def test_categorical_accessor(method):


@pytest.mark.parametrize(
"obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})]
"obj",
[pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you revert

)
@pytest.mark.parametrize(
"method",
Expand All @@ -649,7 +642,8 @@ def test_groupby_finalize(obj, method):


@pytest.mark.parametrize(
"obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})]
"obj",
[pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you revert

)
@pytest.mark.parametrize(
"method",
Expand All @@ -675,3 +669,154 @@ def test_finalize_frame_series_name():
df = pd.DataFrame({"name": [1, 2]})
result = pd.Series([1, 2]).__finalize__(df)
assert result.name is None


# ----------------------------------------------------------------------------
# Tests for merge
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should be consistent with the other sections.

Suggested change
# Tests for merge
# Reshaping



@pytest.mark.parametrize(
["allow_on_left", "allow_on_right"],
[(False, False), (False, True), (True, False), (True, True)],
)
@pytest.mark.parametrize(
"how",
[
"left",
"right",
"inner",
"outer",
"left_anti",
"right_anti",
"cross",
],
)
def test_merge_sets_duplication_allowance_flag(
how: MergeHow,
allow_on_left: bool,
allow_on_right: bool,
):
"""Check that DataFrame.merge correctly sets the allow_duplicate_labels flag
on its result.

The flag on the result should be set to true if and only if both arguments
to merge have their flags set to True.
"""
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you remove docstrings from tests, we generally do not add these. Comments are fine if they are adding something that is not already in the code itself.

# Arrange
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you remove these arrange/act/assert comments.

left = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_on_left)
right = pd.DataFrame({"test": [1]}).set_flags(
allows_duplicate_labels=allow_on_right,
)

# Act
if not how == "cross":
result = left.merge(right, how=how, on="test")
else:
result = left.merge(right, how=how)

# Assert
expected_duplication_allowance = allow_on_left and allow_on_right
assert result.flags.allows_duplicate_labels == expected_duplication_allowance


@pytest.mark.parametrize(
["allow_on_left", "allow_on_right"],
[(False, False), (False, True), (True, False), (True, True)],
)
def test_merge_asof_sets_duplication_allowance_flag(
allow_on_left: bool,
allow_on_right: bool,
):
"""Check that pandas.merge_asof correctly sets the allow_duplicate_labels flag
on its result.

The flag on the result should be set to true if and only if both arguments
to merge_asof have their flags set to True.
"""
# Arrange
left = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_on_left)
right = pd.DataFrame({"test": [1]}).set_flags(
allows_duplicate_labels=allow_on_right,
)

# Act
result = pd.merge_asof(left, right)

# Assert
expected_duplication_allowance = allow_on_left and allow_on_right
assert result.flags.allows_duplicate_labels == expected_duplication_allowance


def test_merge_propagates_metadata_from_equal_input_metadata():
"""Check that pandas.merge sets the metadata of its result to a deep copy of
the metadata from its left input, if the metadata from both inputs are equal.
"""
# Arrange
metadata = {"a": 2}
left = pd.DataFrame({"test": [1]})
left.attrs = metadata
right = pd.DataFrame({"test": [1]})
right.attrs = metadata.copy()

# Act
result = left.merge(right, how="inner", on="test")

# Assert
assert result.attrs == metadata
left.attrs = {"b": 3}
assert result.attrs == metadata
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this will check whether or not a deep copy is made. Recommend doing

assert result.attrs is not left.attrs
assert result.attrs["a"] is not left.attrs["a"]

instead. Also I recommend adding in right as well for good measure.



def test_merge_does_not_propagate_metadata_from_unequal_input_metadata():
"""Check that the metadata for the result of pandas.merge is empty if the
metadata for both inputs to pandas.merge are not equal.
"""
# Arrange
left = pd.DataFrame({"test": [1]})
left.attrs = {"a": 2}
right = pd.DataFrame({"test": [1]})
right.attrs = {"b": 3}

# Act
result = left.merge(right, how="inner", on="test")

# Assert
assert result.attrs == {}


no_metadata = pd.DataFrame({"test": [1]})

has_metadata = pd.DataFrame({"test": [1]})
has_metadata.attrs = {"a": 2}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of having module level variables, can you use indicator strings in the parametrization (e.g. "no_metadata", "has_metadata") and then do something like:

if left == "no_metadata":
     left = pd.DataFrame(...)
else:
    ...



@pytest.mark.parametrize(
["left", "right", "expected"],
[
(no_metadata, has_metadata, {}),
(has_metadata, no_metadata, {}),
(no_metadata, no_metadata, {}),
],
ids=["left-empty", "right-empty", "both-empty"],
)
def test_merge_does_not_propagate_metadata_if_one_input_has_no_metadata(
left: pd.DataFrame,
right: pd.DataFrame,
expected: dict,
):
"""Check that if the metadata for one input to pandas.merge is empty, the result
of merge has the same metadata as the other input.

(empty) (A) (A) (empty) (empty) (empty)
| | | | | |
--> merge <-- --> merge <-- --> merge <--
| | |
(empty) (empty) (empty)
"""
# Arrange

# Act
result = left.merge(right, how="inner", on="test")

# Assert
assert result.attrs == expected
8 changes: 7 additions & 1 deletion pandas/tests/generic/test_frame.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from copy import deepcopy
from operator import methodcaller
from typing import Literal

import numpy as np
import pytest
Expand Down Expand Up @@ -77,7 +78,12 @@ def test_metadata_propagation_indiv(self, monkeypatch):
# merging with override
# GH 6923

def finalize(self, other, method=None, **kwargs):
def finalize(
self: DataFrame,
other: DataFrame,
method: Literal["merge", "concat"] | None = None,
**kwargs,
):
for name in self._metadata:
if method == "merge":
left, right = other.input_objs
Expand Down
Loading