-
-
Notifications
You must be signed in to change notification settings - Fork 19.1k
BUG: Modifies pandas.merge to propagate flags and metadata from its inputs to its output. #62266
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 15 commits
742a268
a12bdbd
d22b90a
41b3571
58b3c2a
11ebac8
24f4c8d
4381364
bba9a13
da1b0f4
0d52fff
9c7b9ed
9f68134
ff1aba5
8ece859
6d216fe
9b51d3e
f2abf1f
eca7671
dddc031
7304a48
15adcd7
1a8602d
c5f31ac
d9b52f0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6096,10 +6096,16 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: | |
""" | ||
Propagate metadata from other to self. | ||
|
||
This is the default implementation. Subclasses may override this method to | ||
implement their own metadata handling. | ||
|
||
Parameters | ||
---------- | ||
other : the object from which to get the attributes that we are going | ||
to propagate | ||
to propagate. If ``other`` has an ``input_objs`` attribute, then | ||
this attribute must contain an iterable of objects, each with an | ||
``attrs`` attribute, in which case, each such ``attrs`` instance | ||
must be a dictionary that is equal to all of the others. | ||
|
||
method : str, optional | ||
A passed method name providing context on where ``__finalize__`` | ||
was called. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1129,12 +1129,17 @@ def _reindex_and_concat( | |
return result | ||
|
||
def get_result(self) -> DataFrame: | ||
""" | ||
Execute the merge. | ||
""" | ||
if self.indicator: | ||
self.left, self.right = self._indicator_pre_merge(self.left, self.right) | ||
|
||
join_index, left_indexer, right_indexer = self._get_join_info() | ||
|
||
result = self._reindex_and_concat(join_index, left_indexer, right_indexer) | ||
|
||
# Is this call to __finalize__ really necessary? | ||
|
||
result = result.__finalize__( | ||
types.SimpleNamespace(input_objs=[self.left, self.right]), | ||
method=self._merge_type, | ||
|
@@ -1147,6 +1152,8 @@ def get_result(self) -> DataFrame: | |
|
||
self._maybe_restore_index_levels(result) | ||
|
||
# __finalize__ is responsible for copying the metadata from the inputs to merge | ||
# to the result. | ||
|
||
return result.__finalize__( | ||
types.SimpleNamespace(input_objs=[self.left, self.right]), method="merge" | ||
) | ||
|
@@ -1167,6 +1174,14 @@ def _indicator_name(self) -> str | None: | |
def _indicator_pre_merge( | ||
self, left: DataFrame, right: DataFrame | ||
) -> tuple[DataFrame, DataFrame]: | ||
""" | ||
Add one indicator column to each of the left and right inputs to a | ||
merge operation. | ||
|
||
|
||
These columns are used to produce another column in the output of the | ||
merge, indicating for each row of the output whether it was produced | ||
using the left, right or both inputs. | ||
""" | ||
columns = left.columns.union(right.columns) | ||
|
||
for i in ["_left_indicator", "_right_indicator"]: | ||
|
@@ -1193,6 +1208,12 @@ def _indicator_pre_merge( | |
|
||
@final | ||
def _indicator_post_merge(self, result: DataFrame) -> DataFrame: | ||
""" | ||
Add an indicator column to the merge result. | ||
|
||
This column indicates for each row of the output whether it was produced using | ||
the left, right or both inputs. | ||
""" | ||
result["_left_indicator"] = result["_left_indicator"].fillna(0) | ||
result["_right_indicator"] = result["_right_indicator"].fillna(0) | ||
|
||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -1,13 +1,13 @@ | ||||||
""" | ||||||
An exhaustive list of pandas methods exercising NDFrame.__finalize__. | ||||||
""" | ||||||
"""An exhaustive list of pandas methods exercising NDFrame.__finalize__.""" | ||||||
|
||||||
|
||||||
import operator | ||||||
import re | ||||||
|
||||||
import numpy as np | ||||||
import pytest | ||||||
|
||||||
from pandas._typing import MergeHow | ||||||
|
||||||
import pandas as pd | ||||||
|
||||||
# TODO: | ||||||
|
@@ -148,14 +148,6 @@ | |||||
operator.methodcaller("melt", id_vars=["A"], value_vars=["B"]), | ||||||
), | ||||||
(pd.DataFrame, frame_data, operator.methodcaller("map", lambda x: x)), | ||||||
pytest.param( | ||||||
( | ||||||
pd.DataFrame, | ||||||
frame_data, | ||||||
operator.methodcaller("merge", pd.DataFrame({"A": [1]})), | ||||||
), | ||||||
marks=not_implemented_mark, | ||||||
), | ||||||
(pd.DataFrame, frame_data, operator.methodcaller("round", 2)), | ||||||
(pd.DataFrame, frame_data, operator.methodcaller("corr")), | ||||||
pytest.param( | ||||||
|
@@ -371,8 +363,7 @@ def idfn(x): | |||||
m = xpr.search(str(x)) | ||||||
if m: | ||||||
return m.group(1) | ||||||
else: | ||||||
return str(x) | ||||||
Comment on lines
374
to
375
There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. This seems like just a style preference? Can you revert this change? |
||||||
return str(x) | ||||||
|
||||||
|
||||||
@pytest.mark.parametrize("ndframe_method", _all_methods, ids=lambda x: idfn(x[-1])) | ||||||
|
@@ -586,7 +577,8 @@ def test_datetime_property(attr): | |||||
|
||||||
|
||||||
@pytest.mark.parametrize( | ||||||
"attr", ["days", "seconds", "microseconds", "nanoseconds", "components"] | ||||||
"attr", | ||||||
["days", "seconds", "microseconds", "nanoseconds", "components"], | ||||||
|
||||||
) | ||||||
def test_timedelta_property(attr): | ||||||
s = pd.Series(pd.timedelta_range("2000", periods=4)) | ||||||
|
@@ -630,7 +622,8 @@ def test_categorical_accessor(method): | |||||
|
||||||
|
||||||
@pytest.mark.parametrize( | ||||||
"obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})] | ||||||
"obj", | ||||||
[pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], | ||||||
|
||||||
) | ||||||
@pytest.mark.parametrize( | ||||||
"method", | ||||||
|
@@ -649,7 +642,8 @@ def test_groupby_finalize(obj, method): | |||||
|
||||||
|
||||||
@pytest.mark.parametrize( | ||||||
"obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})] | ||||||
"obj", | ||||||
[pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], | ||||||
|
||||||
) | ||||||
@pytest.mark.parametrize( | ||||||
"method", | ||||||
|
@@ -675,3 +669,154 @@ def test_finalize_frame_series_name(): | |||||
df = pd.DataFrame({"name": [1, 2]}) | ||||||
result = pd.Series([1, 2]).__finalize__(df) | ||||||
assert result.name is None | ||||||
|
||||||
|
||||||
# ---------------------------------------------------------------------------- | ||||||
# Tests for merge | ||||||
|
# Tests for merge | |
# Reshaping |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you remove docstrings from tests? We generally do not add these. Comments are fine if they add something that is not already in the code itself.
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you remove these arrange/act/assert comments?
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this will check whether or not a deep copy is made. Recommend doing
assert result.attrs is not left.attrs
assert result.attrs["a"] is not left.attrs["a"]
instead. Also I recommend adding in `right` as well for good measure.
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Instead of having module-level variables, can you use indicator strings in the parametrization (e.g. "no_metadata", "has_metadata") and then do something like:
if left == "no_metadata":
left = pd.DataFrame(...)
else:
...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As mentioned, remove the note.