Skip to content

ENH: Support 'left_anti' and 'right_anti' joins in pd.merge #60732

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ Other enhancements
- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
- :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)
- :func:`pandas.merge`, :meth:`DataFrame.merge` and :meth:`DataFrame.join` now support anti joins (``left_anti`` and ``right_anti``) in the ``how`` parameter (:issue:`42916`)
- :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`)
- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
- :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`).
Expand Down
4 changes: 3 additions & 1 deletion pandas/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,9 @@ def closed(self) -> bool:
AnyAll = Literal["any", "all"]

# merge
MergeHow = Literal["left", "right", "inner", "outer", "cross"]
MergeHow = Literal[
"left", "right", "inner", "outer", "cross", "left_anti", "right_anti"
]
MergeValidate = Literal[
"one_to_one",
"1:1",
Expand Down
14 changes: 12 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,8 @@
----------%s
right : DataFrame or named Series
Object to merge with.
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
default 'inner'
Type of merge to be performed.

* left: use only keys from left frame, similar to a SQL left outer join;
Expand All @@ -328,6 +329,10 @@
join; preserve the order of the left keys.
* cross: creates the cartesian product from both frames, preserves the order
of the left keys.
* left_anti: use only keys from left frame that are not in right frame, similar
to SQL left anti join; preserve key order.
* right_anti: use only keys from right frame that are not in left frame, similar
to SQL right anti join; preserve key order.
on : label or list
Column or index level names to join on. These must be found in both
DataFrames. If `on` is None and not merging on indexes then this defaults
Expand Down Expand Up @@ -10600,7 +10605,8 @@ def join(
values given, the `other` DataFrame must have a MultiIndex. Can
pass an array as the join key if it is not already contained in
the calling DataFrame. Like an Excel VLOOKUP operation.
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left'
how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
default 'left'
How to handle the operation of the two objects.

* left: use calling frame's index (or column if on is specified)
Expand All @@ -10612,6 +10618,10 @@ def join(
of the calling's one.
* cross: creates the cartesian product from both frames, preserves the order
of the left keys.
* left_anti: use set difference of calling frame's index and `other`'s
index.
* right_anti: use set difference of `other`'s index and calling frame's
index.
lsuffix : str, default ''
Suffix to use from left frame's overlapping columns.
rsuffix : str, default ''
Expand Down
97 changes: 86 additions & 11 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,8 @@ def merge(
First pandas object to merge.
right : DataFrame or named Series
Second pandas object to merge.
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
default 'inner'
Type of merge to be performed.

* left: use only keys from left frame, similar to a SQL left outer join;
Expand All @@ -193,6 +194,10 @@ def merge(
join; preserve the order of the left keys.
* cross: creates the cartesian product from both frames, preserves the order
of the left keys.
* left_anti: use only keys from left frame that are not in right frame, similar
to SQL left anti join; preserve key order.
* right_anti: use only keys from right frame that are not in left frame, similar
to SQL right anti join; preserve key order.
on : label or list
Column or index level names to join on. These must be found in both
DataFrames. If `on` is None and not merging on indexes then this defaults
Expand Down Expand Up @@ -953,7 +958,7 @@ def __init__(
self,
left: DataFrame | Series,
right: DataFrame | Series,
how: JoinHow | Literal["asof"] = "inner",
how: JoinHow | Literal["left_anti", "right_anti", "asof"] = "inner",
on: IndexLabel | AnyArrayLike | None = None,
left_on: IndexLabel | AnyArrayLike | None = None,
right_on: IndexLabel | AnyArrayLike | None = None,
Expand All @@ -968,7 +973,7 @@ def __init__(
_right = _validate_operand(right)
self.left = self.orig_left = _left
self.right = self.orig_right = _right
self.how = how
self.how, self.anti_join = self._validate_how(how)

self.on = com.maybe_make_list(on)

Expand Down Expand Up @@ -998,14 +1003,6 @@ def __init__(
)
raise MergeError(msg)

# GH 59435: raise when "how" is not a valid Merge type
merge_type = {"left", "right", "inner", "outer", "cross", "asof"}
if how not in merge_type:
raise ValueError(
f"'{how}' is not a valid Merge type: "
f"left, right, inner, outer, cross, asof"
)

self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)

(
Expand Down Expand Up @@ -1035,6 +1032,37 @@ def __init__(
if validate is not None:
self._validate_validate_kwd(validate)

@final
def _validate_how(
self, how: JoinHow | Literal["left_anti", "right_anti", "asof"]
) -> tuple[JoinHow | Literal["asof"], bool]:
"""
Validate the 'how' parameter and return the actual join type and whether
this is an anti join.
"""
# GH 59435: raise when "how" is not a valid Merge type
merge_type = {
"left",
"right",
"inner",
"outer",
"left_anti",
"right_anti",
"cross",
"asof",
}
if how not in merge_type:
raise ValueError(
f"'{how}' is not a valid Merge type: "
f"left, right, inner, outer, left_anti, right_anti, cross, asof"
)
anti_join = False
if how in {"left_anti", "right_anti"}:
how = how.split("_")[0] # type: ignore[assignment]
anti_join = True
how = cast(JoinHow | Literal["asof"], how)
return how, anti_join

def _maybe_require_matching_dtypes(
self, left_join_keys: list[ArrayLike], right_join_keys: list[ArrayLike]
) -> None:
Expand Down Expand Up @@ -1405,6 +1433,11 @@ def _get_join_info(
n = len(left_ax) if left_indexer is None else len(left_indexer)
join_index = default_index(n)

if self.anti_join:
join_index, left_indexer, right_indexer = self._handle_anti_join(
join_index, left_indexer, right_indexer
)

return join_index, left_indexer, right_indexer

@final
Expand Down Expand Up @@ -1447,6 +1480,48 @@ def _create_join_index(
return index.copy()
return index.take(indexer)

@final
def _handle_anti_join(
self,
join_index: Index,
left_indexer: npt.NDArray[np.intp] | None,
right_indexer: npt.NDArray[np.intp] | None,
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
"""
Handle anti join by returning the correct join index and indexers

Parameters
----------
join_index : Index
join index
left_indexer : np.ndarray[np.intp] or None
left indexer
right_indexer : np.ndarray[np.intp] or None
right indexer

Returns
-------
Index, np.ndarray[np.intp] or None, np.ndarray[np.intp] or None
"""
# Make sure indexers are not None
if left_indexer is None:
left_indexer = np.arange(len(self.left))
if right_indexer is None:
right_indexer = np.arange(len(self.right))

assert self.how in {"left", "right"}
if self.how == "left":
# Filter to rows where left keys are not in right keys
filt = right_indexer == -1
else:
# Filter to rows where right keys are not in left keys
filt = left_indexer == -1
join_index = join_index[filt]
left_indexer = left_indexer[filt]
right_indexer = right_indexer[filt]

return join_index, left_indexer, right_indexer

@final
def _get_merge_keys(
self,
Expand Down
15 changes: 14 additions & 1 deletion pandas/tests/frame/methods/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,20 @@ def test_join_index(float_frame):
tm.assert_index_equal(joined.index, float_frame.index.sort_values())
tm.assert_index_equal(joined.columns, expected_columns)

join_msg = "'foo' is not a valid Merge type: left, right, inner, outer, cross, asof"
# left anti
joined = f.join(f2, how="left_anti")
tm.assert_index_equal(joined.index, float_frame.index[:5])
tm.assert_index_equal(joined.columns, expected_columns)

# right anti
joined = f.join(f2, how="right_anti")
tm.assert_index_equal(joined.index, float_frame.index[10:][::-1])
tm.assert_index_equal(joined.columns, expected_columns)

join_msg = (
"'foo' is not a valid Merge type: left, right, inner, outer, "
"left_anti, right_anti, cross, asof"
)
with pytest.raises(ValueError, match=re.escape(join_msg)):
f.join(f2, how="foo")

Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1464,7 +1464,10 @@ def test_merge_how_validation(self):
data2 = DataFrame(
np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"]
)
msg = "'full' is not a valid Merge type: left, right, inner, outer, cross, asof"
msg = (
"'full' is not a valid Merge type: left, right, inner, outer, "
"left_anti, right_anti, cross, asof"
)
with pytest.raises(ValueError, match=re.escape(msg)):
data1.merge(data2, how="full")

Expand Down
Loading
Loading