Skip to content

Commit d385047

Browse files
committed
ENH: Support 'left_anti' and 'right_anti' joins in pd.merge
1 parent 72fd708 commit d385047

File tree

6 files changed

+355
-5
lines changed

6 files changed

+355
-5
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ Other enhancements
3333
- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
3434
- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
3535
- :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)
36+
- :func:`pandas.merge`, :meth:`DataFrame.merge` and :meth:`DataFrame.join` now support anti joins (``left_anti`` and ``right_anti``) in the ``how`` parameter (:issue:`42916`)
3637
- :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`)
3738
- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
3839
- :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`).

pandas/_typing.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -442,7 +442,9 @@ def closed(self) -> bool:
442442
AnyAll = Literal["any", "all"]
443443

444444
# merge
445-
MergeHow = Literal["left", "right", "inner", "outer", "cross"]
445+
MergeHow = Literal[
446+
"left", "right", "inner", "outer", "cross", "left_anti", "right_anti"
447+
]
446448
MergeValidate = Literal[
447449
"one_to_one",
448450
"1:1",

pandas/core/frame.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,8 @@
315315
----------%s
316316
right : DataFrame or named Series
317317
Object to merge with.
318-
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
318+
how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
319+
default 'inner'
319320
Type of merge to be performed.
320321
321322
* left: use only keys from left frame, similar to a SQL left outer join;
@@ -328,6 +329,10 @@
328329
join; preserve the order of the left keys.
329330
* cross: creates the cartesian product from both frames, preserves the order
330331
of the left keys.
332+
* left_anti: use only keys from left frame that are not in right frame, similar
333+
to SQL left anti join; preserve key order.
334+
* right_anti: use only keys from right frame that are not in left frame, similar
335+
to SQL right anti join; preserve key order.
331336
on : label or list
332337
Column or index level names to join on. These must be found in both
333338
DataFrames. If `on` is None and not merging on indexes then this defaults
@@ -10600,7 +10605,8 @@ def join(
1060010605
values given, the `other` DataFrame must have a MultiIndex. Can
1060110606
pass an array as the join key if it is not already contained in
1060210607
the calling DataFrame. Like an Excel VLOOKUP operation.
10603-
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left'
10608+
how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
10609+
default 'left'
1060410610
How to handle the operation of the two objects.
1060510611
1060610612
* left: use calling frame's index (or column if on is specified)
@@ -10612,6 +10618,10 @@ def join(
1061210618
of the calling's one.
1061310619
* cross: creates the cartesian product from both frames, preserves the order
1061410620
of the left keys.
10621+
* left_anti: use set difference of calling frame's index and `other`'s
10622+
index.
10623+
* right_anti: use set difference of `other`'s index and calling frame's
10624+
index.
1061510625
lsuffix : str, default ''
1061610626
Suffix to use from left frame's overlapping columns.
1061710627
rsuffix : str, default ''

pandas/core/reshape/merge.py

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,8 @@ def merge(
180180
First pandas object to merge.
181181
right : DataFrame or named Series
182182
Second pandas object to merge.
183-
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
183+
how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
184+
default 'inner'
184185
Type of merge to be performed.
185186
186187
* left: use only keys from left frame, similar to a SQL left outer join;
@@ -193,6 +194,10 @@ def merge(
193194
join; preserve the order of the left keys.
194195
* cross: creates the cartesian product from both frames, preserves the order
195196
of the left keys.
197+
* left_anti: use only keys from left frame that are not in right frame, similar
198+
to SQL left anti join; preserve key order.
199+
* right_anti: use only keys from right frame that are not in left frame, similar
200+
to SQL right anti join; preserve key order.
196201
on : label or list
197202
Column or index level names to join on. These must be found in both
198203
DataFrames. If `on` is None and not merging on indexes then this defaults
@@ -969,6 +974,7 @@ def __init__(
969974
self.left = self.orig_left = _left
970975
self.right = self.orig_right = _right
971976
self.how = how
977+
self.anti_join = False
972978

973979
self.on = com.maybe_make_list(on)
974980

@@ -999,12 +1005,24 @@ def __init__(
9991005
raise MergeError(msg)
10001006

10011007
# GH 59435: raise when "how" is not a valid Merge type
1002-
merge_type = {"left", "right", "inner", "outer", "cross", "asof"}
1008+
merge_type = {
1009+
"left",
1010+
"right",
1011+
"inner",
1012+
"outer",
1013+
"left_anti",
1014+
"right_anti",
1015+
"cross",
1016+
"asof",
1017+
}
10031018
if how not in merge_type:
10041019
raise ValueError(
10051020
f"'{how}' is not a valid Merge type: "
10061021
f"left, right, inner, outer, cross, asof"
10071022
)
1023+
if self.how in {"left_anti", "right_anti"}:
1024+
self.how = self.how.split("_")[0]
1025+
self.anti_join = True
10081026

10091027
self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)
10101028

@@ -1405,6 +1423,11 @@ def _get_join_info(
14051423
n = len(left_ax) if left_indexer is None else len(left_indexer)
14061424
join_index = default_index(n)
14071425

1426+
if self.anti_join:
1427+
join_index, left_indexer, right_indexer = self._handle_anti_join(
1428+
join_index, left_indexer, right_indexer
1429+
)
1430+
14081431
return join_index, left_indexer, right_indexer
14091432

14101433
@final
@@ -1447,6 +1470,48 @@ def _create_join_index(
14471470
return index.copy()
14481471
return index.take(indexer)
14491472

1473+
@final
1474+
def _handle_anti_join(
1475+
self,
1476+
join_index: Index,
1477+
left_indexer: npt.NDArray[np.intp] | None,
1478+
right_indexer: npt.NDArray[np.intp] | None,
1479+
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
1480+
"""
1481+
Handle anti join by returning the correct join index and indexers.
1482+
1483+
Parameters
1484+
----------
1485+
join_index : Index
1486+
join index
1487+
left_indexer : np.ndarray[np.intp] or None
1488+
left indexer
1489+
right_indexer : np.ndarray[np.intp] or None
1490+
right indexer
1491+
1492+
Returns
1493+
-------
1494+
Index, np.ndarray[np.intp] or None, np.ndarray[np.intp] or None
1495+
"""
1496+
# Make sure indexers are not None
1497+
if left_indexer is None:
1498+
left_indexer = np.arange(len(self.left))
1499+
if right_indexer is None:
1500+
right_indexer = np.arange(len(self.right))
1501+
1502+
assert self.how in {"left", "right"}
1503+
if self.how == "left":
1504+
# Filter to rows where left keys are not in right keys
1505+
filt = right_indexer == -1
1506+
else:
1507+
# Filter to rows where right keys are not in left keys
1508+
filt = left_indexer == -1
1509+
join_index = join_index[filt]
1510+
left_indexer = left_indexer[filt]
1511+
right_indexer = right_indexer[filt]
1512+
1513+
return join_index, left_indexer, right_indexer
1514+
14501515
@final
14511516
def _get_merge_keys(
14521517
self,

pandas/tests/frame/methods/test_join.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,16 @@ def test_join_index(float_frame):
277277
tm.assert_index_equal(joined.index, float_frame.index.sort_values())
278278
tm.assert_index_equal(joined.columns, expected_columns)
279279

280+
# left anti
281+
joined = f.join(f2, how="left_anti")
282+
tm.assert_index_equal(joined.index, float_frame.index[:5])
283+
tm.assert_index_equal(joined.columns, expected_columns)
284+
285+
# right anti
286+
joined = f.join(f2, how="right_anti")
287+
tm.assert_index_equal(joined.index, float_frame.index[10:][::-1])
288+
tm.assert_index_equal(joined.columns, expected_columns)
289+
280290
join_msg = "'foo' is not a valid Merge type: left, right, inner, outer, cross, asof"
281291
with pytest.raises(ValueError, match=re.escape(join_msg)):
282292
f.join(f2, how="foo")

0 commit comments

Comments
 (0)