Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ Other enhancements
- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
- :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)
- :func:`pandas.merge`, :meth:`DataFrame.merge` and :meth:`DataFrame.join` now support anti joins (``left_anti`` and ``right_anti``) in the ``how`` parameter (:issue:`42916`)
- :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`)
- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
- :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`).
Expand Down
4 changes: 3 additions & 1 deletion pandas/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,9 @@ def closed(self) -> bool:
AnyAll = Literal["any", "all"]

# merge
MergeHow = Literal["left", "right", "inner", "outer", "cross"]
MergeHow = Literal[
"left", "right", "inner", "outer", "cross", "left_anti", "right_anti"
]
MergeValidate = Literal[
"one_to_one",
"1:1",
Expand Down
14 changes: 12 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,8 @@
----------%s
right : DataFrame or named Series
Object to merge with.
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
default 'inner'
Type of merge to be performed.

* left: use only keys from left frame, similar to a SQL left outer join;
Expand All @@ -328,6 +329,10 @@
join; preserve the order of the left keys.
* cross: creates the cartesian product from both frames, preserves the order
of the left keys.
* left_anti: use only keys from left frame that are not in right frame, similar
to SQL left anti join; preserve key order.
* right_anti: use only keys from right frame that are not in left frame, similar
to SQL right anti join; preserve key order.
on : label or list
Column or index level names to join on. These must be found in both
DataFrames. If `on` is None and not merging on indexes then this defaults
Expand Down Expand Up @@ -10600,7 +10605,8 @@ def join(
values given, the `other` DataFrame must have a MultiIndex. Can
pass an array as the join key if it is not already contained in
the calling DataFrame. Like an Excel VLOOKUP operation.
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left'
how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
default 'left'
How to handle the operation of the two objects.

* left: use calling frame's index (or column if on is specified)
Expand All @@ -10612,6 +10618,10 @@ def join(
of the calling's one.
* cross: creates the cartesian product from both frames, preserves the order
of the left keys.
* left_anti: use set difference of calling frame's index and `other`'s
index.
* right_anti: use set difference of `other`'s index and calling frame's
index.
lsuffix : str, default ''
Suffix to use from left frame's overlapping columns.
rsuffix : str, default ''
Expand Down
97 changes: 86 additions & 11 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,8 @@ def merge(
First pandas object to merge.
right : DataFrame or named Series
Second pandas object to merge.
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
default 'inner'
Type of merge to be performed.

* left: use only keys from left frame, similar to a SQL left outer join;
Expand All @@ -193,6 +194,10 @@ def merge(
join; preserve the order of the left keys.
* cross: creates the cartesian product from both frames, preserves the order
of the left keys.
* left_anti: use only keys from left frame that are not in right frame, similar
to SQL left anti join; preserve key order.
* right_anti: use only keys from right frame that are not in left frame, similar
to SQL right anti join; preserve key order.
on : label or list
Column or index level names to join on. These must be found in both
DataFrames. If `on` is None and not merging on indexes then this defaults
Expand Down Expand Up @@ -953,7 +958,7 @@ def __init__(
self,
left: DataFrame | Series,
right: DataFrame | Series,
how: JoinHow | Literal["asof"] = "inner",
how: JoinHow | Literal["left_anti", "right_anti", "asof"] = "inner",
on: IndexLabel | AnyArrayLike | None = None,
left_on: IndexLabel | AnyArrayLike | None = None,
right_on: IndexLabel | AnyArrayLike | None = None,
Expand All @@ -968,7 +973,7 @@ def __init__(
_right = _validate_operand(right)
self.left = self.orig_left = _left
self.right = self.orig_right = _right
self.how = how
self.how, self.anti_join = self._validate_how(how)

self.on = com.maybe_make_list(on)

Expand Down Expand Up @@ -998,14 +1003,6 @@ def __init__(
)
raise MergeError(msg)

# GH 59435: raise when "how" is not a valid Merge type
merge_type = {"left", "right", "inner", "outer", "cross", "asof"}
if how not in merge_type:
raise ValueError(
f"'{how}' is not a valid Merge type: "
f"left, right, inner, outer, cross, asof"
)

self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)

(
Expand Down Expand Up @@ -1035,6 +1032,37 @@ def __init__(
if validate is not None:
self._validate_validate_kwd(validate)

@final
def _validate_how(
self, how: JoinHow | Literal["left_anti", "right_anti", "asof"]
) -> tuple[JoinHow | Literal["asof"], bool]:
"""
Validate the 'how' parameter and return the actual join type and whether
this is an anti join.
"""
# GH 59435: raise when "how" is not a valid Merge type
merge_type = {
"left",
"right",
"inner",
"outer",
"left_anti",
"right_anti",
"cross",
"asof",
}
if how not in merge_type:
raise ValueError(
f"'{how}' is not a valid Merge type: "
f"left, right, inner, outer, left_anti, right_anti, cross, asof"
)
anti_join = False
if how in {"left_anti", "right_anti"}:
how = how.split("_")[0] # type: ignore[assignment]
anti_join = True
how = cast(JoinHow | Literal["asof"], how)
return how, anti_join

def _maybe_require_matching_dtypes(
self, left_join_keys: list[ArrayLike], right_join_keys: list[ArrayLike]
) -> None:
Expand Down Expand Up @@ -1405,6 +1433,11 @@ def _get_join_info(
n = len(left_ax) if left_indexer is None else len(left_indexer)
join_index = default_index(n)

if self.anti_join:
join_index, left_indexer, right_indexer = self._handle_anti_join(
join_index, left_indexer, right_indexer
)

return join_index, left_indexer, right_indexer

@final
Expand Down Expand Up @@ -1447,6 +1480,48 @@ def _create_join_index(
return index.copy()
return index.take(indexer)

@final
def _handle_anti_join(
self,
join_index: Index,
left_indexer: npt.NDArray[np.intp] | None,
right_indexer: npt.NDArray[np.intp] | None,
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
"""
Handle anti join by returning the correct join index and indexers

Parameters
----------
join_index : Index
join index
left_indexer : np.ndarray[np.intp] or None
left indexer
right_indexer : np.ndarray[np.intp] or None
right indexer

Returns
-------
Index, np.ndarray[np.intp] or None, np.ndarray[np.intp] or None
"""
# Make sure indexers are not None
if left_indexer is None:
left_indexer = np.arange(len(self.left))
if right_indexer is None:
right_indexer = np.arange(len(self.right))

assert self.how in {"left", "right"}
if self.how == "left":
# Filter to rows where left keys are not in right keys
filt = right_indexer == -1
else:
# Filter to rows where right keys are not in left keys
filt = left_indexer == -1
join_index = join_index[filt]
left_indexer = left_indexer[filt]
right_indexer = right_indexer[filt]

return join_index, left_indexer, right_indexer

@final
def _get_merge_keys(
self,
Expand Down
15 changes: 14 additions & 1 deletion pandas/tests/frame/methods/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,20 @@ def test_join_index(float_frame):
tm.assert_index_equal(joined.index, float_frame.index.sort_values())
tm.assert_index_equal(joined.columns, expected_columns)

join_msg = "'foo' is not a valid Merge type: left, right, inner, outer, cross, asof"
# left anti
joined = f.join(f2, how="left_anti")
tm.assert_index_equal(joined.index, float_frame.index[:5])
tm.assert_index_equal(joined.columns, expected_columns)

# right anti
joined = f.join(f2, how="right_anti")
tm.assert_index_equal(joined.index, float_frame.index[10:][::-1])
tm.assert_index_equal(joined.columns, expected_columns)

join_msg = (
"'foo' is not a valid Merge type: left, right, inner, outer, "
"left_anti, right_anti, cross, asof"
)
with pytest.raises(ValueError, match=re.escape(join_msg)):
f.join(f2, how="foo")

Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1464,7 +1464,10 @@ def test_merge_how_validation(self):
data2 = DataFrame(
np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"]
)
msg = "'full' is not a valid Merge type: left, right, inner, outer, cross, asof"
msg = (
"'full' is not a valid Merge type: left, right, inner, outer, "
"left_anti, right_anti, cross, asof"
)
with pytest.raises(ValueError, match=re.escape(msg)):
data1.merge(data2, how="full")

Expand Down
Loading
Loading