diff --git a/.gitconfig b/.gitconfig
new file mode 100644
index 0000000000000..f89a028634167
--- /dev/null
+++ b/.gitconfig
@@ -0,0 +1,7 @@
+[user]
+	email = test@example.com
+	name = test
+[pull]
+	rebase = false
+[push]
+	default = simple
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 604181214ad44..b6d495272a3d0 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -1963,34 +1963,72 @@ def _validate_validate_kwd(self, validate: str) -> None:
         else:
             right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique
 
+        def _get_dupes_msg(keys: list, side: str) -> str:
+            """
+            Build a message clause listing the duplicated values in ``keys``.
+
+            The clause starts with ". " so it can be appended directly to a
+            base message; the caller adds the final period.
+            """
+            # At most this many distinct duplicated keys are listed, so the
+            # message stays readable when there are many duplicates.
+            max_dupes_to_show = 5
+
+            keys_mi = MultiIndex.from_arrays(keys)
+            dupes = keys_mi[keys_mi.duplicated()].unique()
+            if len(dupes) > max_dupes_to_show:
+                dupes = dupes[:max_dupes_to_show]
+                extra_msg = f", showing first {max_dupes_to_show}"
+            else:
+                extra_msg = ""
+
+            # Show plain values for a single key, tuples for multiple keys
+            if keys_mi.nlevels == 1:
+                dupes_str = ", ".join(map(str, dupes.get_level_values(0)))
+            else:
+                dupes_str = ", ".join(map(str, dupes))
+
+            return f". Duplicates in {side} key(s): [{dupes_str}]{extra_msg}"
+
         # Check data integrity
         if validate in ["one_to_one", "1:1"]:
             if not left_unique and not right_unique:
-                raise MergeError(
+                msg = (
                     "Merge keys are not unique in either left "
                     "or right dataset; not a one-to-one merge"
                 )
+                msg += _get_dupes_msg(self.left_join_keys, "left")
+                msg += _get_dupes_msg(self.right_join_keys, "right")
+                raise MergeError(f"{msg}.")
             if not left_unique:
-                raise MergeError(
+                msg = (
                     "Merge keys are not unique in left dataset; not a one-to-one merge"
                 )
+                msg += _get_dupes_msg(self.left_join_keys, "left")
+                raise MergeError(f"{msg}.")
             if not right_unique:
-                raise MergeError(
+                msg = (
                     "Merge keys are not unique in right dataset; not a one-to-one merge"
                 )
+                msg += _get_dupes_msg(self.right_join_keys, "right")
+                raise MergeError(f"{msg}.")
 
         elif validate in ["one_to_many", "1:m"]:
             if not left_unique:
-                raise MergeError(
+                msg = (
                     "Merge keys are not unique in left dataset; not a one-to-many merge"
                 )
+                msg += _get_dupes_msg(self.left_join_keys, "left")
+                raise MergeError(f"{msg}.")
 
         elif validate in ["many_to_one", "m:1"]:
             if not right_unique:
-                raise MergeError(
+                msg = (
                     "Merge keys are not unique in right dataset; "
                     "not a many-to-one merge"
                 )
+                msg += _get_dupes_msg(self.right_join_keys, "right")
+                raise MergeError(f"{msg}.")
 
         elif validate in ["many_to_many", "m:m"]:
             pass
diff --git a/reproduce.py b/reproduce.py
new file mode 100644
index 0000000000000..28a6d5b473fc9
--- /dev/null
+++ b/reproduce.py
@@ -0,0 +1,7 @@
+import pandas as pd
+
+df = pd.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6]})
+try:
+    df.merge(df, on="a", validate="one_to_one")
+except pd.errors.MergeError as e:
+    print(e)