7 changes: 7 additions & 0 deletions .gitconfig
@@ -0,0 +1,7 @@
+[user]
+	email = [email protected]
+	name = test
+[pull]
+	rebase = false
+[push]
+	default = simple
46 changes: 38 additions & 8 deletions pandas/core/reshape/merge.py
@@ -1963,34 +1963,65 @@ def _validate_validate_kwd(self, validate: str) -> None:
         else:
             right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique
 
+        def _get_dupes_msg(keys: list, side: str) -> str:
+            # Helper to get a message for duplicate keys
+            keys_mi = MultiIndex.from_arrays(keys)
+            dupes = keys_mi[keys_mi.duplicated()].unique()
+
+            # PR description says to cut off after a few values
+            max_dupes_to_show = 5
+            if len(dupes) > max_dupes_to_show:
+                dupes_to_show = dupes[:max_dupes_to_show]
+                extra_msg = f", showing first {max_dupes_to_show}"
+            else:
+                dupes_to_show = dupes
+                extra_msg = ""
+
+            # Show tuples for MultiIndex, but single values for single key
+            if keys_mi.nlevels == 1:
+                dupes_str = ", ".join(map(str, dupes_to_show.get_level_values(0)))
+            else:
+                dupes_str = ", ".join(map(str, list(dupes_to_show)))
+
+            return f". Duplicates in {side} key(s): [{dupes_str}]{extra_msg}."
+
         # Check data integrity
         if validate in ["one_to_one", "1:1"]:
             if not left_unique and not right_unique:
-                raise MergeError(
+                msg = (
                     "Merge keys are not unique in either left "
                     "or right dataset; not a one-to-one merge"
                 )
+                msg += _get_dupes_msg(self.left_join_keys, "left")
+                msg += _get_dupes_msg(self.right_join_keys, "right")
+                raise MergeError(msg)
             if not left_unique:
-                raise MergeError(
-                    "Merge keys are not unique in left dataset; not a one-to-one merge"
-                )
+                msg = "Merge keys are not unique in left dataset; not a one-to-one merge"
+                msg += _get_dupes_msg(self.left_join_keys, "left")
+                raise MergeError(msg)
             if not right_unique:
-                raise MergeError(
+                msg = (
                     "Merge keys are not unique in right dataset; not a one-to-one merge"
                 )
+                msg += _get_dupes_msg(self.right_join_keys, "right")
+                raise MergeError(msg)
 
         elif validate in ["one_to_many", "1:m"]:
             if not left_unique:
-                raise MergeError(
+                msg = (
                     "Merge keys are not unique in left dataset; not a one-to-many merge"
                 )
+                msg += _get_dupes_msg(self.left_join_keys, "left")
+                raise MergeError(msg)
 
         elif validate in ["many_to_one", "m:1"]:
             if not right_unique:
-                raise MergeError(
+                msg = (
                     "Merge keys are not unique in right dataset; "
                     "not a many-to-one merge"
                 )
+                msg += _get_dupes_msg(self.right_join_keys, "right")
+                raise MergeError(msg)
 
         elif validate in ["many_to_many", "m:m"]:
             pass
@@ -2009,7 +2040,6 @@ def _validate_validate_kwd(self, validate: str) -> None:
                 '- "many_to_many"'
             )
 
-
 def get_join_indexers(
     left_keys: list[ArrayLike],
     right_keys: list[ArrayLike],
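As an aside, here is a minimal standalone sketch (not part of the PR; the join-key value is made up for illustration) of the duplicate extraction that the new _get_dupes_msg helper performs for a single join key:

import pandas as pd
from pandas import MultiIndex

# Hypothetical single join key in which the value 1 appears twice.
left_join_keys = [pd.array([1, 1, 3])]

keys_mi = MultiIndex.from_arrays(left_join_keys)
# duplicated() marks second and later occurrences; unique() collapses repeats.
dupes = keys_mi[keys_mi.duplicated()].unique()

# Single-level key, so report plain values rather than 1-tuples.
print(", ".join(map(str, dupes.get_level_values(0))))  # prints: 1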
7 changes: 7 additions & 0 deletions reproduce.py
@@ -0,0 +1,7 @@
+import pandas as pd
+
+df = pd.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6]})
+try:
+    df.merge(df, on="a", validate="one_to_one")
+except pd.errors.MergeError as e:
+    print(e)
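Assuming the merge.py change above is applied, running reproduce.py should print something along these lines (both key columns are [1, 1, 3], so 1 is the only duplicated key on each side; the doubled period appears because each _get_dupes_msg call supplies both a leading and a trailing period):

Merge keys are not unique in either left or right dataset; not a one-to-one merge. Duplicates in left key(s): [1].. Duplicates in right key(s): [1].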