diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4ab20623cc561..defe4710ad3fd 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1022,11 +1022,11 @@ Performance improvements - Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`) - Performance improvement in :meth:`DataFrame.stack` when using ``future_stack=True`` and the DataFrame does not have a :class:`MultiIndex` (:issue:`58391`) - Performance improvement in :meth:`DataFrame.where` when ``cond`` is a :class:`DataFrame` with many columns (:issue:`61010`) +- Performance improvement in :meth:`merge` and :meth:`DataFrame.merge`. Users can now use prefixes, or both suffixes and prefixes, to differentiate duplicated columns (:issue:`63014`) - Performance improvement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . (:issue:`58248`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) - Performance improvement in unary methods on a :class:`RangeIndex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57825`) - .. --------------------------------------------------------------------------- .. 
_whatsnew_300.bug_fixes: @@ -1044,6 +1044,7 @@ Categorical - Bug in :meth:`DataFrame.pivot` and :meth:`DataFrame.set_index` raising an ``ArrowNotImplementedError`` for columns with pyarrow dictionary dtype (:issue:`53051`) - Bug in :meth:`Series.convert_dtypes` with ``dtype_backend="pyarrow"`` where empty :class:`CategoricalDtype` :class:`Series` raised an error or got converted to ``null[pyarrow]`` (:issue:`59934`) + Datetimelike ^^^^^^^^^^^^ - Bug in :attr:`is_year_start` where a DateTimeIndex constructed via a date_range with frequency 'MS' wouldn't have the correct year or quarter start attributes (:issue:`57377`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c8c246434f6d8..f92920661694f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -368,6 +368,15 @@ sort : bool, default False Sort the join keys lexicographically in the result DataFrame. If False, the order of the join keys depends on the join type (how keyword). +diff_option : {"suffix", "prefix", "both"}, default "suffix" + How overlapping column names in `left` and `right` are differentiated. + If the value is "suffix", the duplicated columns will be differentiated + using the suffixes provided by parameter "suffixes". + If the value is "prefix", the duplicated columns will be differentiated + using the prefixes provided by parameter "prefixes". + If the value is "both", the duplicated columns will be differentiated + using both the suffixes provided by parameter "suffixes" and + the prefixes provided by parameter "prefixes". suffixes : list-like, default is ("_x", "_y") A length-2 sequence where each element is optionally a string indicating the suffix to add to overlapping column names in @@ -375,6 +384,13 @@ of a string to indicate that the column name from `left` or `right` should be left as-is, with no suffix. At least one of the values must not be None. 
+prefixes : list-like, default is (``"a_"``, ``"b_"``) + A length-2 sequence where each element is optionally a string + indicating the prefix to add to overlapping column names in + `left` and `right` respectively. Pass a value of `None` instead + of a string to indicate that the column name from `left` or + `right` should be left as-is, with no prefix. At least one of the + values must not be None. copy : bool, default False If False, avoid copy if possible. @@ -11437,7 +11453,9 @@ def merge( left_index: bool = False, right_index: bool = False, sort: bool = False, + diff_option: Literal["prefix", "suffix", "both"] = "suffix", suffixes: Suffixes = ("_x", "_y"), + prefixes: Sequence[str | None] = ("a_", "b_"), copy: bool | lib.NoDefault = lib.no_default, indicator: str | bool = False, validate: MergeValidate | None = None, @@ -11456,7 +11474,9 @@ def merge( left_index=left_index, right_index=right_index, sort=sort, + diff_option=diff_option, suffixes=suffixes, + prefixes=prefixes, indicator=indicator, validate=validate, ) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0de300dcaf55f..68349a83e2e86 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -153,7 +153,11 @@ def merge( left_index: bool = False, right_index: bool = False, sort: bool = False, + diff_option: Literal[ + "prefix", "suffix", "both" + ] = "suffix", # add new parameter prefixes diff_option suffixes: Suffixes = ("_x", "_y"), + prefixes: Sequence[str | None] = ("a_", "b_"), # add new parameter prefixes copy: bool | lib.NoDefault = lib.no_default, indicator: str | bool = False, validate: str | None = None, @@ -221,6 +225,15 @@ def merge( sort : bool, default False Sort the join keys lexicographically in the result DataFrame. If False, the order of the join keys depends on the join type (how keyword). + diff_option : {"suffix", "prefix", "both"}, default "suffix" + How overlapping column names in `left` and `right` are differentiated. 
+ If the value is "suffix", the duplicated columns will be differentiated + using the suffixes provided by parameter "suffixes". + If the value is "prefix", the duplicated columns will be differentiated + using the prefixes provided by parameter "prefixes". + If the value is "both", the duplicated columns will be differentiated + using both the suffixes provided by parameter "suffixes" and + the prefixes provided by parameter "prefixes". suffixes : list-like, default is ("_x", "_y") A length-2 sequence where each element is optionally a string indicating the suffix to add to overlapping column names in @@ -228,6 +241,13 @@ def merge( of a string to indicate that the column name from `left` or `right` should be left as-is, with no suffix. At least one of the values must not be None. + prefixes : list-like, default is (``"a_"``, ``"b_"``) + A length-2 sequence where each element is optionally a string + indicating the prefix to add to overlapping column names in + `left` and `right` respectively. Pass a value of `None` instead + of a string to indicate that the column name from `left` or + `right` should be left as-is, with no prefix. At least one of the + values must not be None. copy : bool, default False If False, avoid copy if possible. @@ -370,6 +390,13 @@ def merge( left_df = _validate_operand(left) left._check_copy_deprecation(copy) right_df = _validate_operand(right) + + if diff_option != "prefix" and diff_option != "suffix" and diff_option != "both": + raise ValueError( + "Parameter 'diff_option' is wrong, please choose from 'prefix'" + ", 'suffix' and 'both'." 
+ ) + if how == "cross": return _cross_merge( left_df, @@ -380,7 +407,9 @@ def merge( left_index=left_index, right_index=right_index, sort=sort, + diff_option=diff_option, suffixes=suffixes, + prefixes=prefixes, indicator=indicator, validate=validate, ) @@ -395,7 +424,9 @@ def merge( left_index=left_index, right_index=right_index, sort=sort, + diff_option=diff_option, suffixes=suffixes, + prefixes=prefixes, indicator=indicator, validate=validate, ) @@ -411,7 +442,11 @@ def _cross_merge( left_index: bool = False, right_index: bool = False, sort: bool = False, + diff_option: Literal[ + "prefix", "suffix", "both" + ] = "suffix", # add new parameter prefixes diff_option suffixes: Suffixes = ("_x", "_y"), + prefixes: Sequence[str | None] = ("a_", "b_"), # add new parameter prefixes indicator: str | bool = False, validate: str | None = None, ) -> DataFrame: @@ -447,7 +482,9 @@ def _cross_merge( left_index=left_index, right_index=right_index, sort=sort, + diff_option=diff_option, suffixes=suffixes, + prefixes=prefixes, indicator=indicator, validate=validate, ) @@ -954,7 +991,9 @@ class _MergeOperation: left_index: bool right_index: bool sort: bool + diff_option: Literal["prefix", "suffix", "both"] suffixes: Suffixes + prefixes: Sequence[str | None] indicator: str | bool validate: str | None join_names: list[Hashable] @@ -972,7 +1011,11 @@ def __init__( left_index: bool = False, right_index: bool = False, sort: bool = True, + diff_option: Literal[ + "prefix", "suffix", "both" + ] = "suffix", # add new parameter prefixes diff_option suffixes: Suffixes = ("_x", "_y"), + prefixes: Sequence[str | None] = ("a_", "b_"), # add new parameter prefixes indicator: str | bool = False, validate: str | None = None, ) -> None: @@ -985,6 +1028,8 @@ def __init__( self.on = com.maybe_make_list(on) self.suffixes = suffixes + self.prefixes = prefixes + self.diff_option = diff_option self.sort = sort or how == "outer" self.left_index = left_index @@ -1094,8 +1139,12 @@ def _reindex_and_concat( 
left = self.left[:] right = self.right[:] - llabels, rlabels = _items_overlap_with_suffix( - self.left._info_axis, self.right._info_axis, self.suffixes + llabels, rlabels = _items_overlap_with_suffix_or_prefix( + self.left._info_axis, + self.right._info_axis, + self.suffixes, + self.prefixes, + self.diff_option, ) if left_indexer is not None and not is_range_indexer(left_indexer, len(left)): @@ -3059,54 +3108,84 @@ def _validate_operand(obj: DataFrame | Series) -> DataFrame: ) -def _items_overlap_with_suffix( - left: Index, right: Index, suffixes: Suffixes +def _items_overlap_with_suffix_or_prefix( + left: Index, + right: Index, + suffixes: Suffixes, + prefixes: Sequence[str | None], + diff_option: Literal["prefix", "suffix", "both"], ) -> tuple[Index, Index]: """ - Suffixes type validation. + Suffixes and Prefixes type validation. - If two indices overlap, add suffixes to overlapping entries. + If two indices overlap, add suffixes and prefixes to overlapping entries. - If corresponding suffix is empty, the entry is simply converted to string. + If corresponding suffix and prefix are empty, + the entry is simply converted to string. """ - if not is_list_like(suffixes, allow_sets=False) or isinstance(suffixes, dict): + if (diff_option == "both" or diff_option == "suffix") and ( + not is_list_like(suffixes, allow_sets=False) or isinstance(suffixes, dict) + ): raise TypeError( f"Passing 'suffixes' as a {type(suffixes)}, is not supported. " "Provide 'suffixes' as a tuple instead." ) + if (diff_option == "both" or diff_option == "prefix") and ( + not is_list_like(prefixes, allow_sets=False) or isinstance(prefixes, dict) + ): + raise TypeError( + f"Passing 'prefixes' as a {type(prefixes)}, is not supported. " + "Provide 'prefixes' as a tuple instead." 
+ ) to_rename = left.intersection(right) if len(to_rename) == 0: return left, right - lsuffix, rsuffix = suffixes + if diff_option == "both" or diff_option == "suffix": + lsuffix, rsuffix = suffixes + else: + lsuffix, rsuffix = None, None - if not lsuffix and not rsuffix: - raise ValueError(f"columns overlap but no suffix specified: {to_rename}") + if diff_option == "both" or diff_option == "prefix": + lprefix, rprefix = prefixes + else: + lprefix, rprefix = None, None + + if not lsuffix and not rsuffix and not lprefix and not rprefix: + raise ValueError( + f"columns overlap but no suffix or prefix specified: {to_rename}" + ) - def renamer(x, suffix: str | None): + def renamer(x, suffix: str | None, prefix: str | None): """ Rename the left and right indices. - If there is overlap, and suffix is not None, add - suffix, otherwise, leave it as-is. + If there is overlap, and suffix or prefix is not None, add + suffix or prefix(or both if both are provided), otherwise, leave it as-is. Parameters ---------- x : original column name suffix : str or None + prefix : str or None Returns ------- x : renamed column name """ - if x in to_rename and suffix is not None: - return f"{x}{suffix}" + ret = x + if x in to_rename: + if suffix is not None: + ret = f"{ret}{suffix}" + if prefix is not None: + ret = f"{prefix}{ret}" + return ret return x - lrenamer = partial(renamer, suffix=lsuffix) - rrenamer = partial(renamer, suffix=rsuffix) + lrenamer = partial(renamer, suffix=lsuffix, prefix=lprefix) + rrenamer = partial(renamer, suffix=rsuffix, prefix=rprefix) llabels = left._transform_index(lrenamer) rlabels = right._transform_index(rrenamer) @@ -3123,7 +3202,8 @@ def renamer(x, suffix: str | None): dups.extend(rlabels.intersection(left.difference(to_rename)).tolist()) if dups: raise MergeError( - f"Passing 'suffixes' which cause duplicate columns {set(dups)} is " + f"Passing 'suffixes' or/and 'prefixes' " + f"which cause duplicate columns {set(dups)} is " "not allowed.", ) diff 
--git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index c866b81a5349e..4011373864c52 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -783,7 +783,7 @@ def test_join_dups(self): # GH 40991: As of 2.0 causes duplicate columns with pytest.raises( pd.errors.MergeError, - match="Passing 'suffixes' which cause duplicate columns", + match="Passing 'suffixes' or/and 'prefixes' which cause duplicate columns", ): dta.merge(w, left_index=True, right_index=True) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d188cb396a7da..83672fd512244 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2394,6 +2394,160 @@ def test_merge_suffix(col1, col2, kwargs, expected_cols): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "col1, col2, kwargs, expected_cols", + [ + ( + 0, + 0, + {"prefixes": ("", "dup_"), "suffixes": ("", "_dup"), "diff_option": "both"}, + ["0", "dup_0_dup"], + ), + ( + 0, + 0, + { + "prefixes": (None, "dup_"), + "suffixes": (None, "_dup"), + "diff_option": "both", + }, + [0, "dup_0_dup"], + ), + ( + 0, + 0, + {"prefixes": ("x_", "y_"), "suffixes": ("_x", "_y"), "diff_option": "both"}, + ["x_0_x", "y_0_y"], + ), + ( + 0, + 0, + {"prefixes": ["x_", "y_"], "suffixes": ["_x", "_y"], "diff_option": "both"}, + ["x_0_x", "y_0_y"], + ), + ( + "a", + 0, + {"prefixes": (None, "y_"), "suffixes": (None, "_y"), "diff_option": "both"}, + ["a", 0], + ), + ( + 0.0, + 0.0, + {"prefixes": ("x_", None), "suffixes": ("_x", None), "diff_option": "both"}, + ["x_0.0_x", 0.0], + ), + ( + "b", + "b", + {"prefixes": (None, "y_"), "suffixes": (None, "_y"), "diff_option": "both"}, + ["b", "y_b_y"], + ), + ( + "a", + "a", + {"prefixes": ("x_", None), "suffixes": ("_x", None), "diff_option": "both"}, + ["x_a_x", "a"], + ), + ( + "a", + "b", + {"prefixes": ("x_", None), "suffixes": 
("_x", None), "diff_option": "both"}, + ["a", "b"], + ), + ( + "a", + "a", + {"prefixes": (None, "x_"), "suffixes": (None, "_x"), "diff_option": "both"}, + ["a", "x_a_x"], + ), + ( + 0, + 0, + {"prefixes": ("a_", None), "suffixes": ("_a", None), "diff_option": "both"}, + ["a_0_a", 0], + ), + ("a", "a", {"diff_option": "both"}, ["a_a_x", "b_a_y"]), + (0, 0, {"diff_option": "both"}, ["a_0_x", "b_0_y"]), + (0.0, 0.0, {"diff_option": "both"}, ["a_0.0_x", "b_0.0_y"]), + ], +) +def test_merge_both(col1, col2, kwargs, expected_cols): + # issue: 24782 + a = DataFrame({col1: [1, 2, 3]}) + b = DataFrame({col2: [4, 5, 6]}) + + expected = DataFrame([[1, 4], [2, 5], [3, 6]], columns=expected_cols) + + result = a.merge(b, left_index=True, right_index=True, **kwargs) + tm.assert_frame_equal(result, expected) + + result = merge(a, b, left_index=True, right_index=True, **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "col1, col2, kwargs, expected_cols", + [ + (0, 0, {"prefixes": ("", "dup_"), "diff_option": "prefix"}, ["0", "dup_0"]), + (0, 0, {"prefixes": (None, "dup_"), "diff_option": "prefix"}, [0, "dup_0"]), + (0, 0, {"prefixes": ("x_", "y_"), "diff_option": "prefix"}, ["x_0", "y_0"]), + (0, 0, {"prefixes": ["x_", "y_"], "diff_option": "prefix"}, ["x_0", "y_0"]), + ("a", 0, {"prefixes": (None, "y_"), "diff_option": "prefix"}, ["a", 0]), + (0.0, 0.0, {"prefixes": ("x_", None), "diff_option": "prefix"}, ["x_0.0", 0.0]), + ("b", "b", {"prefixes": (None, "y_"), "diff_option": "prefix"}, ["b", "y_b"]), + ("a", "a", {"prefixes": ("x_", None), "diff_option": "prefix"}, ["x_a", "a"]), + ("a", "b", {"prefixes": ("x_", None), "diff_option": "prefix"}, ["a", "b"]), + ("a", "a", {"prefixes": (None, "x_"), "diff_option": "prefix"}, ["a", "x_a"]), + (0, 0, {"prefixes": ("a_", None), "diff_option": "prefix"}, ["a_0", 0]), + ("a", "a", {"diff_option": "prefix"}, ["a_a", "b_a"]), + (0, 0, {"diff_option": "prefix"}, ["a_0", "b_0"]), + (0.0, 0.0, 
{"diff_option": "prefix"}, ["a_0.0", "b_0.0"]), + ], +) +def test_merge_prefix(col1, col2, kwargs, expected_cols): + # issue: 24782 + a = DataFrame({col1: [1, 2, 3]}) + b = DataFrame({col2: [4, 5, 6]}) + + expected = DataFrame([[1, 4], [2, 5], [3, 6]], columns=expected_cols) + + result = a.merge(b, left_index=True, right_index=True, **kwargs) + tm.assert_frame_equal(result, expected) + + result = merge(a, b, left_index=True, right_index=True, **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "how,expected", + [ + ( + "right", + {"A": [100, 200, 300], "B1": [60, 70, np.nan], "B2": [600, 700, 800]}, + ), + ( + "outer", + { + "A": [1, 100, 200, 300], + "B1": [80, 60, 70, np.nan], + "B2": [np.nan, 600, 700, 800], + }, + ), + ], +) +def test_merge_duplicate_prefix(how, expected): + left_df = DataFrame({"A": [100, 200, 1], "B": [60, 70, 80]}) + right_df = DataFrame({"A": [100, 200, 300], "B": [600, 700, 800]}) + result = merge( + left_df, right_df, on="A", how=how, prefixes=("x_", "x_"), diff_option="prefix" + ) + expected = DataFrame(expected) + expected.columns = ["A", "x_B", "x_B"] + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "how,expected", [ @@ -2431,11 +2585,62 @@ def test_merge_suffix_error(col1, col2, suffixes): b = DataFrame({col2: [3, 4, 5]}) # TODO: might reconsider current raise behaviour, see issue 24782 - msg = "columns overlap but no suffix specified" + msg = "columns overlap but no suffix or prefix specified" with pytest.raises(ValueError, match=msg): merge(a, b, left_index=True, right_index=True, suffixes=suffixes) +@pytest.mark.parametrize( + "col1, col2, prefixes", + [("a", "a", (None, None)), ("a", "a", ("", None)), (0, 0, (None, ""))], +) +def test_merge_prefix_error(col1, col2, prefixes): + # issue: 24782 + a = DataFrame({col1: [1, 2, 3]}) + b = DataFrame({col2: [3, 4, 5]}) + + # TODO: might reconsider current raise behaviour, see issue 24782 + msg = "columns overlap but no suffix or 
prefix specified" + with pytest.raises(ValueError, match=msg): + merge( + a, + b, + left_index=True, + right_index=True, + prefixes=prefixes, + diff_option="prefix", + ) + + +@pytest.mark.parametrize( + "col1, col2, prefixes, suffixes", + [ + ("a", "a", (None, None), (None, None)), + ("a", "a", ("", None), ("", None)), + (0, 0, (None, ""), (None, "")), + ("a", "a", ("", None), (None, "")), + (0, 0, (None, ""), ("", None)), + ], +) +def test_merge_both_error(col1, col2, prefixes, suffixes): + # issue: 24782 + a = DataFrame({col1: [1, 2, 3]}) + b = DataFrame({col2: [3, 4, 5]}) + + # TODO: might reconsider current raise behaviour, see issue 24782 + msg = "columns overlap but no suffix or prefix specified" + with pytest.raises(ValueError, match=msg): + merge( + a, + b, + left_index=True, + right_index=True, + prefixes=prefixes, + suffixes=suffixes, + diff_option="both", + ) + + @pytest.mark.parametrize("suffixes", [{"left", "right"}, {"left": 0, "right": 0}]) def test_merge_suffix_raises(suffixes): a = DataFrame({"a": [1, 2, 3]}) @@ -2445,6 +2650,22 @@ def test_merge_suffix_raises(suffixes): merge(a, b, left_index=True, right_index=True, suffixes=suffixes) +@pytest.mark.parametrize("prefixes", [{"left", "right"}, {"left": 0, "right": 0}]) +def test_merge_prefix_raises(prefixes): + a = DataFrame({"a": [1, 2, 3]}) + b = DataFrame({"b": [3, 4, 5]}) + + with pytest.raises(TypeError, match="Passing 'prefixes' as a"): + merge( + a, + b, + left_index=True, + right_index=True, + prefixes=prefixes, + diff_option="prefix", + ) + + TWO_GOT_THREE = "2, got 3" if PY314 else "2" @@ -2468,6 +2689,33 @@ def test_merge_suffix_length_error(col1, col2, suffixes, msg): merge(a, b, left_index=True, right_index=True, suffixes=suffixes) +@pytest.mark.parametrize( + "col1, col2, prefixes, msg", + [ + ( + "a", + "a", + ("a", "b", "c"), + (rf"too many values to unpack \(expected {TWO_GOT_THREE}\)"), + ), + ("a", "a", tuple("a"), r"not enough values to unpack \(expected 2, got 1\)"), + ], +) 
+def test_merge_prefix_length_error(col1, col2, prefixes, msg): + a = DataFrame({col1: [1, 2, 3]}) + b = DataFrame({col2: [3, 4, 5]}) + + with pytest.raises(ValueError, match=msg): + merge( + a, + b, + left_index=True, + right_index=True, + prefixes=prefixes, + diff_option="prefix", + ) + + @pytest.mark.parametrize("cat_dtype", ["one", "two"]) @pytest.mark.parametrize("reverse", [True, False]) def test_merge_equal_cat_dtypes(cat_dtype, reverse): @@ -2754,13 +3002,56 @@ def test_merge_suffixes_produce_dup_columns_raises(): left = DataFrame({"a": [1, 2, 3], "b": 1, "b_x": 2}) right = DataFrame({"a": [1, 2, 3], "b": 2}) - with pytest.raises(MergeError, match="Passing 'suffixes' which cause duplicate"): + with pytest.raises( + MergeError, match="Passing 'suffixes' or/and 'prefixes' which cause duplicate" + ): merge(left, right, on="a") - with pytest.raises(MergeError, match="Passing 'suffixes' which cause duplicate"): + with pytest.raises( + MergeError, match="Passing 'suffixes' or/and 'prefixes' which cause duplicate" + ): merge(right, left, on="a", suffixes=("_y", "_x")) +def test_merge_prefixes_produce_dup_columns_raises(): + # GH#22818; Enforced in 2.0 + left = DataFrame({"a": [1, 2, 3], "b": 1, "a_b": 2}) + right = DataFrame({"a": [1, 2, 3], "b": 2}) + + with pytest.raises( + MergeError, match="Passing 'suffixes' or/and 'prefixes' which cause duplicate" + ): + merge(left, right, on="a", diff_option="prefix") + + with pytest.raises( + MergeError, match="Passing 'suffixes' or/and 'prefixes' which cause duplicate" + ): + merge(right, left, on="a", prefixes=("b_", "a_"), diff_option="prefix") + + +def test_merge_both_produce_dup_columns_raises(): + # GH#22818; Enforced in 2.0 + left = DataFrame({"a": [1, 2, 3], "b": 1, "a_b_x": 2}) + right = DataFrame({"a": [1, 2, 3], "b": 2}) + + with pytest.raises( + MergeError, match="Passing 'suffixes' or/and 'prefixes' which cause duplicate" + ): + merge(left, right, on="a", diff_option="both") + + with pytest.raises( + 
MergeError, match="Passing 'suffixes' or/and 'prefixes' which cause duplicate" + ): + merge( + right, + left, + on="a", + prefixes=("", "a_"), + suffixes=("", "_x"), + diff_option="both", + ) + + def test_merge_duplicate_columns_with_suffix_no_warning(): # GH#22818 # Do not raise warning when duplicates are caused by duplicates in origin @@ -2771,15 +3062,59 @@ def test_merge_duplicate_columns_with_suffix_no_warning(): tm.assert_frame_equal(result, expected) +def test_merge_duplicate_columns_with_prefix_no_warning(): + # GH#22818 + # Do not raise warning when duplicates are caused by duplicates in origin + left = DataFrame([[1, 1, 1], [2, 2, 2]], columns=["a", "b", "b"]) + right = DataFrame({"a": [1, 3], "b": 2}) + result = merge(left, right, on="a", diff_option="prefix") + expected = DataFrame([[1, 1, 1, 2]], columns=["a", "a_b", "a_b", "b_b"]) + tm.assert_frame_equal(result, expected) + + +def test_merge_duplicate_columns_with_both_no_warning(): + # GH#22818 + # Do not raise warning when duplicates are caused by duplicates in origin + left = DataFrame([[1, 1, 1], [2, 2, 2]], columns=["a", "b", "b"]) + right = DataFrame({"a": [1, 3], "b": 2}) + result = merge(left, right, on="a", diff_option="both") + expected = DataFrame([[1, 1, 1, 2]], columns=["a", "a_b_x", "a_b_x", "b_b_y"]) + tm.assert_frame_equal(result, expected) + + def test_merge_duplicate_columns_with_suffix_causing_another_duplicate_raises(): # GH#22818, Enforced in 2.0 # This should raise warning because suffixes cause another collision left = DataFrame([[1, 1, 1, 1], [2, 2, 2, 2]], columns=["a", "b", "b", "b_x"]) right = DataFrame({"a": [1, 3], "b": 2}) - with pytest.raises(MergeError, match="Passing 'suffixes' which cause duplicate"): + with pytest.raises( + MergeError, match="Passing 'suffixes' or/and 'prefixes' which cause duplicate" + ): merge(left, right, on="a") +def test_merge_duplicate_columns_with_prefix_causing_another_duplicate_raises(): + # GH#22818, Enforced in 2.0 + # This should raise 
warning because suffixes cause another collision + left = DataFrame([[1, 1, 1, 1], [2, 2, 2, 2]], columns=["a", "b", "b", "a_b"]) + right = DataFrame({"a": [1, 3], "b": 2}) + with pytest.raises( + MergeError, match="Passing 'suffixes' or/and 'prefixes' which cause duplicate" + ): + merge(left, right, on="a", diff_option="prefix") + + +def test_merge_duplicate_columns_with_both_causing_another_duplicate_raises(): + # GH#22818, Enforced in 2.0 + # This should raise warning because suffixes cause another collision + left = DataFrame([[1, 1, 1, 1], [2, 2, 2, 2]], columns=["a", "b", "b", "a_b_x"]) + right = DataFrame({"a": [1, 3], "b": 2}) + with pytest.raises( + MergeError, match="Passing 'suffixes' or/and 'prefixes' which cause duplicate" + ): + merge(left, right, on="a", diff_option="both") + + def test_merge_string_float_column_result(): # GH 13353 df1 = DataFrame([[1, 2], [3, 4]], columns=Index(["a", 114.0])) @@ -3105,6 +3440,23 @@ def test_merge_for_suffix_collisions(suffixes): merge(df1, df2, on="col1", suffixes=suffixes) +@pytest.mark.parametrize("prefixes", [("dup_", ""), ("", "dup_")]) +def test_merge_for_prefix_collisions(prefixes): + # GH#61402 + df1 = DataFrame({"col1": [1], "col2": [2]}) + df2 = DataFrame({"col1": [1], "col2": [2], "dup_col2": [3]}) + with pytest.raises(MergeError, match="duplicate columns"): + merge(df1, df2, on="col1", prefixes=prefixes, diff_option="prefix") + + +def test_merge_wrong_diff_option(): + prefixes = ("ax_", "bx_") + df1 = DataFrame({"col1": [1], "col2": [2]}) + df2 = DataFrame({"col1": [1], "col2": [2]}) + with pytest.raises(ValueError, match="Parameter 'diff_option' is wrong"): + merge(df1, df2, on="col1", prefixes=prefixes, diff_option="else") + + def test_merge_categorical_key_recursion(): # GH#56376 lt = CategoricalDtype(categories=np.asarray([1, 2, 3], dtype="int64")) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 7ae2fffa04205..cf0e3a0fe2276 100644 --- 
a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -652,7 +652,9 @@ def test_join_multi_levels_invalid(self, portfolio, household): portfolio2 = portfolio.copy() portfolio2.index.set_names(["household_id", "foo"]) - with pytest.raises(ValueError, match="columns overlap but no suffix specified"): + with pytest.raises( + ValueError, match="columns overlap but no suffix or prefix specified" + ): portfolio2.join(portfolio, how="inner") def test_join_multi_levels2(self):