Skip to content

Commit a1fa112

Browse files
authored
fix: bpd.merge on common columns (#1905)
1 parent 84c96e1 commit a1fa112

File tree

2 files changed

+95
-10
lines changed

2 files changed

+95
-10
lines changed

bigframes/dataframe.py

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3434,15 +3434,9 @@ def merge(
34343434
)
34353435
return DataFrame(result_block)
34363436

3437-
if on is None:
3438-
if left_on is None or right_on is None:
3439-
raise ValueError("Must specify `on` or `left_on` + `right_on`.")
3440-
else:
3441-
if left_on is not None or right_on is not None:
3442-
raise ValueError(
3443-
"Can not pass both `on` and `left_on` + `right_on` params."
3444-
)
3445-
left_on, right_on = on, on
3437+
left_on, right_on = self._validate_left_right_on(
3438+
right, on, left_on=left_on, right_on=right_on
3439+
)
34463440

34473441
if utils.is_list_like(left_on):
34483442
left_on = list(left_on) # type: ignore
@@ -3479,6 +3473,41 @@ def merge(
34793473
)
34803474
return DataFrame(block)
34813475

3476+
def _validate_left_right_on(
3477+
self,
3478+
right: DataFrame,
3479+
on: Union[blocks.Label, Sequence[blocks.Label], None] = None,
3480+
*,
3481+
left_on: Union[blocks.Label, Sequence[blocks.Label], None] = None,
3482+
right_on: Union[blocks.Label, Sequence[blocks.Label], None] = None,
3483+
):
3484+
if on is not None:
3485+
if left_on is not None or right_on is not None:
3486+
raise ValueError(
3487+
"Can not pass both `on` and `left_on` + `right_on` params."
3488+
)
3489+
return on, on
3490+
3491+
if left_on is not None and right_on is not None:
3492+
return left_on, right_on
3493+
3494+
left_cols = self.columns
3495+
right_cols = right.columns
3496+
common_cols = left_cols.intersection(right_cols)
3497+
if len(common_cols) == 0:
3498+
raise ValueError(
3499+
"No common columns to perform merge on."
3500+
f"Merge options: left_on={left_on}, "
3501+
f"right_on={right_on}, "
3502+
)
3503+
if (
3504+
not left_cols.join(common_cols, how="inner").is_unique
3505+
or not right_cols.join(common_cols, how="inner").is_unique
3506+
):
3507+
raise ValueError(f"Data columns not unique: {repr(common_cols)}")
3508+
3509+
return common_cols, common_cols
3510+
34823511
def join(
34833512
self,
34843513
other: Union[DataFrame, bigframes.series.Series],

tests/system/small/test_pandas.py

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
from datetime import datetime
16+
import re
1617
import typing
1718

1819
import pandas as pd
@@ -343,7 +344,7 @@ def test_merge_left_on_right_on(scalars_dfs, merge_how):
343344
assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
344345

345346

346-
def test_pd_merge_cross(scalars_dfs):
347+
def test_merge_cross(scalars_dfs):
347348
scalars_df, scalars_pandas_df = scalars_dfs
348349
left_columns = ["int64_col", "float64_col", "int64_too"]
349350
right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"]
@@ -398,6 +399,61 @@ def test_merge_series(scalars_dfs, merge_how):
398399
assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
399400

400401

402+
def test_merge_w_common_columns(scalars_dfs):
403+
scalars_df, scalars_pandas_df = scalars_dfs
404+
left_columns = ["int64_col", "int64_too"]
405+
right_columns = ["int64_col", "bool_col"]
406+
407+
df = bpd.merge(
408+
scalars_df[left_columns], scalars_df[right_columns], "inner", sort=True
409+
)
410+
411+
pd_result = pd.merge(
412+
scalars_pandas_df[left_columns],
413+
scalars_pandas_df[right_columns],
414+
"inner",
415+
sort=True,
416+
)
417+
assert_pandas_df_equal(df.to_pandas(), pd_result, ignore_order=True)
418+
419+
420+
def test_merge_raises_error_when_no_common_columns(scalars_dfs):
421+
scalars_df, _ = scalars_dfs
422+
left_columns = ["float64_col", "int64_too"]
423+
right_columns = ["int64_col", "bool_col"]
424+
425+
left = scalars_df[left_columns]
426+
right = scalars_df[right_columns]
427+
428+
with pytest.raises(
429+
ValueError,
430+
match="No common columns to perform merge on.",
431+
):
432+
bpd.merge(left, right, "inner")
433+
434+
435+
def test_merge_raises_error_when_left_right_on_set(scalars_dfs):
436+
scalars_df, _ = scalars_dfs
437+
left_columns = ["int64_col", "int64_too"]
438+
right_columns = ["int64_col", "bool_col"]
439+
440+
left = scalars_df[left_columns]
441+
right = scalars_df[right_columns]
442+
443+
with pytest.raises(
444+
ValueError,
445+
match=re.escape("Can not pass both `on` and `left_on` + `right_on` params."),
446+
):
447+
bpd.merge(
448+
left,
449+
right,
450+
"inner",
451+
left_on="int64_too",
452+
right_on="int64_col",
453+
on="int64_col",
454+
)
455+
456+
401457
def _convert_pandas_category(pd_s: pd.Series):
402458
"""
403459
Transforms a pandas Series with Categorical dtype into a bigframes-compatible

0 commit comments

Comments
 (0)