@@ -3520,70 +3520,138 @@ def join(
3520
3520
* ,
3521
3521
on : Optional [str ] = None ,
3522
3522
how : str = "left" ,
3523
+ lsuffix : str = "" ,
3524
+ rsuffix : str = "" ,
3523
3525
) -> DataFrame :
3524
3526
if isinstance (other , bigframes .series .Series ):
3525
3527
other = other .to_frame ()
3526
3528
3527
3529
left , right = self , other
3528
3530
3529
- if not left .columns .intersection (right .columns ).empty :
3530
- raise NotImplementedError (
3531
- f"Deduping column names is not implemented. { constants .FEEDBACK_LINK } "
3532
- )
3531
+ col_intersection = left .columns .intersection (right .columns )
3532
+
3533
+ if not col_intersection .empty :
3534
+ if lsuffix == rsuffix == "" :
3535
+ raise ValueError (
3536
+ f"columns overlap but no suffix specified: { col_intersection } "
3537
+ )
3538
+
3533
3539
if how == "cross" :
3534
3540
if on is not None :
3535
3541
raise ValueError ("'on' is not supported for cross join." )
3536
3542
result_block = left ._block .merge (
3537
3543
right ._block ,
3538
3544
left_join_ids = [],
3539
3545
right_join_ids = [],
3540
- suffixes = ("" , "" ),
3546
+ suffixes = (lsuffix , rsuffix ),
3541
3547
how = "cross" ,
3542
3548
sort = True ,
3543
3549
)
3544
3550
return DataFrame (result_block )
3545
3551
3546
3552
# Join left columns with right index
3547
3553
if on is not None :
3554
+ if left ._has_index and (on in left .index .names ):
3555
+ if on in left .columns :
3556
+ raise ValueError (
3557
+ f"'{ on } ' is both an index level and a column label, which is ambiguous."
3558
+ )
3559
+ else :
3560
+ raise NotImplementedError (
3561
+ f"Joining on index level '{ on } ' is not yet supported. { constants .FEEDBACK_LINK } "
3562
+ )
3563
+ if (left .columns == on ).sum () > 1 :
3564
+ raise ValueError (f"The column label '{ on } ' is not unique." )
3565
+
3548
3566
if other ._block .index .nlevels != 1 :
3549
3567
raise ValueError (
3550
3568
"Join on columns must match the index level of the other DataFrame. Join on column with multi-index haven't been supported."
3551
3569
)
3552
- # Switch left index with on column
3553
- left_columns = left .columns
3554
- left_idx_original_names = left .index .names if left ._has_index else ()
3555
- left_idx_names_in_cols = [
3556
- f"bigframes_left_idx_name_{ i } "
3557
- for i in range (len (left_idx_original_names ))
3558
- ]
3559
- if left ._has_index :
3560
- left .index .names = left_idx_names_in_cols
3561
- left = left .reset_index (drop = False )
3562
- left = left .set_index (on )
3563
-
3564
- # Join on index and switch back
3565
- combined_df = left ._perform_join_by_index (right , how = how )
3566
- combined_df .index .name = on
3567
- combined_df = combined_df .reset_index (drop = False )
3568
- combined_df = combined_df .set_index (left_idx_names_in_cols )
3569
-
3570
- # To be consistent with Pandas
3571
- if combined_df ._has_index :
3572
- combined_df .index .names = (
3573
- left_idx_original_names
3574
- if how in ("inner" , "left" )
3575
- else ([None ] * len (combined_df .index .names ))
3576
- )
3577
3570
3578
- # Reorder columns
3579
- combined_df = combined_df [list (left_columns ) + list (right .columns )]
3580
- return combined_df
3571
+ return self ._join_on_key (
3572
+ other ,
3573
+ on = on ,
3574
+ how = how ,
3575
+ lsuffix = lsuffix ,
3576
+ rsuffix = rsuffix ,
3577
+ should_duplicate_on_key = (on in col_intersection ),
3578
+ )
3581
3579
3582
3580
# Join left index with right index
3583
3581
if left ._block .index .nlevels != right ._block .index .nlevels :
3584
3582
raise ValueError ("Index to join on must have the same number of levels." )
3585
3583
3586
- return left ._perform_join_by_index (right , how = how )
3584
+ return left ._perform_join_by_index (right , how = how )._add_join_suffix (
3585
+ left .columns , right .columns , lsuffix = lsuffix , rsuffix = rsuffix
3586
+ )
3587
+
3588
def _join_on_key(
    self,
    other: DataFrame,
    on: str,
    how: str,
    lsuffix: str,
    rsuffix: str,
    should_duplicate_on_key: bool,
) -> DataFrame:
    """Join the ``on`` column of this frame against the index of ``other``.

    The ``on`` column is temporarily moved into the index so the join can be
    delegated to ``_perform_join_by_index``; afterwards the original index
    levels are restored and the columns are put back in
    "left columns, then right columns" order. All columns on both sides are
    given unique placeholder labels while this happens, and the final labels
    (including suffixes for overlapping names) are produced by
    ``_add_join_suffix``.

    Neither ``self`` nor ``other`` is modified: both are copied before any
    labels are reassigned.

    Args:
        other (DataFrame):
            The right-hand frame; its index is the join key.
        on (str):
            Label of the column in this frame to join on.
        how (str):
            Join type, forwarded unchanged to ``_perform_join_by_index``.
        lsuffix (str):
            Suffix for overlapping column labels coming from this frame.
        rsuffix (str):
            Suffix for overlapping column labels coming from ``other``.
        should_duplicate_on_key (bool):
            Whether ``on`` is also a column label in ``other``. When true,
            an extra copy of the key column is placed first in the result
            and keeps the unsuffixed label ``on``.

    Returns:
        DataFrame:
            The joined frame with final column labels applied.
    """
    # Work on copies: column/index labels are reassigned in place below, and
    # the caller's frames must not be left carrying the placeholder labels.
    left, right = self.copy(), other.copy()

    # Replace all columns names with unique names for reordering.
    left_col_original_names = left.columns
    on_col_name = "bigframes_left_col_on"
    dup_on_col_name = "bigframes_left_col_on_dup"
    left_col_temp_names = [
        f"bigframes_left_col_name_{i}" if col_name != on else on_col_name
        for i, col_name in enumerate(left_col_original_names)
    ]
    left.columns = pandas.Index(left_col_temp_names)

    # If the on column is also in the right frame, duplicate the column and
    # make the duplicate the first column; the duplicate becomes the join key
    # so the original copy stays among the regular left columns.
    if should_duplicate_on_key:
        left[dup_on_col_name] = left[on_col_name]
        on_col_name = dup_on_col_name
        left_col_temp_names = [on_col_name] + left_col_temp_names
        left = left[left_col_temp_names]

    # Switch left index with on column: park the existing index levels as
    # placeholder-named columns so they survive the join.
    left_idx_original_names = left.index.names if left._has_index else ()
    left_idx_names_in_cols = [
        f"bigframes_left_idx_name_{i}" for i in range(len(left_idx_original_names))
    ]
    if left._has_index:
        left.index.names = left_idx_names_in_cols
        left = left.reset_index(drop=False)
    left = left.set_index(on_col_name)

    right_col_original_names = right.columns
    right_col_temp_names = [
        f"bigframes_right_col_name_{i}"
        for i in range(len(right_col_original_names))
    ]
    right.columns = pandas.Index(right_col_temp_names)

    # Join on index and switch back.
    combined_df = left._perform_join_by_index(right, how=how)
    combined_df.index.name = on_col_name
    combined_df = combined_df.reset_index(drop=False)
    combined_df = combined_df.set_index(left_idx_names_in_cols)

    # To be consistent with Pandas: the left index names are kept only for
    # "inner" and "left" joins; other join types get unnamed index levels.
    if combined_df._has_index:
        combined_df.index.names = (
            left_idx_original_names
            if how in ("inner", "left")
            else ([None] * len(combined_df.index.names))
        )

    # Reorder columns
    combined_df = combined_df[left_col_temp_names + right_col_temp_names]
    return combined_df._add_join_suffix(
        left_col_original_names,
        right_col_original_names,
        lsuffix=lsuffix,
        rsuffix=rsuffix,
        # The duplicated key (if any) is the leading column and keeps the
        # plain ``on`` label.
        extra_col=on if should_duplicate_on_key else None,
    )
3587
3655
3588
3656
def _perform_join_by_index (
3589
3657
self ,
@@ -3597,6 +3665,59 @@ def _perform_join_by_index(
3597
3665
)
3598
3666
return DataFrame (block )
3599
3667
3668
+ def _add_join_suffix (
3669
+ self ,
3670
+ left_columns ,
3671
+ right_columns ,
3672
+ lsuffix : str = "" ,
3673
+ rsuffix : str = "" ,
3674
+ extra_col : typing .Optional [str ] = None ,
3675
+ ):
3676
+ """Applies suffixes to overlapping column names to mimic a pandas join.
3677
+
3678
+ This method identifies columns that are common to both a "left" and "right"
3679
+ set of columns and renames them using the provided suffixes. Columns that
3680
+ are not in the intersection are kept with their original names.
3681
+
3682
+ Args:
3683
+ left_columns (pandas.Index):
3684
+ The column labels from the left DataFrame.
3685
+ right_columns (pandas.Index):
3686
+ The column labels from the right DataFrame.
3687
+ lsuffix (str):
3688
+ The suffix to apply to overlapping column names from the left side.
3689
+ rsuffix (str):
3690
+ The suffix to apply to overlapping column names from the right side.
3691
+ extra_col (typing.Optional[str]):
3692
+ An optional column name to prepend to the final list of columns.
3693
+ This argument is used specifically to match the behavior of a
3694
+ pandas join. When a join key (i.e., the 'on' column) exists
3695
+ in both the left and right DataFrames, pandas creates two versions
3696
+ of that column: one copy keeps its original name and is placed as
3697
+ the first column, while the other instances receive the normal
3698
+ suffix. Passing the join key's name here replicates that behavior.
3699
+
3700
+ Returns:
3701
+ DataFrame:
3702
+ A new DataFrame with the columns renamed to resolve overlaps.
3703
+ """
3704
+ combined_df = self .copy ()
3705
+ col_intersection = left_columns .intersection (right_columns )
3706
+ final_col_names = [] if extra_col is None else [extra_col ]
3707
+ for col_name in left_columns :
3708
+ if col_name in col_intersection :
3709
+ final_col_names .append (f"{ col_name } { lsuffix } " )
3710
+ else :
3711
+ final_col_names .append (col_name )
3712
+
3713
+ for col_name in right_columns :
3714
+ if col_name in col_intersection :
3715
+ final_col_names .append (f"{ col_name } { rsuffix } " )
3716
+ else :
3717
+ final_col_names .append (col_name )
3718
+ combined_df .columns = pandas .Index (final_col_names )
3719
+ return combined_df
3720
+
3600
3721
@validations .requires_ordering ()
3601
3722
def rolling (
3602
3723
self ,
0 commit comments