4646from datafusion .plan import ExecutionPlan , LogicalPlan
4747from datafusion .record_batch import RecordBatchStream
4848
49+ from .functions import coalesce , col
50+
4951if TYPE_CHECKING :
5052 import pathlib
5153 from typing import Callable , Sequence
@@ -77,6 +79,31 @@ class JoinPreparation:
7779 drop_cols : list [str ]
7880
7981
82+ def _deduplicate_right (
83+ right : DataFrame , columns : Sequence [str ]
84+ ) -> tuple [DataFrame , list [str ]]:
85+ """Rename join columns on the right DataFrame for deduplication."""
86+ existing_columns = set (right .schema ().names )
87+ modified = right
88+ aliases : list [str ] = []
89+
90+ for col_name in columns :
91+ base_alias = f"__right_{ col_name } "
92+ alias = base_alias
93+ counter = 0
94+ while alias in existing_columns :
95+ counter += 1
96+ alias = f"{ base_alias } _{ counter } "
97+ if alias in existing_columns :
98+ alias = f"__temp_{ uuid .uuid4 ().hex [:8 ]} _{ col_name } "
99+
100+ modified = modified .with_column_renamed (col_name , alias )
101+ aliases .append (alias )
102+ existing_columns .add (alias )
103+
104+ return modified , aliases
105+
106+
80107# excerpt from deltalake
81108# https://github.com/apache/datafusion-python/pull/981#discussion_r1905619163
82109class Compression (Enum ):
@@ -730,10 +757,23 @@ def join(
730757 join_preparation .join_keys .right_names ,
731758 )
732759 )
733-
760+
761+ if (
762+ deduplicate
763+ and how in ("right" , "full" )
764+ and join_preparation .join_keys .on is not None
765+ ):
766+ for left_name , right_alias in zip (
767+ join_preparation .join_keys .left_names ,
768+ join_preparation .drop_cols ,
769+ ):
770+ result = result .with_column (
771+ left_name , coalesce (col (left_name ), col (right_alias ))
772+ )
773+
734774 if join_preparation .drop_cols :
735775 result = result .drop (* join_preparation .drop_cols )
736-
776+
737777 return result
738778
739779 def _prepare_join (
@@ -746,18 +786,18 @@ def _prepare_join(
746786 deduplicate : bool ,
747787 ) -> JoinPreparation :
748788 """Prepare join keys and handle deduplication if requested.
749-
789+
750790 This method combines join key resolution and deduplication preparation
751791 to avoid parameter handling duplication and provide a unified interface.
752-
792+
753793 Args:
754794 right: The right DataFrame to join with.
755795 on: Column names to join on in both dataframes.
756796 left_on: Join column of the left dataframe.
757797 right_on: Join column of the right dataframe.
758798 join_keys: Tuple of two lists of column names to join on. [Deprecated]
759799 deduplicate: If True, prepare right DataFrame for column deduplication.
760-
800+
761801 Returns:
762802 JoinPreparation containing resolved join keys, modified right DataFrame,
763803 and columns to drop after joining.
@@ -787,71 +827,41 @@ def _prepare_join(
787827
788828 if resolved_on is not None :
789829 if left_on is not None or right_on is not None :
790- error_msg = (
791- "`left_on` or `right_on` should not be provided with `on`. "
792- "Note: `deduplicate` must be specified as a keyword argument."
793- )
830+ error_msg = "`left_on` or `right_on` should not provided with `on`"
794831 raise ValueError (error_msg )
795832 left_on = resolved_on
796833 right_on = resolved_on
797834 elif left_on is not None or right_on is not None :
798835 if left_on is None or right_on is None :
799- error_msg = (
800- "`left_on` and `right_on` should both be provided. "
801- "Note: `deduplicate` must be specified as a keyword argument."
802- )
836+ error_msg = "`left_on` and `right_on` should both be provided."
803837 raise ValueError (error_msg )
804838 else :
805- error_msg = (
806- "Either `on` or both `left_on` and `right_on` should be provided. "
807- "Note: `deduplicate` must be specified as a keyword argument."
808- )
839+ error_msg = "either `on` or `left_on` and `right_on` should be provided."
809840 raise ValueError (error_msg )
810841
811842 # At this point, left_on and right_on are guaranteed to be non-None
812- assert left_on is not None and right_on is not None
813-
843+ if left_on is None or right_on is None : # pragma: no cover - sanity check
844+ msg = "join keys resolved to None"
845+ raise ValueError (msg )
846+
814847 left_names = [left_on ] if isinstance (left_on , str ) else list (left_on )
815848 right_names = [right_on ] if isinstance (right_on , str ) else list (right_on )
816-
817- join_keys_resolved = JoinKeys (
818- on = resolved_on , left_names = left_names , right_names = right_names
819- )
820-
821- # Step 2: Handle deduplication if requested
849+
822850 drop_cols : list [str ] = []
823851 modified_right = right
824-
852+
825853 if deduplicate and resolved_on is not None :
826- # Prepare deduplication by renaming columns in the right DataFrame
827- on_cols = [resolved_on ] if isinstance (resolved_on , str ) else list (resolved_on )
828-
829- # Get existing column names to avoid collisions
830- existing_columns = set (right .schema ().names )
831-
832- for col_name in on_cols :
833- # Generate a collision-safe temporary alias
834- base_alias = f"__right_{ col_name } "
835- alias = base_alias
836- counter = 0
837-
838- # Keep trying until we find a unique name
839- while alias in existing_columns :
840- counter += 1
841- alias = f"{ base_alias } _{ counter } "
842-
843- # If even that fails (very unlikely), use UUID
844- if alias in existing_columns :
845- alias = f"__temp_{ uuid .uuid4 ().hex [:8 ]} _{ col_name } "
846-
847- modified_right = modified_right .with_column_renamed (col_name , alias )
848- drop_cols .append (alias )
849- # Add the new alias to existing columns to avoid future collisions
850- existing_columns .add (alias )
851-
852- # Update right_names to use the new aliases
853- right_names = drop_cols .copy ()
854-
854+ on_cols = (
855+ [resolved_on ] if isinstance (resolved_on , str ) else list (resolved_on )
856+ )
857+ modified_right , aliases = _deduplicate_right (right , on_cols )
858+ drop_cols .extend (aliases )
859+ right_names = aliases .copy ()
860+
861+ join_keys_resolved = JoinKeys (
862+ on = resolved_on , left_names = left_names , right_names = right_names
863+ )
864+
855865 return JoinPreparation (
856866 join_keys = join_keys_resolved ,
857867 modified_right = modified_right ,
0 commit comments