fix: handle case where join keys are different for sort-merge multi-partition join (#6243)

gweaverbiodev · web-flow · commit 7b81e3bae447 · 2026-02-24T15:24:39.000-08:00
## Changes Made

The current multi-partition sort-merge join implementation does not
correctly handle the case where the join keys in the left and right
DataFrames are different. This PR fixes the issue by doing the
following:

- Aliases the right keys to the left key names when generating the
samples used to determine partition boundaries
- Renames the columns of the materialized `boundaries` to the right key
names when applying the boundaries to create the right-side range
partition tasks
- Adds a regression test to ensure the fix works

---------

Co-authored-by: gmweaver <gmweaver.usc@gmail.com>
diff --git a/src/daft-distributed/src/pipeline_node/join/sort_merge_join.rs b/src/daft-distributed/src/pipeline_node/join/sort_merge_join.rs
@@ -5,6 +5,7 @@ use common_metrics::ops::{NodeCategory, NodeType};
 use daft_dsl::expr::bound_expr::BoundExpr;
 use daft_local_plan::{LocalNodeContext, LocalPhysicalPlan};
 use daft_logical_plan::{JoinType, stats::StatsState};
+use daft_recordbatch::RecordBatch;
 use daft_schema::schema::SchemaRef;
 use futures::{TryStreamExt, future::try_join_all};
 
@@ -174,18 +175,35 @@ impl SortMergeJoinNode {
             scheduler_handle,
         )?;
 
+        let left_boundary_key_names = self
+            .left_on
+            .iter()
+            .map(|expr| {
+                expr.inner()
+                    .to_field(&self.left.config().schema)
+                    .map(|f| f.name)
+            })
+            .collect::<DaftResult<Vec<_>>>()?;
+
+        let right_sample_by_aliased = self
+            .right_on
+            .iter()
+            .zip(left_boundary_key_names.into_iter())
+            .map(|(expr, key_name)| BoundExpr::new_unchecked(expr.inner().alias(key_name)))
+            .collect::<Vec<_>>();
+
         // Sample right side
         let right_sample_tasks = create_sample_tasks(
             right_materialized.clone(),
             self.right.config().schema.clone(),
-            self.right_on.clone(),
+            right_sample_by_aliased,
             self.as_ref(),
             task_id_counter,
             scheduler_handle,
         )?;
 
         // Collect all samples
-        let sampled_outputs = try_join_all(
+        let combined_sampled_outputs = try_join_all(
             left_sample_tasks
                 .into_iter()
                 .chain(right_sample_tasks.into_iter()),
@@ -196,8 +214,8 @@ impl SortMergeJoinNode {
         .collect::<Vec<_>>();
 
         // Compute partition boundaries from combined samples
-        let boundaries = get_partition_boundaries_from_samples(
-            sampled_outputs,
+        let left_partition_boundaries = get_partition_boundaries_from_samples(
+            combined_sampled_outputs,
             &self.left_on,
             descending.clone(),
             nulls_first,
@@ -212,21 +230,40 @@ impl SortMergeJoinNode {
             left_schema,
             self.left_on.clone(),
             descending.clone(),
-            boundaries.clone(),
+            left_partition_boundaries.clone(),
             num_partitions,
             self.as_ref(),
             task_id_counter,
             scheduler_handle,
         )?;
 
+        let right_boundary_names = self
+            .right_on
+            .iter()
+            .map(|expr| {
+                expr.inner()
+                    .to_field(&self.right.config().schema)
+                    .map(|f| f.name)
+            })
+            .collect::<DaftResult<Vec<_>>>()?;
+
+        let right_partition_boundaries = RecordBatch::from_nonempty_columns(
+            left_partition_boundaries
+                .columns()
+                .iter()
+                .zip(right_boundary_names)
+                .map(|(series, name)| series.clone().rename(name))
+                .collect::<Vec<_>>(),
+        )?;
+
         // Range repartition right side
         let right_schema = self.right.config().schema.clone();
         let right_partition_tasks = create_range_repartition_tasks(
             right_materialized,
             right_schema,
             self.right_on.clone(),
             descending,
-            boundaries,
+            right_partition_boundaries,
             num_partitions,
             self.as_ref(),
             task_id_counter,
diff --git a/tests/dataframe/test_joins.py b/tests/dataframe/test_joins.py
@@ -1273,6 +1273,41 @@ def test_sort_merge_join_small_partitions(make_df, with_default_morsel_size):
     assert pd["rv"] == [200, 300]
 
 
+@pytest.mark.parametrize("left_partitions,right_partitions", [(2, 2), (4, 3), (8, 4)])
+def test_sort_merge_join_different_left_right_keys(
+    left_partitions, right_partitions, make_df, with_default_morsel_size
+):
+    if get_tests_daft_runner_name() == "native":
+        pytest.skip("Sort-merge joins are not supported on native runner")
+
+    left = make_df(
+        {"left_k": [1, 2, 3], "lv": [10, 20, 30]},
+        repartition=left_partitions,
+        repartition_columns=["left_k"],
+    )
+    right = make_df(
+        {"right_k": [2, 3, 4], "rv": [200, 300, 400]},
+        repartition=right_partitions,
+        repartition_columns=["right_k"],
+    )
+
+    out = left.join(
+        right,
+        left_on="left_k",
+        right_on="right_k",
+        how="inner",
+        strategy="sort_merge",
+    ).sort("left_k")
+
+    pd = out.to_pydict()
+    assert pd == {
+        "left_k": [2, 3],
+        "lv": [20, 30],
+        "right_k": [2, 3],
+        "rv": [200, 300],
+    }
+
+
 @pytest.mark.parametrize(
     "suffix,prefix,expected",
     [