Gene.bordegaray/2025/12/add broadcast exec #279
@@ -3,9 +3,9 @@ use crate::distributed_planner::plan_annotator::{
    AnnotatedPlan, RequiredNetworkBoundary, annotate_plan,
};
use crate::{
    DistributedConfig, DistributedExec, NetworkCoalesceExec, NetworkShuffleExec, TaskEstimator,
    BroadcastExec, DistributedConfig, DistributedExec, NetworkBroadcastExec, NetworkCoalesceExec,
    NetworkShuffleExec, TaskCountAnnotation, TaskEstimator,
};
use datafusion::common::internal_err;
use datafusion::common::tree_node::{Transformed, TreeNode};
use datafusion::config::ConfigOptions;
use datafusion::error::DataFusionError;
@@ -88,21 +88,58 @@ fn distribute_plan(
    stage_id: &mut usize,
) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
    let d_cfg = DistributedConfig::from_config_options(cfg)?;
    let mut children = annotated_plan.children;
    let parent_task_count = annotated_plan.task_count.as_usize();

    let children = annotated_plan.children;
    // This is a leaf node, so we need to scale it up with the final task count.
    if children.is_empty() {
        let scaled_up = d_cfg.__private_task_estimator.scale_up_leaf_node(
            &annotated_plan.plan,
            annotated_plan.task_count.as_usize(),
            parent_task_count,
            cfg,
        );
        return Ok(scaled_up.unwrap_or(annotated_plan.plan));
    }

    let parent_task_count = annotated_plan.task_count.as_usize();
    let max_child_task_count = children.iter().map(|v| v.task_count.as_usize()).max();
    // Broadcast requires different task counts for build vs probe.
    if annotated_plan.required_network_boundary == Some(RequiredNetworkBoundary::Broadcast) {
        let mut build = children.remove(0);
        let mut probe = children.remove(0);

        set_task_count_until_boundary(&mut probe, parent_task_count);

        // If there's only one consumer task, use Coalesce instead of Broadcast.
        let build_child: Arc<dyn ExecutionPlan> = if parent_task_count == 1 {
            set_task_count_until_boundary(&mut build, 1);
            let build_side = distribute_plan(build, cfg, query_id, stage_id)?;
            Arc::new(NetworkCoalesceExec::try_new(
                build_side, query_id, *stage_id, 1, 1,
            )?)
        } else {
            // Remove CoalescePartitionsExec since we want multiple partitions flowing through
            // BroadcastExec. Coalescing happens on the consumer side.
            let build_without_coalesce = unwrap_coalesce_partitions(build);
            let build_task_count = build_without_coalesce.task_count.as_usize();
            let build_side = distribute_plan(build_without_coalesce, cfg, query_id, stage_id)?;
            let broadcast_exec = Arc::new(BroadcastExec::new(build_side, parent_task_count));

            let network_broadcast = Arc::new(NetworkBroadcastExec::try_new(
                broadcast_exec,
                query_id,
                *stage_id,
                build_task_count,
            )?);
            // Add CoalescePartitionsExec above the network boundary on the consumer side.
            Arc::new(CoalescePartitionsExec::new(network_broadcast))
        };
        stage_id.add_assign(1);

        let probe_side = distribute_plan(probe, cfg, query_id, stage_id)?;
        return annotated_plan
            .plan
            .with_new_children(vec![build_child, probe_side]);
    }

    let max_child_task_count = children.iter().map(|v| v.task_count.as_usize()).max();
    let new_children = children
        .into_iter()
        .map(|child| distribute_plan(child, cfg, query_id, stage_id))
@@ -119,39 +156,60 @@ fn distribute_plan(
        return annotated_plan.plan.with_new_children(new_children);
    }

    // If the current node has a RepartitionExec below, it needs a shuffle, so put one
    // NetworkShuffleExec boundary in between the RepartitionExec and the current node.
    if nb_req == RequiredNetworkBoundary::Shuffle {
        let new_child = Arc::new(NetworkShuffleExec::try_new(
            require_one_child(new_children)?,
            query_id,
            *stage_id,
            parent_task_count,
            max_child_task_count.unwrap_or(1),
        )?);
        stage_id.add_assign(1);
        return annotated_plan.plan.with_new_children(vec![new_child]);
    match nb_req {
        // If the current node has a RepartitionExec below, it needs a shuffle, so put one
        // NetworkShuffleExec boundary in between the RepartitionExec and the current node.
        RequiredNetworkBoundary::Shuffle => {
            let new_child = Arc::new(NetworkShuffleExec::try_new(
                require_one_child(new_children)?,
                query_id,
                *stage_id,
                parent_task_count,
                max_child_task_count.unwrap_or(1),
            )?);
            stage_id.add_assign(1);
            annotated_plan.plan.with_new_children(vec![new_child])
        }
        // If this is a CoalescePartitionsExec or a SortPreservingMergeExec, it means that the original
        // plan is trying to merge all partitions into one. We need to go one step further and also merge
        // all distributed tasks into one.
        RequiredNetworkBoundary::Coalesce => {
            let new_child = Arc::new(NetworkCoalesceExec::try_new(
                require_one_child(new_children)?,
                query_id,
                *stage_id,
                parent_task_count,
                max_child_task_count.unwrap_or(1),
            )?);
            stage_id.add_assign(1);
            annotated_plan.plan.with_new_children(vec![new_child])
        }
        RequiredNetworkBoundary::Broadcast => unreachable!("handled above"),
    }
}

    // If this is a CoalescePartitionsExec or a SortPreservingMergeExec, it means that the original
    // plan is trying to merge all partitions into one. We need to go one step further and also merge
    // all distributed tasks into one.
    if nb_req == RequiredNetworkBoundary::Coalesce {
        let new_child = Arc::new(NetworkCoalesceExec::try_new(
            require_one_child(new_children)?,
            query_id,
            *stage_id,
            parent_task_count,
            max_child_task_count.unwrap_or(1),
        )?);
        stage_id.add_assign(1);
        return annotated_plan.plan.with_new_children(vec![new_child]);
fn set_task_count_until_boundary(plan: &mut AnnotatedPlan, task_count: usize) {
    plan.task_count = TaskCountAnnotation::Desired(task_count);
    if plan.required_network_boundary.is_none() {
        for child in &mut plan.children {
            set_task_count_until_boundary(child, task_count);
        }
    }
}

    internal_err!(
        "Unreachable code reached in distribute_plan. Could not determine how to place a network boundary below {}",
        annotated_plan.plan.name()
    )
/// Unwraps [CoalescePartitionsExec] if present and returns its child.
fn unwrap_coalesce_partitions(mut plan: AnnotatedPlan) -> AnnotatedPlan {
    if plan
        .plan
        .as_any()
        .downcast_ref::<CoalescePartitionsExec>()
        .is_some()
        && !plan.children.is_empty()
    {
        plan.children.remove(0)
    } else {
        plan
    }
}

/// Rearranges the [CoalesceBatchesExec] nodes in the plan so that they are placed right below
@@ -426,11 +484,23 @@ mod tests {
    })
    .await;
    assert_snapshot!(plan, @r"
    CoalesceBatchesExec: target_batch_size=8192
    HashJoinExec: mode=CollectLeft, join_type=Left, on=[(RainToday@1, RainToday@1)], projection=[MinTemp@0, MaxTemp@2]
    CoalescePartitionsExec
    DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[MinTemp, RainToday], file_type=parquet
    DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[MaxTemp, RainToday], file_type=parquet
    ┌───── DistributedExec ── Tasks: t0:[p0]
    │ CoalescePartitionsExec
    │ [Stage 2] => NetworkCoalesceExec: output_partitions=3, input_tasks=3
    └──────────────────────────────────────────────────
    ┌───── Stage 2 ── Tasks: t0:[p0] t1:[p1] t2:[p2]
    │ CoalesceBatchesExec: target_batch_size=8192
    │ HashJoinExec: mode=CollectLeft, join_type=Left, on=[(RainToday@1, RainToday@1)], projection=[MinTemp@0, MaxTemp@2]
    │ CoalescePartitionsExec
    │ [Stage 1] => NetworkBroadcastExec: partitions_per_consumer=1, stage_partitions=3, input_tasks=3
    │ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
    │ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[MaxTemp, RainToday], file_type=parquet
    └──────────────────────────────────────────────────
    ┌───── Stage 1 ── Tasks: t0:[p0..p2] t1:[p3..p5] t2:[p6..p8]
    │ BroadcastExec: input_partitions=1, consumer_tasks=3, output_partitions=3
    │ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
    │ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[MinTemp, RainToday], file_type=parquet
    └──────────────────────────────────────────────────
    ");
}

@@ -468,28 +538,31 @@ mod tests {
    │ CoalescePartitionsExec
    │ CoalesceBatchesExec: target_batch_size=8192
    │ HashJoinExec: mode=CollectLeft, join_type=Left, on=[(RainTomorrow@1, RainTomorrow@1)], projection=[MinTemp@0, MaxTemp@2]
    │ CoalescePartitionsExec
    │ [Stage 2] => NetworkCoalesceExec: output_partitions=8, input_tasks=2
    │ [Stage 3] => NetworkCoalesceExec: output_partitions=1, input_tasks=1
    │ ProjectionExec: expr=[avg(weather.MaxTemp)@1 as MaxTemp, RainTomorrow@0 as RainTomorrow]
    │ AggregateExec: mode=FinalPartitioned, gby=[RainTomorrow@0 as RainTomorrow], aggr=[avg(weather.MaxTemp)]
    │ [Stage 3] => NetworkShuffleExec: output_partitions=4, input_tasks=3
    │ [Stage 4] => NetworkShuffleExec: output_partitions=4, input_tasks=3
    └──────────────────────────────────────────────────
    ┌───── Stage 2 ── Tasks: t0:[p0..p3] t1:[p0..p3]
    │ ProjectionExec: expr=[avg(weather.MinTemp)@1 as MinTemp, RainTomorrow@0 as RainTomorrow]
    │ AggregateExec: mode=FinalPartitioned, gby=[RainTomorrow@0 as RainTomorrow], aggr=[avg(weather.MinTemp)]
    │ [Stage 1] => NetworkShuffleExec: output_partitions=4, input_tasks=3
    ┌───── Stage 3 ── Tasks: t0:[p0]
    │ CoalescePartitionsExec
    │ [Stage 2] => NetworkCoalesceExec: output_partitions=8, input_tasks=2
    └──────────────────────────────────────────────────
    ┌───── Stage 1 ── Tasks: t0:[p0..p7] t1:[p0..p7] t2:[p0..p7]
    │ CoalesceBatchesExec: target_batch_size=8192
    │ RepartitionExec: partitioning=Hash([RainTomorrow@0], 8), input_partitions=4
    │ AggregateExec: mode=Partial, gby=[RainTomorrow@1 as RainTomorrow], aggr=[avg(weather.MinTemp)]
    │ CoalesceBatchesExec: target_batch_size=8192
    │ FilterExec: RainToday@1 = yes, projection=[MinTemp@0, RainTomorrow@2]
    │ RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
    │ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
    │ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[MinTemp, RainToday, RainTomorrow], file_type=parquet, predicate=RainToday@1 = yes, pruning_predicate=RainToday_null_count@2 != row_count@3 AND RainToday_min@0 <= yes AND yes <= RainToday_max@1, required_guarantees=[RainToday in (yes)]
    ┌───── Stage 2 ── Tasks: t0:[p0..p3] t1:[p0..p3]
    │ ProjectionExec: expr=[avg(weather.MinTemp)@1 as MinTemp, RainTomorrow@0 as RainTomorrow]
    │ AggregateExec: mode=FinalPartitioned, gby=[RainTomorrow@0 as RainTomorrow], aggr=[avg(weather.MinTemp)]
    │ [Stage 1] => NetworkShuffleExec: output_partitions=4, input_tasks=3
    └──────────────────────────────────────────────────
    ┌───── Stage 3 ── Tasks: t0:[p0..p3] t1:[p0..p3] t2:[p0..p3]
    ┌───── Stage 1 ── Tasks: t0:[p0..p7] t1:[p0..p7] t2:[p0..p7]
    │ CoalesceBatchesExec: target_batch_size=8192
    │ RepartitionExec: partitioning=Hash([RainTomorrow@0], 8), input_partitions=4
    │ AggregateExec: mode=Partial, gby=[RainTomorrow@1 as RainTomorrow], aggr=[avg(weather.MinTemp)]
    │ CoalesceBatchesExec: target_batch_size=8192
    │ FilterExec: RainToday@1 = yes, projection=[MinTemp@0, RainTomorrow@2]
    │ RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
    │ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
    │ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[MinTemp, RainToday, RainTomorrow], file_type=parquet, predicate=RainToday@1 = yes, pruning_predicate=RainToday_null_count@2 != row_count@3 AND RainToday_min@0 <= yes AND yes <= RainToday_max@1, required_guarantees=[RainToday in (yes)]
    └──────────────────────────────────────────────────
    ┌───── Stage 4 ── Tasks: t0:[p0..p3] t1:[p0..p3] t2:[p0..p3]
    │ CoalesceBatchesExec: target_batch_size=8192
    │ RepartitionExec: partitioning=Hash([RainTomorrow@0], 4), input_partitions=4
    │ AggregateExec: mode=Partial, gby=[RainTomorrow@1 as RainTomorrow], aggr=[avg(weather.MaxTemp)]

Review discussion:

Reviewer: Any chance of handling this as a normal `if nb_req == RequiredNetworkBoundary::* { ... }` statement at the end of this function, like the other network boundaries?

Also, one improvement that comes to mind for consistency with the other network boundaries is to wrap all the `NetworkBroadcastExec` instantiation logic inside `NetworkBroadcastExec::try_new` instead of doing it here. For example, note how in `NetworkShuffleExec::try_new` we handle the logic of transforming the child nodes in order to adapt them to shuffles, rather than doing it in this function: https://github.com/datafusion-contrib/datafusion-distributed/blob/main/src/execution_plans/network_shuffle.rs#L151-L151

Do you think we could reach a point where instantiating one `NetworkBroadcastExec` is no different than instantiating any of the other network boundaries?
Author: Refactored this as best I could; let me know if you have more ideas. Some logic still lives here because, unlike the other operators, Broadcast relies on the annotated plan.
Author: I could extract the logic into a helper, but that seems like unneeded indirection since the logic only happens once.