Merged

Changes from 2 commits
6 changes: 3 additions & 3 deletions benchmarks/src/tpch/run.rs
@@ -47,7 +47,7 @@ use datafusion_distributed::test_utils::localhost::{
};
use datafusion_distributed::{
DistributedExt, DistributedPhysicalOptimizerRule, DistributedSessionBuilder,
DistributedSessionBuilderContext, StageExec,
DistributedSessionBuilderContext, NetworkBoundaryExt, Stage,
};
use log::info;
use std::fs;
@@ -331,8 +331,8 @@ impl RunOpt {
}
let mut n_tasks = 0;
physical_plan.clone().transform_down(|node| {
if let Some(node) = node.as_any().downcast_ref::<StageExec>() {
n_tasks += node.tasks.len()
if let Some(node) = node.as_network_boundary() {
n_tasks += node.input_stage().map(|v| v.tasks.len()).unwrap_or(0)
}
Ok(Transformed::no(node))
})?;
13 changes: 13 additions & 0 deletions src/common/execution_plan_ops.rs
@@ -0,0 +1,13 @@
use datafusion::common::plan_err;
use datafusion::error::DataFusionError;
use datafusion::physical_plan::ExecutionPlan;
use std::sync::Arc;

pub fn one_child(
Collaborator:
nit: a doc comment would be nice
nit: would src/common/execution_plan.rs or src/execution_plans/common.rs make more sense?
nit: maybe require_one_child makes more sense

Collaborator (author):
src/execution_plans/common.rs sounds like a good place for helpers that are used in more than one plan, but that are scoped to src/execution_plans.

Followed your suggestion with this, and with a couple of other helpers that were also only used in src/execution_plans.

children: &[Arc<dyn ExecutionPlan>],
) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
if children.len() != 1 {
return plan_err!("Expected exactly 1 child, got {}", children.len());
}
Ok(children[0].clone())
}
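
A minimal usage sketch of this helper (a hypothetical snippet, not part of the diff; it assumes a recent DataFusion where EmptyExec::new takes just a schema):

use crate::common::one_child; // as re-exported from src/common/mod.rs in this diff
use datafusion::arrow::datatypes::Schema;
use datafusion::physical_plan::empty::EmptyExec;
use datafusion::physical_plan::ExecutionPlan;
use std::sync::Arc;

fn demo() -> datafusion::common::Result<()> {
    let child: Arc<dyn ExecutionPlan> = Arc::new(EmptyExec::new(Arc::new(Schema::empty())));
    // Exactly one child: the child itself is returned.
    let _only = one_child(&[Arc::clone(&child)])?;
    // Zero children or more than one child: a plan error is returned instead.
    assert!(one_child(&[]).is_err());
    assert!(one_child(&[Arc::clone(&child), child]).is_err());
    Ok(())
}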
2 changes: 2 additions & 0 deletions src/common/mod.rs
@@ -1,7 +1,9 @@
mod callback_stream;
mod execution_plan_ops;
mod partitioning;
#[allow(unused)]
pub mod ttl_map;

pub(crate) use callback_stream::with_callback;
pub(crate) use execution_plan_ops::*;
pub(crate) use partitioning::{scale_partitioning, scale_partitioning_props};
177 changes: 98 additions & 79 deletions src/distributed_physical_optimizer_rule.rs

Large diffs are not rendered by default.

126 changes: 126 additions & 0 deletions src/execution_plans/distributed.rs
@@ -0,0 +1,126 @@
use crate::channel_resolver_ext::get_distributed_channel_resolver;
use crate::common::one_child;
use crate::distributed_physical_optimizer_rule::NetworkBoundaryExt;
use crate::protobuf::DistributedCodec;
use crate::{ExecutionTask, Stage};
use datafusion::common::exec_err;
use datafusion::common::tree_node::{Transformed, TreeNode};
use datafusion::execution::{SendableRecordBatchStream, TaskContext};
use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties};
use datafusion_proto::physical_plan::PhysicalExtensionCodec;
use rand::Rng;
use std::any::Any;
use std::fmt::Formatter;
use std::sync::Arc;
use url::Url;

/// [ExecutionPlan] that executes the inner plan in distributed mode.
/// Before executing it, two modifications are lazily performed on the plan:
/// 1. Assigns worker URLs to all the stages. A random set of URLs is sampled from the
/// channel resolver and assigned to each task in each stage.
/// 2. Encodes all the plans in protobuf format so that network boundary nodes can send them
/// over the wire.
#[derive(Debug, Clone)]
pub struct DistributedExec {
pub plan: Arc<dyn ExecutionPlan>,
}

impl DistributedExec {
pub fn new(plan: Arc<dyn ExecutionPlan>) -> Self {
Self { plan }
}

fn prepare_plan(
&self,
urls: &[Url],
codec: &dyn PhysicalExtensionCodec,
) -> datafusion::common::Result<Arc<dyn ExecutionPlan>> {
let prepared = Arc::clone(&self.plan).transform_up(|plan| {
Collaborator:
Can this be done during planning? Totally fine to do as a follow-up. Curious why we do it here.

Collaborator (author):
Here we do two things:

  1. URL assignment
  2. Stage plan encoding

We want to do 1) as late as possible: if we did it during planning, the assigned URLs might no longer be valid by the time we execute the plan. By making the URL assignment happen as close to execution as possible, we reduce the risk of workers no longer being available. This is also how it was done previously; it's not really something new.

We want to do 2) right before execution because plans in the "encoded" state are not visible or inspectable, whereas with the plan still at hand we can display it and so on.

(A standalone sketch of the URL assignment appears after this file's diff.)

let Some(plan) = plan.as_network_boundary() else {
return Ok(Transformed::no(plan));
};

let mut rng = rand::thread_rng();
let start_idx = rng.gen_range(0..urls.len());

let Some(stage) = plan.input_stage() else {
return exec_err!(
"NetworkBoundary '{}' has not been assigned a stage",
plan.name()
);
};

let ready_stage = Stage {
query_id: stage.query_id,
num: stage.num,
plan: stage.plan.to_encoded(codec)?,
tasks: stage
.tasks
.iter()
.enumerate()
.map(|(i, _)| ExecutionTask {
url: Some(urls[(start_idx + i) % urls.len()].clone()),
})
.collect::<Vec<_>>(),
};

Ok(Transformed::yes(plan.with_input_stage(ready_stage)?))
})?;
Ok(prepared.data)
}
}

impl DisplayAs for DistributedExec {
fn fmt_as(&self, _: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result {
write!(f, "DistributedExec")
}
}

impl ExecutionPlan for DistributedExec {
fn name(&self) -> &str {
"DistributedExec"
}

fn as_any(&self) -> &dyn Any {
self
}

fn properties(&self) -> &PlanProperties {
self.plan.properties()
}

fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
vec![&self.plan]
}

fn with_new_children(
self: Arc<Self>,
children: Vec<Arc<dyn ExecutionPlan>>,
) -> datafusion::common::Result<Arc<dyn ExecutionPlan>> {
Ok(Arc::new(DistributedExec {
plan: one_child(&children)?,
}))
}

fn execute(
&self,
partition: usize,
context: Arc<TaskContext>,
) -> datafusion::common::Result<SendableRecordBatchStream> {
if partition > 0 {
// The DistributedExec node prepares the plan lazily (URL assignment + plan encoding in
// prepare_plan) upon calling .execute(). This means .execute() must only be called once:
// executing multiple partitions would perform several random URL assignments that differ
// from each other, producing an invalid plan.
return exec_err!(
"DistributedExec must only have 1 partition, but it was called with partition index {partition}"
);
}

let channel_resolver = get_distributed_channel_resolver(context.session_config())?;
let codec = DistributedCodec::new_combined_with_user(context.session_config());

let plan = self.prepare_plan(&channel_resolver.get_urls()?, &codec)?;
plan.execute(partition, context)
}
}
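
To make the review discussion above concrete, this is a standalone sketch of the round-robin URL assignment that prepare_plan performs right before execution (assign_urls is a hypothetical free function, not part of the diff; like the real code, it assumes a non-empty list of worker URLs):

use rand::Rng;
use url::Url;

// One worker URL per task, starting at a random offset so that repeated queries
// spread their stage tasks across the available workers.
fn assign_urls(urls: &[Url], n_tasks: usize) -> Vec<Url> {
    let start_idx = rand::thread_rng().gen_range(0..urls.len());
    (0..n_tasks)
        .map(|i| urls[(start_idx + i) % urls.len()].clone())
        .collect()
}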
6 changes: 2 additions & 4 deletions src/execution_plans/mod.rs
@@ -1,13 +1,11 @@
mod distributed;
mod metrics;
mod network_coalesce;
mod network_shuffle;
mod partition_isolator;
mod stage;

pub use distributed::DistributedExec;
pub use metrics::MetricsWrapperExec;
pub use network_coalesce::{NetworkCoalesceExec, NetworkCoalesceReady};
pub use network_shuffle::{NetworkShuffleExec, NetworkShuffleReadyExec};
pub use partition_isolator::PartitionIsolatorExec;
pub(crate) use stage::InputStage;
pub use stage::display_plan_graphviz;
pub use stage::{DistributedTaskContext, ExecutionTask, StageExec};