Misc improvements to public API

gabotechs · gabotechs · commit c31844bd6d50 · 2025-10-09T11:51:26.000+02:00
diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs
@@ -47,7 +47,7 @@ use datafusion_distributed::test_utils::localhost::{
 };
 use datafusion_distributed::{
     DistributedExt, DistributedPhysicalOptimizerRule, DistributedSessionBuilder,
-    DistributedSessionBuilderContext, NetworkBoundaryExt, Stage,
+    DistributedSessionBuilderContext, NetworkBoundaryExt,
 };
 use log::info;
 use std::fs;
diff --git a/src/distributed_physical_optimizer_rule.rs b/src/distributed_physical_optimizer_rule.rs
@@ -1,5 +1,6 @@
-use super::{NetworkShuffleExec, PartitionIsolatorExec, Stage};
+use super::{NetworkShuffleExec, PartitionIsolatorExec};
 use crate::execution_plans::{DistributedExec, NetworkCoalesceExec};
+use crate::stage::Stage;
 use datafusion::common::plan_err;
 use datafusion::common::tree_node::TreeNodeRecursion;
 use datafusion::datasource::source::DataSourceExec;
@@ -232,19 +233,19 @@ impl DistributedPhysicalOptimizerRule {
             };
 
             let stage = loop {
-                let (inner_plan, in_tasks) = dnode.as_ref().to_stage_info(n_tasks)?;
+                let input_stage_info = dnode.as_ref().get_input_stage_info(n_tasks)?;
                 // If the current stage has just 1 task, and the next stage is only going to have
                 // 1 task, there's no point in having a network boundary in between, they can just
                 // communicate in memory.
-                if n_tasks == 1 && in_tasks == 1 {
+                if n_tasks == 1 && input_stage_info.task_count == 1 {
                     let mut n = dnode.as_ref().rollback()?;
                     if let Some(node) = n.as_any().downcast_ref::<PartitionIsolatorExec>() {
                         // Also trim PartitionIsolatorExec out of the plan.
                         n = Arc::clone(node.children().first().unwrap());
                     }
                     return Ok(Transformed::yes(n));
                 }
-                match Self::_distribute_plan_inner(query_id, inner_plan.clone(), num, depth + 1, in_tasks) {
+                match Self::_distribute_plan_inner(query_id, input_stage_info.plan, num, depth + 1, input_stage_info.task_count) {
                     Ok(v) => break v,
                     Err(e) => match get_distribute_plan_err(&e) {
                         None => return Err(e),
@@ -253,7 +254,7 @@ impl DistributedPhysicalOptimizerRule {
                             // that no more than `limit` tasks can be used for it, so we are going
                             // to limit the amount of tasks to the requested number and try building
                             // the stage again.
-                            if in_tasks == *limit {
+                            if input_stage_info.task_count == *limit {
                                 return plan_err!("A node requested {limit} tasks for the stage its in, but that stage already has that many tasks");
                             }
                             dnode = Referenced::Arced(dnode.as_ref().with_input_task_count(*limit)?);
@@ -278,14 +279,27 @@ impl DistributedPhysicalOptimizerRule {
     }
 }
 
+/// Necessary information for building a [Stage] during distributed planning.
+///
+/// [NetworkBoundary]s return this piece of data so that the distributed planner knows how to
+/// build the next [Stage] from which the [NetworkBoundary] is going to receive data.
+///
+/// Some network boundaries might apply modifications to their children, like scaling
+/// up the number of partitions, or injecting a specific [ExecutionPlan] on top.
+pub struct InputStageInfo {
+    /// The head plan of the [Stage] that is about to be built.
+    pub plan: Arc<dyn ExecutionPlan>,
+    /// The number of tasks the [Stage] will have.
+    pub task_count: usize,
+}
+
 /// This trait represents a node that introduces the necessity of a network boundary in the plan.
 /// The distributed planner, upon stepping into one of these, will break the plan and build a stage
 /// out of it.
 pub trait NetworkBoundary: ExecutionPlan {
-    /// Returns the information necessary for building the next stage.
-    /// - The head node of the stage.
-    /// - the amount of tasks that stage will have.
-    fn to_stage_info(&self, n_tasks: usize) -> Result<(Arc<dyn ExecutionPlan>, usize)>;
+    /// Returns the information necessary for building the next stage from which this
+    /// [NetworkBoundary] is going to collect data.
+    fn get_input_stage_info(&self, task_count: usize) -> Result<InputStageInfo>;
 
     /// re-assigns a different number of input tasks to the current [NetworkBoundary].
     ///
@@ -295,6 +309,8 @@ pub trait NetworkBoundary: ExecutionPlan {
 
     /// Called when a [Stage] is correctly formed. The [NetworkBoundary] can use this
     /// information to perform any internal transformations necessary for distributed execution.
+    ///
+    /// Typically, [NetworkBoundary]s will use this call for transitioning from "Pending" to "Ready".
     fn with_input_stage(&self, input_stage: Stage) -> Result<Arc<dyn ExecutionPlan>>;
 
     /// Returns the assigned input [Stage], if any.
diff --git a/src/execution_plans/distributed.rs b/src/execution_plans/distributed.rs
@@ -2,7 +2,7 @@ use crate::channel_resolver_ext::get_distributed_channel_resolver;
 use crate::distributed_physical_optimizer_rule::NetworkBoundaryExt;
 use crate::execution_plans::common::require_one_child;
 use crate::protobuf::DistributedCodec;
-use crate::{ExecutionTask, Stage};
+use crate::stage::{ExecutionTask, Stage};
 use datafusion::common::exec_err;
 use datafusion::common::tree_node::{Transformed, TreeNode};
 use datafusion::execution::{SendableRecordBatchStream, TaskContext};
diff --git a/src/execution_plans/network_coalesce.rs b/src/execution_plans/network_coalesce.rs
@@ -1,13 +1,15 @@
 use crate::channel_resolver_ext::get_distributed_channel_resolver;
 use crate::config_extension_ext::ContextGrpcMetadata;
-use crate::distributed_physical_optimizer_rule::{NetworkBoundary, limit_tasks_err};
+use crate::distributed_physical_optimizer_rule::{
+    InputStageInfo, NetworkBoundary, limit_tasks_err,
+};
 use crate::execution_plans::common::{require_one_child, scale_partitioning_props};
 use crate::flight_service::DoGet;
 use crate::metrics::MetricsCollectingStream;
 use crate::metrics::proto::MetricsSetProto;
 use crate::protobuf::{StageKey, map_flight_to_datafusion_error, map_status_to_datafusion_error};
-use crate::stage::MaybeEncodedPlan;
-use crate::{ChannelResolver, DistributedTaskContext, Stage};
+use crate::stage::{MaybeEncodedPlan, Stage};
+use crate::{ChannelResolver, DistributedTaskContext};
 use arrow_flight::Ticket;
 use arrow_flight::decode::FlightRecordBatchStream;
 use arrow_flight::error::FlightError;
@@ -115,10 +117,7 @@ impl NetworkCoalesceExec {
 }
 
 impl NetworkBoundary for NetworkCoalesceExec {
-    fn to_stage_info(
-        &self,
-        n_tasks: usize,
-    ) -> Result<(Arc<dyn ExecutionPlan>, usize), DataFusionError> {
+    fn get_input_stage_info(&self, n_tasks: usize) -> Result<InputStageInfo, DataFusionError> {
         let Self::Pending(pending) = self else {
             return plan_err!("can only return wrapped child if on Pending state");
         };
@@ -128,7 +127,10 @@ impl NetworkBoundary for NetworkCoalesceExec {
             return Err(limit_tasks_err(1));
         }
 
-        Ok((Arc::clone(&pending.input), pending.input_tasks))
+        Ok(InputStageInfo {
+            plan: Arc::clone(&pending.input),
+            task_count: pending.input_tasks,
+        })
     }
 
     fn with_input_stage(
diff --git a/src/execution_plans/network_shuffle.rs b/src/execution_plans/network_shuffle.rs
@@ -1,14 +1,14 @@
 use crate::channel_resolver_ext::get_distributed_channel_resolver;
 use crate::config_extension_ext::ContextGrpcMetadata;
-use crate::distributed_physical_optimizer_rule::NetworkBoundary;
+use crate::distributed_physical_optimizer_rule::{InputStageInfo, NetworkBoundary};
 use crate::execution_plans::common::{require_one_child, scale_partitioning};
 use crate::flight_service::DoGet;
 use crate::metrics::MetricsCollectingStream;
 use crate::metrics::proto::MetricsSetProto;
 use crate::protobuf::StageKey;
 use crate::protobuf::{map_flight_to_datafusion_error, map_status_to_datafusion_error};
-use crate::stage::MaybeEncodedPlan;
-use crate::{ChannelResolver, DistributedTaskContext, Stage};
+use crate::stage::{MaybeEncodedPlan, Stage};
+use crate::{ChannelResolver, DistributedTaskContext};
 use arrow_flight::Ticket;
 use arrow_flight::decode::FlightRecordBatchStream;
 use arrow_flight::error::FlightError;
@@ -168,10 +168,7 @@ impl NetworkShuffleExec {
 }
 
 impl NetworkBoundary for NetworkShuffleExec {
-    fn to_stage_info(
-        &self,
-        n_tasks: usize,
-    ) -> Result<(Arc<dyn ExecutionPlan>, usize), DataFusionError> {
+    fn get_input_stage_info(&self, n_tasks: usize) -> Result<InputStageInfo, DataFusionError> {
         let Self::Pending(pending) = self else {
             return plan_err!("cannot only return wrapped child if on Pending state");
         };
@@ -186,7 +183,10 @@ impl NetworkBoundary for NetworkShuffleExec {
             scale_partitioning(r_exe.partitioning(), |p| p * n_tasks),
         )?);
 
-        Ok((next_stage_plan, pending.input_tasks))
+        Ok(InputStageInfo {
+            plan: next_stage_plan,
+            task_count: pending.input_tasks,
+        })
     }
 
     fn with_input_task_count(
diff --git a/src/flight_service/do_get.rs b/src/flight_service/do_get.rs
@@ -209,8 +209,8 @@ fn collect_and_create_metrics_flight_data(
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::ExecutionTask;
     use crate::flight_service::session_builder::DefaultSessionBuilder;
+    use crate::stage::ExecutionTask;
     use arrow::datatypes::{Schema, SchemaRef};
     use arrow_flight::Ticket;
     use datafusion::physical_expr::Partitioning;
diff --git a/src/lib.rs b/src/lib.rs
@@ -17,7 +17,7 @@ pub mod test_utils;
 pub use channel_resolver_ext::{BoxCloneSyncChannel, ChannelResolver};
 pub use distributed_ext::DistributedExt;
 pub use distributed_physical_optimizer_rule::{
-    DistributedPhysicalOptimizerRule, NetworkBoundaryExt,
+    DistributedPhysicalOptimizerRule, InputStageInfo, NetworkBoundary, NetworkBoundaryExt,
 };
 pub use execution_plans::{
     DistributedExec, NetworkCoalesceExec, NetworkShuffleExec, PartitionIsolatorExec,
diff --git a/src/protobuf/distributed_codec.rs b/src/protobuf/distributed_codec.rs
@@ -1,8 +1,8 @@
 use super::get_distributed_user_codecs;
 use crate::distributed_physical_optimizer_rule::NetworkBoundary;
 use crate::execution_plans::{NetworkCoalesceExec, NetworkCoalesceReady, NetworkShuffleReadyExec};
-use crate::stage::MaybeEncodedPlan;
-use crate::{ExecutionTask, NetworkShuffleExec, PartitionIsolatorExec, Stage};
+use crate::stage::{ExecutionTask, MaybeEncodedPlan, Stage};
+use crate::{NetworkShuffleExec, PartitionIsolatorExec};
 use bytes::Bytes;
 use datafusion::arrow::datatypes::Schema;
 use datafusion::arrow::datatypes::SchemaRef;
diff --git a/src/stage.rs b/src/stage.rs
@@ -72,11 +72,11 @@ use uuid::Uuid;
 #[derive(Debug, Clone)]
 pub struct Stage {
     /// Our query_id
-    pub query_id: Uuid,
+    pub(crate) query_id: Uuid,
     /// Our stage number
-    pub num: usize,
+    pub(crate) num: usize,
     /// The physical execution plan that this stage will execute.
-    pub plan: MaybeEncodedPlan,
+    pub(crate) plan: MaybeEncodedPlan,
     /// Our tasks which tell us how finely grained to execute the partitions in
     /// the plan
     pub tasks: Vec<ExecutionTask>,
@@ -86,15 +86,15 @@ pub struct Stage {
 pub struct ExecutionTask {
     /// The url of the worker that will execute this task.  A None value is interpreted as
     /// unassigned.
-    pub url: Option<Url>,
+    pub(crate) url: Option<Url>,
 }
 
 /// An [ExecutionPlan] that can be either:
 /// - Decoded: the inner [ExecutionPlan] is stored as-is.
 /// - Encoded: the inner [ExecutionPlan] is stored as protobuf [Bytes]. Storing it this way allow us
 ///   to thread it through the project and eventually send it through gRPC in a zero copy manner.
 #[derive(Debug, Clone)]
-pub enum MaybeEncodedPlan {
+pub(crate) enum MaybeEncodedPlan {
     /// The decoded [ExecutionPlan].
     Decoded(Arc<dyn ExecutionPlan>),
     /// A protobuf encoded version of the [ExecutionPlan]. The inner [Bytes] represent the full
@@ -106,7 +106,7 @@ pub enum MaybeEncodedPlan {
 }
 
 impl MaybeEncodedPlan {
-    pub fn to_encoded(&self, codec: &dyn PhysicalExtensionCodec) -> Result<Self> {
+    pub(crate) fn to_encoded(&self, codec: &dyn PhysicalExtensionCodec) -> Result<Self> {
         Ok(match self {
             Self::Decoded(plan) => Self::Encoded(
                 PhysicalPlanNode::try_from_physical_plan(Arc::clone(plan), codec)?
@@ -117,14 +117,14 @@ impl MaybeEncodedPlan {
         })
     }
 
-    pub fn decoded(&self) -> Result<&Arc<dyn ExecutionPlan>> {
+    pub(crate) fn decoded(&self) -> Result<&Arc<dyn ExecutionPlan>> {
         match self {
             MaybeEncodedPlan::Decoded(v) => Ok(v),
             MaybeEncodedPlan::Encoded(_) => plan_err!("Expected plan to be in a decoded state"),
         }
     }
 
-    pub fn encoded(&self) -> Result<&Bytes> {
+    pub(crate) fn encoded(&self) -> Result<&Bytes> {
         match self {
             MaybeEncodedPlan::Decoded(_) => plan_err!("Expected plan to be in a encoded state"),
             MaybeEncodedPlan::Encoded(v) => Ok(v),
diff --git a/src/test_utils/plans.rs b/src/test_utils/plans.rs
@@ -7,7 +7,8 @@ use std::sync::Arc;
 
 use crate::distributed_physical_optimizer_rule::NetworkBoundaryExt;
 use crate::execution_plans::DistributedExec;
-use crate::{Stage, protobuf::StageKey};
+use crate::protobuf::StageKey;
+use crate::stage::Stage;
 
 /// count_plan_nodes counts the number of execution plan nodes in a plan using BFS traversal.
 /// This does NOT traverse child stages, only the execution plan tree within this stage.