
Commit 92cfe97

Add CoalesceBatchesExec in network shuffles
1 parent 4867a18 commit 92cfe97

File tree

8 files changed, +998 -881 lines changed


src/distributed_planner/distributed_config.rs

Lines changed: 7 additions & 0 deletions
@@ -24,6 +24,13 @@ extensions_options! {
     /// - If a node reduces the cardinality of the data, this factor will decrease.
     /// - In any other situation, this factor is left intact.
     pub cardinality_task_count_factor: f64, default = cardinality_task_count_factor_default()
+    /// Upon shuffling over the network, data streams need to be disassembled into a lot of output
+    /// partitions, which means the resulting streams might contain a lot of tiny record batches
+    /// to be sent over the wire. This parameter controls the batch size, in number of rows, for
+    /// the CoalesceBatchesExec operator placed at the top of the stage so that bigger batches
+    /// are sent over the wire.
+    /// If set to 0, batch coalescing is disabled on network shuffle operations.
+    pub shuffle_batch_size: usize, default = 8192
     /// Collection of [TaskEstimator]s that will be applied to leaf nodes in order to
     /// estimate how many tasks should be spawned for the [Stage] containing the leaf node.
     pub(crate) __private_task_estimator: CombinedTaskEstimator, default = CombinedTaskEstimator::default()
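
For context on how the new knob is meant to behave: a nonzero value coalesces record batches up to that many rows before they are shipped, and 0 turns coalescing off entirely. Below is a minimal sketch of that guard, using a hypothetical DistributedConfig stand-in for the struct generated by extensions_options! above (the real struct name is not visible in this hunk):

use std::sync::Arc;

use datafusion::physical_plan::ExecutionPlan;
use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec;

// Hypothetical stand-in for the options struct generated by `extensions_options!`;
// only the new field is modeled here.
struct DistributedConfig {
    shuffle_batch_size: usize,
}

impl Default for DistributedConfig {
    fn default() -> Self {
        // Mirrors the `default = 8192` declared in the diff above.
        Self { shuffle_batch_size: 8192 }
    }
}

/// Wrap a stage's plan in a CoalesceBatchesExec sized by the option, or leave the
/// plan untouched when coalescing is disabled (shuffle_batch_size == 0).
fn maybe_coalesce(plan: Arc<dyn ExecutionPlan>, cfg: &DistributedConfig) -> Arc<dyn ExecutionPlan> {
    if cfg.shuffle_batch_size == 0 {
        return plan;
    }
    Arc::new(CoalesceBatchesExec::new(plan, cfg.shuffle_batch_size))
}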

src/distributed_planner/distributed_physical_optimizer_rule.rs

Lines changed: 54 additions & 39 deletions
@@ -12,6 +12,7 @@ use datafusion::common::tree_node::TreeNodeRecursion;
 use datafusion::error::DataFusionError;
 use datafusion::physical_expr::Partitioning;
 use datafusion::physical_plan::ExecutionPlanProperties;
+use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec;
 use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
 use datafusion::physical_plan::execution_plan::CardinalityEffect;
 use datafusion::physical_plan::joins::{HashJoinExec, PartitionMode};
@@ -300,6 +301,13 @@ fn _apply_network_boundaries(
         return Ok(ctx);
     }
     let task_count = ctx.scale_task_count_and_swap()?;
+    // Network shuffles imply partitioning each data stream into a lot of different partitions,
+    // which means that each resulting stream might contain tiny batches. It's important to
+    // have decently sized batches here, as they will ultimately be sent over the wire, and
+    // the penalty for sending many tiny batches instead of a few big ones is high.
+    if d_cfg.shuffle_batch_size > 0 {
+        ctx.plan = Arc::new(CoalesceBatchesExec::new(ctx.plan, d_cfg.shuffle_batch_size));
+    }
     ctx.plan = Arc::new(NetworkShuffleExec::try_new(ctx.plan, task_count)?);
     return Ok(ctx);
 }
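
To make the arithmetic behind that comment concrete: a single 8192-row batch hashed into, say, 8 partitions across 4 input tasks fans out into 32 streams of roughly 256 rows each, and every one of those slivers would otherwise travel as its own Flight message. A rough, self-contained illustration using plain Arrow APIs (this is not code from the commit, just the shape of the fragmentation that CoalesceBatchesExec undoes):

use std::sync::Arc;

use datafusion::arrow::array::{ArrayRef, Int64Array};
use datafusion::arrow::compute::concat_batches;
use datafusion::arrow::record_batch::RecordBatch;

fn main() -> datafusion::error::Result<()> {
    // One 8192-row batch entering the shuffle.
    let col: ArrayRef = Arc::new(Int64Array::from_iter_values(0..8192));
    let batch = RecordBatch::try_from_iter(vec![("v", col)])?;

    // Hash-repartitioning into 32 partitions (e.g. 8 partitions x 4 tasks) splits it
    // into ~256-row pieces; an even split is emulated here with slices.
    let pieces: Vec<RecordBatch> = (0..32).map(|i| batch.slice(i * 256, 256)).collect();
    assert!(pieces.iter().all(|b| b.num_rows() == 256));

    // Coalescing buffers such pieces back up towards target_batch_size before they
    // are serialized and sent over the wire.
    let coalesced = concat_batches(&batch.schema(), &pieces)?;
    assert_eq!(coalesced.num_rows(), 8192);
    Ok(())
}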
@@ -552,11 +560,12 @@ mod tests {
 │ [Stage 1] => NetworkShuffleExec: output_partitions=4, input_tasks=3
 └──────────────────────────────────────────────────
 ┌───── Stage 1 ── Tasks: t0:[p0..p7] t1:[p0..p7] t2:[p0..p7]
-│ RepartitionExec: partitioning=Hash([RainToday@0], 8), input_partitions=4
-│ RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-│ AggregateExec: mode=Partial, gby=[RainToday@0 as RainToday], aggr=[count(Int64(1))]
-│ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
-│ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[RainToday], file_type=parquet
+│ CoalesceBatchesExec: target_batch_size=8192
+│ RepartitionExec: partitioning=Hash([RainToday@0], 8), input_partitions=4
+│ RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+│ AggregateExec: mode=Partial, gby=[RainToday@0 as RainToday], aggr=[count(Int64(1))]
+│ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
+│ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[RainToday], file_type=parquet
 └──────────────────────────────────────────────────
 ");
 }
@@ -584,11 +593,12 @@ mod tests {
 │ [Stage 1] => NetworkShuffleExec: output_partitions=4, input_tasks=2
 └──────────────────────────────────────────────────
 ┌───── Stage 1 ── Tasks: t0:[p0..p7] t1:[p0..p7]
-│ RepartitionExec: partitioning=Hash([RainToday@0], 8), input_partitions=4
-│ RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2
-│ AggregateExec: mode=Partial, gby=[RainToday@0 as RainToday], aggr=[count(Int64(1))]
-│ PartitionIsolatorExec: t0:[p0,p1,__] t1:[__,__,p0]
-│ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[RainToday], file_type=parquet
+│ CoalesceBatchesExec: target_batch_size=8192
+│ RepartitionExec: partitioning=Hash([RainToday@0], 8), input_partitions=4
+│ RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2
+│ AggregateExec: mode=Partial, gby=[RainToday@0 as RainToday], aggr=[count(Int64(1))]
+│ PartitionIsolatorExec: t0:[p0,p1,__] t1:[__,__,p0]
+│ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[RainToday], file_type=parquet
 └──────────────────────────────────────────────────
 ");
 }
@@ -638,11 +648,12 @@ mod tests {
 │ [Stage 1] => NetworkShuffleExec: output_partitions=4, input_tasks=3
 └──────────────────────────────────────────────────
 ┌───── Stage 1 ── Tasks: t0:[p0..p3] t1:[p0..p3] t2:[p0..p3]
-│ RepartitionExec: partitioning=Hash([RainToday@0], 4), input_partitions=4
-│ RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-│ AggregateExec: mode=Partial, gby=[RainToday@0 as RainToday], aggr=[count(Int64(1))]
-│ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
-│ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[RainToday], file_type=parquet
+│ CoalesceBatchesExec: target_batch_size=8192
+│ RepartitionExec: partitioning=Hash([RainToday@0], 4), input_partitions=4
+│ RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+│ AggregateExec: mode=Partial, gby=[RainToday@0 as RainToday], aggr=[count(Int64(1))]
+│ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
+│ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[RainToday], file_type=parquet
 └──────────────────────────────────────────────────
 ");
 }
@@ -695,11 +706,12 @@ mod tests {
 │ [Stage 1] => NetworkShuffleExec: output_partitions=4, input_tasks=3
 └──────────────────────────────────────────────────
 ┌───── Stage 1 ── Tasks: t0:[p0..p7] t1:[p0..p7] t2:[p0..p7]
-│ RepartitionExec: partitioning=Hash([RainToday@0], 8), input_partitions=4
-│ RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-│ AggregateExec: mode=Partial, gby=[RainToday@0 as RainToday], aggr=[count(Int64(1))]
-│ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
-│ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[RainToday], file_type=parquet
+│ CoalesceBatchesExec: target_batch_size=8192
+│ RepartitionExec: partitioning=Hash([RainToday@0], 8), input_partitions=4
+│ RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+│ AggregateExec: mode=Partial, gby=[RainToday@0 as RainToday], aggr=[count(Int64(1))]
+│ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
+│ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[RainToday], file_type=parquet
 └──────────────────────────────────────────────────
 ");
 }
@@ -770,22 +782,24 @@ mod tests {
 │ [Stage 1] => NetworkShuffleExec: output_partitions=4, input_tasks=3
 └──────────────────────────────────────────────────
 ┌───── Stage 1 ── Tasks: t0:[p0..p7] t1:[p0..p7] t2:[p0..p7]
-│ RepartitionExec: partitioning=Hash([RainTomorrow@0], 8), input_partitions=4
-│ AggregateExec: mode=Partial, gby=[RainTomorrow@1 as RainTomorrow], aggr=[avg(weather.MinTemp)]
-│ CoalesceBatchesExec: target_batch_size=8192
-│ FilterExec: RainToday@1 = yes, projection=[MinTemp@0, RainTomorrow@2]
-│ RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-│ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
-│ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[MinTemp, RainToday, RainTomorrow], file_type=parquet, predicate=RainToday@1 = yes, pruning_predicate=RainToday_null_count@2 != row_count@3 AND RainToday_min@0 <= yes AND yes <= RainToday_max@1, required_guarantees=[RainToday in (yes)]
+│ CoalesceBatchesExec: target_batch_size=8192
+│ RepartitionExec: partitioning=Hash([RainTomorrow@0], 8), input_partitions=4
+│ AggregateExec: mode=Partial, gby=[RainTomorrow@1 as RainTomorrow], aggr=[avg(weather.MinTemp)]
+│ CoalesceBatchesExec: target_batch_size=8192
+│ FilterExec: RainToday@1 = yes, projection=[MinTemp@0, RainTomorrow@2]
+│ RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+│ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
+│ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[MinTemp, RainToday, RainTomorrow], file_type=parquet, predicate=RainToday@1 = yes, pruning_predicate=RainToday_null_count@2 != row_count@3 AND RainToday_min@0 <= yes AND yes <= RainToday_max@1, required_guarantees=[RainToday in (yes)]
 └──────────────────────────────────────────────────
 ┌───── Stage 3 ── Tasks: t0:[p0..p3] t1:[p0..p3] t2:[p0..p3]
-│ RepartitionExec: partitioning=Hash([RainTomorrow@0], 4), input_partitions=4
-│ AggregateExec: mode=Partial, gby=[RainTomorrow@1 as RainTomorrow], aggr=[avg(weather.MaxTemp)]
-│ CoalesceBatchesExec: target_batch_size=8192
-│ FilterExec: RainToday@1 = no, projection=[MaxTemp@0, RainTomorrow@2]
-│ RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-│ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
-│ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[MaxTemp, RainToday, RainTomorrow], file_type=parquet, predicate=RainToday@1 = no, pruning_predicate=RainToday_null_count@2 != row_count@3 AND RainToday_min@0 <= no AND no <= RainToday_max@1, required_guarantees=[RainToday in (no)]
+│ CoalesceBatchesExec: target_batch_size=8192
+│ RepartitionExec: partitioning=Hash([RainTomorrow@0], 4), input_partitions=4
+│ AggregateExec: mode=Partial, gby=[RainTomorrow@1 as RainTomorrow], aggr=[avg(weather.MaxTemp)]
+│ CoalesceBatchesExec: target_batch_size=8192
+│ FilterExec: RainToday@1 = no, projection=[MaxTemp@0, RainTomorrow@2]
+│ RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+│ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
+│ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[MaxTemp, RainToday, RainTomorrow], file_type=parquet, predicate=RainToday@1 = no, pruning_predicate=RainToday_null_count@2 != row_count@3 AND RainToday_min@0 <= no AND no <= RainToday_max@1, required_guarantees=[RainToday in (no)]
 └──────────────────────────────────────────────────
 ");
 }
@@ -832,11 +846,12 @@ mod tests {
 │ [Stage 1] => NetworkShuffleExec: output_partitions=4, input_tasks=3
 └──────────────────────────────────────────────────
 ┌───── Stage 1 ── Tasks: t0:[p0..p7] t1:[p0..p7] t2:[p0..p7]
-│ RepartitionExec: partitioning=Hash([RainToday@0, WindGustDir@1], 8), input_partitions=4
-│ RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-│ AggregateExec: mode=Partial, gby=[RainToday@0 as RainToday, WindGustDir@1 as WindGustDir], aggr=[]
-│ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
-│ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[RainToday, WindGustDir], file_type=parquet
+│ CoalesceBatchesExec: target_batch_size=8192
+│ RepartitionExec: partitioning=Hash([RainToday@0, WindGustDir@1], 8), input_partitions=4
+│ RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+│ AggregateExec: mode=Partial, gby=[RainToday@0 as RainToday, WindGustDir@1 as WindGustDir], aggr=[]
+│ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
+│ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[RainToday, WindGustDir], file_type=parquet
 └──────────────────────────────────────────────────
 ");
 }

src/execution_plans/network_shuffle.rs

Lines changed: 22 additions & 10 deletions
@@ -13,6 +13,7 @@ use arrow_flight::decode::FlightRecordBatchStream;
 use arrow_flight::error::FlightError;
 use bytes::Bytes;
 use dashmap::DashMap;
+use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion};
 use datafusion::common::{exec_err, internal_datafusion_err, plan_err};
 use datafusion::error::DataFusionError;
 use datafusion::execution::{SendableRecordBatchStream, TaskContext};
@@ -172,18 +173,29 @@ impl NetworkBoundary for NetworkShuffleExec {
             return plan_err!("cannot only return wrapped child if on Pending state");
         };
 
-        // TODO: Avoid downcasting once https://github.com/apache/datafusion/pull/17990 is shipped.
-        let Some(r_exe) = pending.input.as_any().downcast_ref::<RepartitionExec>() else {
-            return plan_err!("NetworkShuffleExec.input must always be RepartitionExec");
-        };
-
-        let next_stage_plan = Arc::new(RepartitionExec::try_new(
-            require_one_child(r_exe.children())?,
-            scale_partitioning(r_exe.partitioning(), |p| p * n_tasks),
-        )?);
+        let transformed = Arc::clone(&pending.input).transform_down(|plan| {
+            if let Some(r_exe) = plan.as_any().downcast_ref::<RepartitionExec>() {
+                // Scale the input RepartitionExec to account for all the tasks to which it will
+                // need to fan data out.
+                let scaled = Arc::new(RepartitionExec::try_new(
+                    require_one_child(r_exe.children())?,
+                    scale_partitioning(r_exe.partitioning(), |p| p * n_tasks),
+                )?);
+                Ok(Transformed::new(scaled, true, TreeNodeRecursion::Stop))
+            } else if matches!(plan.output_partitioning(), Partitioning::Hash(_, _)) {
+                // This might be a passthrough node, like a CoalesceBatchesExec. That is fine;
+                // we can leave the node in place and keep descending.
+                Ok(Transformed::no(plan))
+            } else {
+                return plan_err!(
+                    "NetworkShuffleExec input must be hash partitioned, but {} is not",
+                    plan.name()
+                );
+            }
+        })?;
 
         Ok(InputStageInfo {
-            plan: next_stage_plan,
+            plan: transformed.data,
            task_count: pending.input_tasks,
         })
     }
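
Stepping back, the new transform_down block walks the cached input plan from the top: hash-preserving passthrough nodes (such as the CoalesceBatchesExec now placed at the top of the stage) are left alone, the first RepartitionExec found is rebuilt with its partition count multiplied by the task count, and the traversal stops there. The following is a standalone sketch of that pattern using only upstream DataFusion APIs; the crate-local helpers require_one_child and scale_partitioning are replaced with inline equivalents, so details may differ from the actual implementation:

use std::sync::Arc;

use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion};
use datafusion::error::{DataFusionError, Result};
use datafusion::physical_expr::Partitioning;
use datafusion::physical_plan::repartition::RepartitionExec;
use datafusion::physical_plan::{ExecutionPlan, ExecutionPlanProperties};

/// Multiply the partition count of the first RepartitionExec found below `plan`
/// by `n_tasks`, tolerating hash-partitioned passthrough nodes above it.
fn scale_first_repartition(
    plan: Arc<dyn ExecutionPlan>,
    n_tasks: usize,
) -> Result<Arc<dyn ExecutionPlan>> {
    let transformed = plan.transform_down(|node| {
        if let Some(repart) = node.as_any().downcast_ref::<RepartitionExec>() {
            let scaled_partitioning = match repart.partitioning() {
                Partitioning::Hash(exprs, p) => Partitioning::Hash(exprs.clone(), *p * n_tasks),
                other => other.clone(),
            };
            let scaled: Arc<dyn ExecutionPlan> = Arc::new(RepartitionExec::try_new(
                Arc::clone(repart.input()),
                scaled_partitioning,
            )?);
            // Only the topmost RepartitionExec is scaled; stop descending here.
            Ok(Transformed::new(scaled, true, TreeNodeRecursion::Stop))
        } else if matches!(node.output_partitioning(), Partitioning::Hash(_, _)) {
            // Hash-preserving passthrough (e.g. CoalesceBatchesExec): keep it and recurse.
            Ok(Transformed::no(node))
        } else {
            Err(DataFusionError::Plan(format!(
                "expected a hash-partitioned input, found {}",
                node.name()
            )))
        }
    })?;
    Ok(transformed.data)
}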

src/test_utils/insta.rs

Lines changed: 1 addition & 0 deletions
@@ -164,6 +164,7 @@ pub fn settings() -> insta::Settings {
     settings.add_filter(r"input_batches=\d+", "input_batches=<metric>");
     settings.add_filter(r"input_rows=\d+", "input_rows=<metric>");
     settings.add_filter(r"output_batches=\d+", "output_batches=<metric>");
+    settings.add_filter(r"output_bytes=\d+.\d [(B)|(Mb)]", "output_bytes=<metric>");
     settings.add_filter(r"build_mem_used=\d+", "build_mem_used=<metric>");
     settings.add_filter(r"build_time=[\d.]+[a-zA-Zµnms]+", "build_time=<metric>");
     settings.add_filter(r"join_time=[\d.]+[a-zA-Zµnms]+", "join_time=<metric>");
