
Commit f653b10

Merge branch 'main' into gabrielmusat/collect-metrics-optionally
# Conflicts:
#   benchmarks/cdk/bin/datafusion-bench.ts
#   src/distributed_planner/distributed_config.rs
2 parents: bce118b + f6dfaa6

File tree

10 files changed: +1036 −993 lines changed


benchmarks/cdk/bin/datafusion-bench.ts

Lines changed: 3 additions & 0 deletions
@@ -15,6 +15,7 @@ async function main () {
     .option('-i, --iterations <number>', 'Number of iterations', '3')
     .option('--files-per-task <number>', 'Files per task', '4')
     .option('--cardinality-task-sf <number>', 'Cardinality task scale factor', '2')
+    .option('--shuffle-batch-size <number>', 'Shuffle batch coalescing size (number of rows)', '8192')
     .option('--collect-metrics <boolean>', 'Propagates metric collection', 'true')
     .option('--query <number>', 'A specific query to run', undefined)
     .parse(process.argv);
@@ -25,6 +26,7 @@ async function main () {
   const iterations = parseInt(options.iterations);
   const filesPerTask = parseInt(options.filesPerTask);
   const cardinalityTaskSf = parseInt(options.cardinalityTaskSf);
+  const shuffleBatchSize = parseInt(options.shuffleBatchSize);
   const collectMetrics = options.collectMetrics === 'true' || options.collectMetrics === 1

   // Compare with previous results first
@@ -36,6 +38,7 @@ async function main () {
   await query(`
     SET distributed.files_per_task=${filesPerTask};
     SET distributed.cardinality_task_count_factor=${cardinalityTaskSf};
+    SET distributed.shuffle_batch_size=${shuffleBatchSize};
     SET distributed.collect_metrics=${collectMetrics}
   `)
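
For context on how these flags reach the engine: the benchmark simply issues plain SQL SET statements for the distributed.* options. The following is a minimal Rust sketch of doing the same against a DataFusion SessionContext, assuming the session was built with the crate's distributed configuration extension registered (that registration step is not part of this diff, and the function name here is hypothetical):

    use datafusion::error::Result;
    use datafusion::prelude::*;

    // Sketch only: assumes `ctx` has the `distributed.*` extension options
    // registered; the SET statements mirror the ones the benchmark sends.
    async fn apply_bench_settings(ctx: &SessionContext, shuffle_batch_size: usize) -> Result<()> {
        ctx.sql(&format!("SET distributed.shuffle_batch_size = {shuffle_batch_size}")).await?;
        ctx.sql("SET distributed.collect_metrics = true").await?;
        Ok(())
    }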

benchmarks/cdk/lib/cdk-stack.ts

Lines changed: 7 additions & 10 deletions
@@ -92,16 +92,11 @@ export class CdkStack extends Stack {
     for (let i = 0; i < config.instanceCount; i++) {
       const userData = ec2.UserData.forLinux();

-      // Download worker binary from S3 asset
-      userData.addS3DownloadCommand({
-        bucket: workerBinary.bucket,
-        bucketKey: workerBinary.s3ObjectKey,
-        localFile: '/usr/local/bin/worker',
-      });
-
       userData.addCommands(
-        // Make binary executable
-        'chmod +x /usr/local/bin/worker',
+        // Install Rust tooling.
+        'yum install gcc',
+        "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh",
+        'cargo install --locked tokio-console',

         // Create systemd service
         `cat > /etc/systemd/system/worker.service << 'EOF'
@@ -160,7 +155,9 @@ sudo journalctl -u worker.service -f -o cat
       description: 'Session Manager commands to connect to instances',
     });

-    // Custom resource to restart worker service on every deploy
+    // Downloads the latest version of the worker binary and restarts the systemd service.
+    // This is done instead of the userData.addS3Download() so that the instance does not need
+    // to restart every time a new worker binary is available.
     const restartWorker = new cr.AwsCustomResource(this, 'RestartWorkerService', {
       onUpdate: {
         service: 'SSM',

src/distributed_planner/distributed_config.rs

Lines changed: 7 additions & 0 deletions
@@ -24,6 +24,13 @@ extensions_options! {
     /// - If a node reduces the cardinality of the data, this factor will decrease.
     /// - In any other situation, this factor is left intact.
     pub cardinality_task_count_factor: f64, default = cardinality_task_count_factor_default()
+    /// Upon shuffling over the network, data streams need to be disassembled in a lot of output
+    /// partitions, which means the resulting streams might contain a lot of tiny record batches
+    /// to be sent over the wire. This parameter controls the batch size in number of rows for
+    /// the CoalesceBatchExec operator that is placed at the top of the stage for sending bigger
+    /// batches over the wire.
+    /// If set to 0, batch coalescing is disabled on network shuffle operations.
+    pub shuffle_batch_size: usize, default = 8192
     /// Propagate collected metrics from all nodes in the plan across network boundaries
     /// so that they can be reconstructed on the head node of the plan.
     pub collect_metrics: bool, default = false
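
The behavior this option controls amounts to conditionally wrapping the stage root in DataFusion's CoalesceBatchesExec. The helper below is a hypothetical sketch of those semantics (the actual call site lives in the planner code not shown in this diff): wrap the plan when the option is non-zero, leave it untouched when it is 0.

    use std::sync::Arc;
    use datafusion::physical_plan::ExecutionPlan;
    use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec;

    // Hypothetical helper illustrating the option's semantics: merge the tiny
    // record batches produced by hash repartitioning into batches of
    // `shuffle_batch_size` rows before they cross the network, and skip
    // coalescing entirely when the option is set to 0.
    fn coalesce_for_shuffle(
        plan: Arc<dyn ExecutionPlan>,
        shuffle_batch_size: usize,
    ) -> Arc<dyn ExecutionPlan> {
        if shuffle_batch_size == 0 {
            plan
        } else {
            Arc::new(CoalesceBatchesExec::new(plan, shuffle_batch_size))
        }
    }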

src/distributed_planner/distributed_physical_optimizer_rule.rs

Lines changed: 72 additions & 53 deletions
Large diffs are not rendered by default.

src/execution_plans/network_shuffle.rs

Lines changed: 22 additions & 10 deletions
@@ -17,6 +17,7 @@ use arrow_flight::decode::FlightRecordBatchStream;
 use arrow_flight::error::FlightError;
 use bytes::Bytes;
 use dashmap::DashMap;
+use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion};
 use datafusion::common::{exec_err, internal_datafusion_err, plan_err};
 use datafusion::error::DataFusionError;
 use datafusion::execution::{SendableRecordBatchStream, TaskContext};
@@ -176,18 +177,29 @@ impl NetworkBoundary for NetworkShuffleExec {
             return plan_err!("cannot only return wrapped child if on Pending state");
         };

-        // TODO: Avoid downcasting once https://github.com/apache/datafusion/pull/17990 is shipped.
-        let Some(r_exe) = pending.input.as_any().downcast_ref::<RepartitionExec>() else {
-            return plan_err!("NetworkShuffleExec.input must always be RepartitionExec");
-        };
-
-        let next_stage_plan = Arc::new(RepartitionExec::try_new(
-            require_one_child(r_exe.children())?,
-            scale_partitioning(r_exe.partitioning(), |p| p * n_tasks),
-        )?);
+        let transformed = Arc::clone(&pending.input).transform_down(|plan| {
+            if let Some(r_exe) = plan.as_any().downcast_ref::<RepartitionExec>() {
+                // Scale the input RepartitionExec to account for all the tasks to which it will
+                // need to fan data out.
+                let scaled = Arc::new(RepartitionExec::try_new(
+                    require_one_child(r_exe.children())?,
+                    scale_partitioning(r_exe.partitioning(), |p| p * n_tasks),
+                )?);
+                Ok(Transformed::new(scaled, true, TreeNodeRecursion::Stop))
+            } else if matches!(plan.output_partitioning(), Partitioning::Hash(_, _)) {
+                // This might be a passthrough node, like a CoalesceBatchesExec or something like that.
+                // This is fine, we can let the node be here.
+                Ok(Transformed::no(plan))
+            } else {
+                return plan_err!(
+                    "NetworkShuffleExec input must be hash partitioned, but {} is not",
+                    plan.name()
+                );
+            }
+        })?;

         Ok(InputStageInfo {
-            plan: next_stage_plan,
+            plan: transformed.data,
             task_count: pending.input_tasks,
         })
     }
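
The change above swaps a direct downcast of the immediate child for a TreeNode::transform_down walk, so that pass-through operators such as CoalesceBatchesExec can sit between NetworkShuffleExec and the RepartitionExec that needs rewriting. Below is a stripped-down sketch of that traversal pattern; the function name is hypothetical, and the project-specific helpers (require_one_child, scale_partitioning) are factored out into a caller-provided closure.

    use std::sync::Arc;
    use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion};
    use datafusion::error::Result;
    use datafusion::physical_plan::ExecutionPlan;
    use datafusion::physical_plan::repartition::RepartitionExec;

    // Walk the plan top-down, rewrite the first RepartitionExec found, and stop
    // recursing below it; every other node is passed through unchanged.
    fn rewrite_first_repartition(
        plan: Arc<dyn ExecutionPlan>,
        rewrite: impl Fn(&RepartitionExec) -> Result<Arc<dyn ExecutionPlan>>,
    ) -> Result<Arc<dyn ExecutionPlan>> {
        let transformed = plan.transform_down(|node| {
            if let Some(repartition) = node.as_any().downcast_ref::<RepartitionExec>() {
                let rewritten = rewrite(repartition)?;
                // `true` marks the node as changed; Stop skips its subtree.
                Ok(Transformed::new(rewritten, true, TreeNodeRecursion::Stop))
            } else {
                Ok(Transformed::no(node))
            }
        })?;
        Ok(transformed.data)
    }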

src/test_utils/insta.rs

Lines changed: 4 additions & 0 deletions
@@ -164,6 +164,10 @@ pub fn settings() -> insta::Settings {
     settings.add_filter(r"input_batches=\d+", "input_batches=<metric>");
     settings.add_filter(r"input_rows=\d+", "input_rows=<metric>");
     settings.add_filter(r"output_batches=\d+", "output_batches=<metric>");
+    settings.add_filter(
+        r"output_bytes=\d+.\d [(B)|(KB)|(MB)]",
+        "output_bytes=<metric>",
+    );
     settings.add_filter(r"build_mem_used=\d+", "build_mem_used=<metric>");
     settings.add_filter(r"build_time=[\d.]+[a-zA-Zµnms]+", "build_time=<metric>");
     settings.add_filter(r"join_time=[\d.]+[a-zA-Zµnms]+", "join_time=<metric>");

tests/distributed_aggregation.rs

Lines changed: 12 additions & 12 deletions
@@ -59,14 +59,14 @@ mod tests {
 │ SortExec: expr=[count(*)@0 ASC NULLS LAST], preserve_partitioning=[true]
 │ ProjectionExec: expr=[count(Int64(1))@1 as count(*), RainToday@0 as RainToday, count(Int64(1))@1 as count(Int64(1))]
 │ AggregateExec: mode=FinalPartitioned, gby=[RainToday@0 as RainToday], aggr=[count(Int64(1))]
-│ CoalesceBatchesExec: target_batch_size=8192
-│ [Stage 1] => NetworkShuffleExec: output_partitions=3, input_tasks=3
+│ [Stage 1] => NetworkShuffleExec: output_partitions=3, input_tasks=3
 └──────────────────────────────────────────────────
 ┌───── Stage 1 ── Tasks: t0:[p0..p5] t1:[p0..p5] t2:[p0..p5]
-│ RepartitionExec: partitioning=Hash([RainToday@0], 6), input_partitions=1
-│ AggregateExec: mode=Partial, gby=[RainToday@0 as RainToday], aggr=[count(Int64(1))]
-│ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
-│ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[RainToday], file_type=parquet
+│ CoalesceBatchesExec: target_batch_size=8192
+│ RepartitionExec: partitioning=Hash([RainToday@0], 6), input_partitions=1
+│ AggregateExec: mode=Partial, gby=[RainToday@0 as RainToday], aggr=[count(Int64(1))]
+│ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
+│ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[RainToday], file_type=parquet
 └──────────────────────────────────────────────────
 ",
 );
@@ -141,14 +141,14 @@ mod tests {
 ┌───── Stage 2 ── Tasks: t0:[p0..p2] t1:[p0..p2]
 │ ProjectionExec: expr=[count(Int64(1))@1 as count(*), RainToday@0 as RainToday]
 │ AggregateExec: mode=FinalPartitioned, gby=[RainToday@0 as RainToday], aggr=[count(Int64(1))]
-│ CoalesceBatchesExec: target_batch_size=8192
-│ [Stage 1] => NetworkShuffleExec: output_partitions=3, input_tasks=3
+│ [Stage 1] => NetworkShuffleExec: output_partitions=3, input_tasks=3
 └──────────────────────────────────────────────────
 ┌───── Stage 1 ── Tasks: t0:[p0..p5] t1:[p0..p5] t2:[p0..p5]
-│ RepartitionExec: partitioning=Hash([RainToday@0], 6), input_partitions=1
-│ AggregateExec: mode=Partial, gby=[RainToday@0 as RainToday], aggr=[count(Int64(1))]
-│ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
-│ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[RainToday], file_type=parquet
+│ CoalesceBatchesExec: target_batch_size=8192
+│ RepartitionExec: partitioning=Hash([RainToday@0], 6), input_partitions=1
+│ AggregateExec: mode=Partial, gby=[RainToday@0 as RainToday], aggr=[count(Int64(1))]
+│ PartitionIsolatorExec: t0:[p0,__,__] t1:[__,p0,__] t2:[__,__,p0]
+│ DataSourceExec: file_groups={3 groups: [[/testdata/weather/result-000000.parquet], [/testdata/weather/result-000001.parquet], [/testdata/weather/result-000002.parquet]]}, projection=[RainToday], file_type=parquet
 └──────────────────────────────────────────────────
 ",
 );
