Skip to content

Commit 296f338

Browse files
committed
feat: add shuffle size comparison benchmark for #3882
Add a PySpark benchmark that measures shuffle write bytes via the Spark REST API, making it easy to compare shuffle file sizes between Spark and Comet shuffle implementations.
1 parent 67624a8 commit 296f338

File tree

3 files changed

+230
-0
lines changed

3 files changed

+230
-0
lines changed

benchmarks/pyspark/benchmarks/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,14 @@
2626

2727
from .base import Benchmark
2828
from .shuffle import ShuffleHashBenchmark, ShuffleRoundRobinBenchmark
29+
from .shuffle_size import ShuffleSizeBenchmark
2930

3031

3132
# Registry of all available benchmarks
# Maps each benchmark's CLI name (the value of its name() classmethod) to the
# implementing class, so the runner can look a benchmark up by name
# (presumably via the --benchmark argument — see run_benchmark.py callers).
_BENCHMARK_REGISTRY: Dict[str, Type[Benchmark]] = {
    ShuffleHashBenchmark.name(): ShuffleHashBenchmark,
    ShuffleRoundRobinBenchmark.name(): ShuffleRoundRobinBenchmark,
    ShuffleSizeBenchmark.name(): ShuffleSizeBenchmark,
}
3638

3739

@@ -76,4 +78,5 @@ def list_benchmarks() -> List[tuple[str, str]]:
7678
'list_benchmarks',
7779
'ShuffleHashBenchmark',
7880
'ShuffleRoundRobinBenchmark',
81+
'ShuffleSizeBenchmark',
7982
]
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
#!/usr/bin/env python3
2+
# Licensed to the Apache Software Foundation (ASF) under one
3+
# or more contributor license agreements. See the NOTICE file
4+
# distributed with this work for additional information
5+
# regarding copyright ownership. The ASF licenses this file
6+
# to you under the Apache License, Version 2.0 (the
7+
# "License"); you may not use this file except in compliance
8+
# with the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing,
13+
# software distributed under the License is distributed on an
14+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
# KIND, either express or implied. See the License for the
16+
# specific language governing permissions and limitations
17+
# under the License.
18+
19+
"""
20+
Shuffle size benchmark for measuring shuffle write bytes.
21+
22+
Measures the actual shuffle write bytes reported by Spark to compare
23+
shuffle file sizes between Spark and Comet shuffle implementations.
24+
This is useful for investigating shuffle format overhead (see issue #3882).
25+
"""
26+
27+
import json
28+
import urllib.request
29+
from typing import Dict, Any
30+
31+
from pyspark.sql import DataFrame
32+
33+
from .base import Benchmark
34+
35+
36+
def get_shuffle_write_bytes(spark, timeout: float = 10.0) -> int:
    """Return total shuffle write bytes for *spark*'s application.

    Queries the Spark UI REST API (``/api/v1/applications/<id>/stages``)
    and sums the ``shuffleWriteBytes`` field over every reported stage.

    Args:
        spark: An active SparkSession whose UI is enabled.
        timeout: Socket timeout in seconds for the REST call, so a hung
            or unreachable UI cannot block the benchmark indefinitely.

    Returns:
        Total shuffle write bytes across all stages (0 if none reported).

    Raises:
        RuntimeError: If the Spark UI is disabled (``uiWebUrl`` is None).
        urllib.error.URLError: If the REST endpoint cannot be reached.
    """
    sc = spark.sparkContext
    ui_url = sc.uiWebUrl
    if ui_url is None:
        # spark.ui.enabled=false leaves uiWebUrl unset; fail with a clear
        # message instead of building a bogus "None/api/v1/..." URL.
        raise RuntimeError(
            "Spark UI is disabled (uiWebUrl is None); cannot read shuffle metrics"
        )
    url = f"{ui_url}/api/v1/applications/{sc.applicationId}/stages"
    # Explicit timeout: the original call could hang forever on a stuck UI.
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        stages = json.loads(resp.read())
    return sum(s.get("shuffleWriteBytes", 0) for s in stages)
44+
45+
46+
def format_bytes(b: int) -> str:
    """Render a byte count as a human-readable KiB/MiB/GiB string.

    Anything below 1 MiB is expressed in KiB, so very small counts come
    out fractional (e.g. 512 -> "0.50 KiB").
    """
    gib = 1024 ** 3
    mib = 1024 ** 2
    if b >= gib:
        value, unit = b / gib, "GiB"
    elif b >= mib:
        value, unit = b / mib, "MiB"
    else:
        value, unit = b / 1024, "KiB"
    return f"{value:.2f} {unit}"
54+
55+
56+
class ShuffleSizeBenchmark(Benchmark):
    """Benchmark that measures shuffle write bytes via the Spark REST API.

    Runs a simple scan -> repartition -> write pipeline and reports the
    shuffle write size alongside wall-clock time. Useful for comparing
    shuffle format overhead between Spark and Comet.
    """

    def __init__(self, spark, data_path: str, mode: str,
                 num_partitions: int = 200):
        super().__init__(spark, data_path, mode)
        # Target partition count for the repartition() that forces a shuffle.
        self.num_partitions = num_partitions

    @classmethod
    def name(cls) -> str:
        return "shuffle-size"

    @classmethod
    def description(cls) -> str:
        return "Measure shuffle write bytes (scan -> repartition -> write)"

    def run(self) -> Dict[str, Any]:
        """Execute the benchmark and return its metrics as a dict."""
        source = self.spark.read.parquet(self.data_path)
        total_rows = source.count()
        print(f"Input rows: {total_rows:,}")

        field_summaries = [
            f"{field.name}: {field.dataType.simpleString()}"
            for field in source.schema.fields
        ]
        print(f"Schema: {', '.join(field_summaries)}")

        destination = f"/tmp/shuffle-size-benchmark-output-{self.mode}"

        def shuffle_and_write():
            # repartition() forces a full shuffle; writing out the result
            # materializes it so Spark records the shuffle write metrics.
            shuffled = source.repartition(self.num_partitions)
            shuffled.write.mode("overwrite").parquet(destination)

        elapsed_ms = self._time_operation(shuffle_and_write)

        written_bytes = 0
        try:
            written_bytes = get_shuffle_write_bytes(self.spark)
        except Exception as e:
            # Metrics are best-effort: a disabled UI or a REST hiccup
            # should not fail the benchmark run itself.
            print(f"Warning: could not read shuffle metrics: {e}")

        per_record = written_bytes / total_rows if total_rows > 0 else 0

        print(f"Shuffle write: {format_bytes(written_bytes)}")
        print(f"Bytes/record: {per_record:.1f}")

        return {
            "duration_ms": elapsed_ms,
            "row_count": total_rows,
            "num_partitions": self.num_partitions,
            "shuffle_write_bytes": written_bytes,
            "bytes_per_record": round(per_record, 1),
        }
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#

# Compare shuffle write sizes between Spark and Comet shuffle.
#
# This benchmark measures actual shuffle write bytes reported by Spark
# to quantify the overhead of Comet's Arrow IPC shuffle format.
# See https://github.com/apache/datafusion-comet/issues/3882
#
# Prerequisites:
#   - SPARK_HOME set to a Spark 3.5 installation
#   - Comet JAR built (make)
#   - Input parquet data generated (see generate_data.py)
#
# Usage:
#   ./run_shuffle_size_benchmark.sh /path/to/parquet/data
#
# Environment variables:
#   COMET_JAR         Path to Comet JAR (default: auto-detected from repo)
#   SPARK_MASTER      Spark master URL (default: local[*])
#   EXECUTOR_MEMORY   Executor memory (default: 16g)
#   OFFHEAP_SIZE      Off-heap memory for Comet (default: 16g)

# -e: abort on any failing command; -u: treat unset variables as errors;
# -o pipefail: a failure anywhere in a pipeline fails the pipeline.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DATA_PATH="${1:?Usage: $0 /path/to/parquet/data}"
COMET_JAR="${COMET_JAR:-$SCRIPT_DIR/../../spark/target/comet-spark-spark3.5_2.12-0.15.0-SNAPSHOT.jar}"
SPARK_MASTER="${SPARK_MASTER:-local[*]}"
EXECUTOR_MEMORY="${EXECUTOR_MEMORY:-16g}"
OFFHEAP_SIZE="${OFFHEAP_SIZE:-16g}"

# ${SPARK_HOME:-} so the check itself does not trip `set -u` when unset.
if [ -z "${SPARK_HOME:-}" ]; then
    echo "Error: SPARK_HOME is not set"
    exit 1
fi

if [ ! -f "$COMET_JAR" ]; then
    echo "Error: Comet JAR not found at $COMET_JAR"
    echo "Build with 'make' or set COMET_JAR to the correct path."
    exit 1
fi

echo "========================================"
echo "Shuffle Size Comparison Benchmark"
echo "========================================"
echo "Data path:       $DATA_PATH"
echo "Comet JAR:       $COMET_JAR"
echo "Spark master:    $SPARK_MASTER"
echo "Executor memory: $EXECUTOR_MEMORY"
echo "Off-heap size:   $OFFHEAP_SIZE"
echo "========================================"

# Run Spark baseline (no Comet)
echo ""
echo ">>> Running SPARK (no Comet) shuffle size benchmark..."
# Quote the spark-submit path: SPARK_HOME may contain spaces.
"$SPARK_HOME/bin/spark-submit" \
    --master "$SPARK_MASTER" \
    --executor-memory "$EXECUTOR_MEMORY" \
    --conf spark.comet.enabled=false \
    "$SCRIPT_DIR/run_benchmark.py" \
    --data "$DATA_PATH" \
    --mode spark \
    --benchmark shuffle-size

# Run Comet Native shuffle
echo ""
echo ">>> Running COMET NATIVE shuffle size benchmark..."
"$SPARK_HOME/bin/spark-submit" \
    --master "$SPARK_MASTER" \
    --executor-memory "$EXECUTOR_MEMORY" \
    --jars "$COMET_JAR" \
    --driver-class-path "$COMET_JAR" \
    --conf spark.executor.extraClassPath="$COMET_JAR" \
    --conf spark.plugins=org.apache.spark.CometPlugin \
    --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
    --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \
    --conf spark.memory.offHeap.enabled=true \
    --conf spark.memory.offHeap.size="$OFFHEAP_SIZE" \
    --conf spark.comet.enabled=true \
    --conf spark.comet.exec.shuffle.mode=native \
    --conf spark.comet.explainFallback.enabled=true \
    "$SCRIPT_DIR/run_benchmark.py" \
    --data "$DATA_PATH" \
    --mode native \
    --benchmark shuffle-size

echo ""
echo "========================================"
echo "BENCHMARK COMPLETE"
echo "========================================"
echo "Compare 'Shuffle write' and 'Bytes/record' between the two runs above."

0 commit comments

Comments
 (0)