Skip to content

Commit 3fd1be2

Browse files
committed
feat: measure on-disk shuffle sizes and add short-strings data generator
- Add --schema short-strings option to generate_data.py that produces 7 short random UUID string columns + 1 timestamp, matching the schema from issue #3882
- Update shuffle_size.py to measure actual shuffle .data file sizes on disk via spark.local.dir, in addition to the REST API metric
- Update run_shuffle_size_benchmark.sh with dedicated local dirs per run, driver memory, and shuffle enable config
1 parent 296f338 commit 3fd1be2

File tree

3 files changed

+124
-18
lines changed

3 files changed

+124
-18
lines changed

benchmarks/pyspark/benchmarks/shuffle_size.py

Lines changed: 50 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,16 @@
1919
"""
2020
Shuffle size benchmark for measuring shuffle write bytes.
2121
22-
Measures the actual shuffle write bytes reported by Spark to compare
22+
Measures the actual shuffle file sizes on disk to compare
2323
shuffle file sizes between Spark and Comet shuffle implementations.
2424
This is useful for investigating shuffle format overhead (see issue #3882).
25+
26+
The benchmark sets spark.local.dir to a dedicated temp directory and
27+
measures the total size of shuffle data files (.data) written there.
2528
"""
2629

2730
import json
31+
import os
2832
import urllib.request
2933
from typing import Dict, Any
3034

@@ -43,6 +47,16 @@ def get_shuffle_write_bytes(spark) -> int:
4347
return sum(s.get("shuffleWriteBytes", 0) for s in stages)
4448

4549

50+
def get_shuffle_disk_bytes(local_dir: str) -> int:
51+
"""Walk spark.local.dir and sum the sizes of all shuffle .data files."""
52+
total = 0
53+
for root, _dirs, files in os.walk(local_dir):
54+
for f in files:
55+
if f.endswith(".data"):
56+
total += os.path.getsize(os.path.join(root, f))
57+
return total
58+
59+
4660
def format_bytes(b: int) -> str:
4761
"""Format byte count as human-readable string."""
4862
if b >= 1024 ** 3:
@@ -55,11 +69,17 @@ def format_bytes(b: int) -> str:
5569

5670
class ShuffleSizeBenchmark(Benchmark):
5771
"""
58-
Benchmark that measures shuffle write bytes via the Spark REST API.
72+
Benchmark that measures shuffle write bytes on disk.
73+
74+
Runs a simple scan -> repartition -> count pipeline and reports
75+
the actual shuffle data file sizes alongside the Spark REST API
76+
metric. Useful for comparing shuffle format overhead between
77+
Spark and Comet.
5978
60-
Runs a simple scan -> repartition -> write pipeline and reports
61-
the shuffle write size alongside wall-clock time. Useful for
62-
comparing shuffle format overhead between Spark and Comet.
79+
NOTE: The Spark session must be configured with spark.local.dir
80+
pointing to a dedicated empty directory so that we can measure
81+
shuffle file sizes accurately. The run_shuffle_size_benchmark.sh
82+
script handles this automatically.
6383
"""
6484

6585
def __init__(self, spark, data_path: str, mode: str,
@@ -73,7 +93,7 @@ def name(cls) -> str:
7393

7494
@classmethod
7595
def description(cls) -> str:
76-
return "Measure shuffle write bytes (scan -> repartition -> write)"
96+
return "Measure shuffle write bytes (scan -> repartition -> count)"
7797

7898
def run(self) -> Dict[str, Any]:
7999
df = self.spark.read.parquet(self.data_path)
@@ -85,6 +105,11 @@ def run(self) -> Dict[str, Any]:
85105
)
86106
print(f"Schema: {schema_desc}")
87107

108+
# Read spark.local.dir so we can measure shuffle files on disk
109+
local_dir = self.spark.sparkContext.getConf().get(
110+
"spark.local.dir", "/tmp"
111+
)
112+
88113
output_path = (
89114
f"/tmp/shuffle-size-benchmark-output-{self.mode}"
90115
)
@@ -96,23 +121,32 @@ def benchmark_operation():
96121

97122
duration_ms = self._time_operation(benchmark_operation)
98123

99-
shuffle_write_bytes = 0
124+
# Measure actual shuffle file sizes on disk.
125+
# Shuffle .data files persist until SparkContext shutdown,
126+
# so they are still available after the job completes.
127+
disk_bytes = get_shuffle_disk_bytes(local_dir)
128+
129+
# Also grab the REST API metric for comparison
130+
api_bytes = 0
100131
try:
101-
shuffle_write_bytes = get_shuffle_write_bytes(self.spark)
132+
api_bytes = get_shuffle_write_bytes(self.spark)
102133
except Exception as e:
103-
print(f"Warning: could not read shuffle metrics: {e}")
134+
print(f"Warning: could not read shuffle metrics from REST API: {e}")
104135

105-
bytes_per_record = (
106-
shuffle_write_bytes / row_count if row_count > 0 else 0
107-
)
136+
disk_bpr = disk_bytes / row_count if row_count > 0 else 0
137+
api_bpr = api_bytes / row_count if row_count > 0 else 0
108138

109-
print(f"Shuffle write: {format_bytes(shuffle_write_bytes)}")
110-
print(f"Bytes/record: {bytes_per_record:.1f}")
139+
print(f"Shuffle disk: {format_bytes(disk_bytes)} "
140+
f"({disk_bpr:.1f} B/record)")
141+
print(f"Shuffle API metric: {format_bytes(api_bytes)} "
142+
f"({api_bpr:.1f} B/record)")
111143

112144
return {
113145
"duration_ms": duration_ms,
114146
"row_count": row_count,
115147
"num_partitions": self.num_partitions,
116-
"shuffle_write_bytes": shuffle_write_bytes,
117-
"bytes_per_record": round(bytes_per_record, 1),
148+
"shuffle_disk_bytes": disk_bytes,
149+
"shuffle_disk_bytes_per_record": round(disk_bpr, 1),
150+
"shuffle_api_bytes": api_bytes,
151+
"shuffle_api_bytes_per_record": round(api_bpr, 1),
118152
}

benchmarks/pyspark/generate_data.py

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,53 @@ def generate_data(output_path: str, num_rows: int, num_partitions: int):
412412
spark.stop()
413413

414414

415+
def generate_short_strings_data(output_path: str, num_rows: int,
                                num_partitions: int):
    """Generate data matching the schema from issue #3882.

    Reproduces the problematic scenario: 7 short unique string columns + 1
    timestamp column. The original reporter saw 3x shuffle overhead with
    204M records of this shape (25.1 B/record in Comet vs 8.3 B/record in
    Spark).
    """

    spark = SparkSession.builder \
        .appName("ShuffleBenchmark-DataGen-ShortStrings") \
        .getOrCreate()

    print(f"Generating {num_rows:,} rows with {num_partitions} partitions")
    print(f"Output path: {output_path}")
    print("Schema: 7 short unique string columns + 1 timestamp (issue #3882)")

    base = spark.range(0, num_rows, numPartitions=num_partitions)

    # Build the 7 short string column expressions programmatically. uuid()
    # yields truly random strings that defeat compression, exposing Arrow
    # IPC per-batch overhead (mimics the reporter's schema).
    string_exprs = [
        f"substring(uuid(), 1, 8) as str_col_{i}" for i in range(1, 8)
    ]
    df = base.selectExpr(
        *string_exprs,
        # Timestamp column
        "timestamp_seconds(1600000000 + id) as ts_col",
    )

    print(f"Generated schema with {len(df.columns)} columns")
    df.printSchema()

    df.write.mode("overwrite").parquet(output_path)

    # Read back what was written as a sanity check on the row count.
    written_df = spark.read.parquet(output_path)
    actual_count = written_df.count()
    print(f"Wrote {actual_count:,} rows to {output_path}")

    spark.stop()
460+
461+
415462
def main():
416463
parser = argparse.ArgumentParser(
417464
description="Generate test data for shuffle benchmark"
@@ -433,13 +480,24 @@ def main():
433480
default=None,
434481
help="Number of output partitions (default: auto based on cluster)"
435482
)
483+
parser.add_argument(
484+
"--schema", "-s",
485+
choices=["wide", "short-strings"],
486+
default="wide",
487+
help="Schema to generate: 'wide' (100 columns with nested types) "
488+
"or 'short-strings' (7 short unique strings + 1 timestamp, "
489+
"matches issue #3882)"
490+
)
436491

437492
args = parser.parse_args()
438493

439494
# Default partitions to a reasonable number if not specified
440495
num_partitions = args.partitions if args.partitions else 200
441496

442-
generate_data(args.output, args.rows, num_partitions)
497+
if args.schema == "short-strings":
498+
generate_short_strings_data(args.output, args.rows, num_partitions)
499+
else:
500+
generate_data(args.output, args.rows, num_partitions)
443501

444502

445503
if __name__ == "__main__":

benchmarks/pyspark/run_shuffle_size_benchmark.sh

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,12 +68,23 @@ echo "Executor memory: $EXECUTOR_MEMORY"
6868
echo "Off-heap size: $OFFHEAP_SIZE"
6969
echo "========================================"
7070

71+
# Use dedicated local dirs so we can measure actual shuffle file sizes on disk
72+
SPARK_LOCAL_DIR=$(mktemp -d /tmp/spark-shuffle-bench-spark-XXXXXX)
73+
COMET_LOCAL_DIR=$(mktemp -d /tmp/spark-shuffle-bench-comet-XXXXXX)
74+
75+
cleanup() {
76+
rm -rf "$SPARK_LOCAL_DIR" "$COMET_LOCAL_DIR"
77+
}
78+
trap cleanup EXIT
79+
7180
# Run Spark baseline (no Comet)
7281
echo ""
7382
echo ">>> Running SPARK (no Comet) shuffle size benchmark..."
7483
$SPARK_HOME/bin/spark-submit \
7584
--master "$SPARK_MASTER" \
85+
--driver-memory "$EXECUTOR_MEMORY" \
7686
--executor-memory "$EXECUTOR_MEMORY" \
87+
--conf spark.local.dir="$SPARK_LOCAL_DIR" \
7788
--conf spark.comet.enabled=false \
7889
"$SCRIPT_DIR/run_benchmark.py" \
7990
--data "$DATA_PATH" \
@@ -85,16 +96,19 @@ echo ""
8596
echo ">>> Running COMET NATIVE shuffle size benchmark..."
8697
$SPARK_HOME/bin/spark-submit \
8798
--master "$SPARK_MASTER" \
99+
--driver-memory "$EXECUTOR_MEMORY" \
88100
--executor-memory "$EXECUTOR_MEMORY" \
89101
--jars "$COMET_JAR" \
90102
--driver-class-path "$COMET_JAR" \
91103
--conf spark.executor.extraClassPath="$COMET_JAR" \
104+
--conf spark.local.dir="$COMET_LOCAL_DIR" \
92105
--conf spark.plugins=org.apache.spark.CometPlugin \
93106
--conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
94107
--conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \
95108
--conf spark.memory.offHeap.enabled=true \
96109
--conf spark.memory.offHeap.size="$OFFHEAP_SIZE" \
97110
--conf spark.comet.enabled=true \
111+
--conf spark.comet.exec.shuffle.enabled=true \
98112
--conf spark.comet.exec.shuffle.mode=native \
99113
--conf spark.comet.explainFallback.enabled=true \
100114
"$SCRIPT_DIR/run_benchmark.py" \
@@ -106,4 +120,4 @@ echo ""
106120
echo "========================================"
107121
echo "BENCHMARK COMPLETE"
108122
echo "========================================"
109-
echo "Compare 'Shuffle write' and 'Bytes/record' between the two runs above."
123+
echo "Compare 'Shuffle disk' bytes/record between the two runs above."

0 commit comments

Comments (0)