vortex-data · connortsui20 · Dec 30, 2025 · Dec 30, 2025 · Dec 30, 2025
diff --git a/.github/scripts/run-sql-bench.sh b/.github/scripts/run-sql-bench.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright the Vortex contributors
+#
+# Runs SQL benchmarks (datafusion-bench, duckdb-bench, lance-bench) for the given targets.
+# This script is used by the sql-benchmarks.yml workflow.
+#
+# Usage:
+#   run-sql-bench.sh <subcommand> <targets> [options]
+#
+# Arguments:
+#   subcommand   The benchmark subcommand (e.g., tpch, clickbench, tpcds)
+#   targets      Comma-separated list of engine:format pairs
+#                (e.g., "datafusion:parquet,datafusion:vortex,duckdb:parquet")
+#
+# Options:
+#   --scale-factor <sf>       Scale factor for the benchmark (e.g., 1.0, 10.0)
+#   --remote-storage <url>    Remote storage URL (e.g., s3://bucket/path/)
+#                             If provided, runs in remote mode (no lance support).
+#   --benchmark-id <id>       Benchmark ID for error messages (e.g., tpch-s3)
+
+set -Eeu -o pipefail
+
+subcommand="$1"
+targets="$2"
+shift 2
+
+scale_factor=""
+remote_storage=""
+benchmark_id=""
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --scale-factor)
+            scale_factor="$2"
+            shift 2
+            ;;
+        --remote-storage)
+            remote_storage="$2"
+            shift 2
+            ;;
+        --benchmark-id)
+            benchmark_id="$2"
+            shift 2
+            ;;
+        *)
+            echo "Unknown option: $1" >&2
+            exit 1
+            ;;
+    esac
+done
+
+is_remote=false
+if [[ -n "$remote_storage" ]]; then
+    is_remote=true
+fi
+
+# Lance on remote storage is not supported. The infrastructure to generate and upload lance files
+# to S3 does not exist. If you need lance on S3, you must first implement:
+#   1. Lance data generation in data-gen (or a separate step)
+#   2. Lance data upload to S3 before this step runs
+if $is_remote && echo "$targets" | grep -q 'lance'; then
+    echo "ERROR: Lance format is not supported for remote storage benchmarks."
+    echo "Remove 'datafusion:lance' from targets for benchmark '${benchmark_id:-unknown}'."
+    exit 1
+fi
+
+# Extract formats for each engine from the targets string.
+# Example input: "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet"
+#
+# Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas
+#
+# Lance is filtered out of df_formats because it uses a separate binary (lance-bench).
+#
+# The `|| true` is needed because some benchmarks don't use all engines (e.g., statpopgen only has
+# duckdb targets). grep returns exit code 1 when no matches are found.
+df_formats=$(echo "$targets" | tr ',' '\n' | (grep '^datafusion:' || true) | grep -v ':lance$' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
+ddb_formats=$(echo "$targets" | tr ',' '\n' | (grep '^duckdb:' || true) | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
+has_lance=$(echo "$targets" | grep -q 'datafusion:lance' && echo "true" || echo "false")
+
+# Build options string.
+opts=""
+if $is_remote; then
+    opts="--opt remote-data-dir=$remote_storage"
+fi
+if [[ -n "$scale_factor" ]]; then
+    if [[ -n "$opts" ]]; then
+        opts="--opt scale-factor=$scale_factor $opts"
+    else
+        opts="--opt scale-factor=$scale_factor"
+    fi
+fi
+
+touch results.json
+
+if [[ -n "$df_formats" ]]; then
+    # shellcheck disable=SC2086
+    target/release_debug/datafusion-bench "$subcommand" \
+        -d gh-json \
+        --formats "$df_formats" \
+        $opts \
+        -o df-results.json
+
+    cat df-results.json >> results.json
+fi
+
+if [[ -n "$ddb_formats" ]]; then
+    # shellcheck disable=SC2086
+    target/release_debug/duckdb-bench "$subcommand" \
+        -d gh-json \
+        --formats "$ddb_formats" \
+        $opts \
+        --delete-duckdb-database \
+        -o ddb-results.json
+
+    cat ddb-results.json >> results.json
+fi
+
+# Lance-bench only runs for local benchmarks.
+if ! $is_remote && [[ "$has_lance" == "true" ]] && [[ -f "target/release_debug/lance-bench" ]]; then
+    # shellcheck disable=SC2086
+    target/release_debug/lance-bench "$subcommand" \
+        -d gh-json \
+        $opts \
+        -o lance-results.json
+
+    cat lance-results.json >> results.json
+fi
diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml
@@ -190,56 +190,8 @@ jobs:
           OTEL_EXPORTER_OTLP_HEADERS: "${{ (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false) && secrets.OTEL_EXPORTER_OTLP_HEADERS || '' }}"
           OTEL_RESOURCE_ATTRIBUTES: "bench-name=${{ matrix.id }}"
         run: |
-          # Extract formats for each engine from the targets string.
-          # Example input: "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet"
-          #
-          # Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas
-
-          # Lance is filtered out of df_formats because it uses a separate binary (lance-bench).
-          df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^datafusion:' | grep -v ':lance$' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
-          ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^duckdb:' | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
-          has_lance=$(echo "${{ matrix.targets }}" | grep -q 'datafusion:lance' && echo "true" || echo "false")
-
-          # Build options string if scale_factor is set
-          opts=""
-          if [ -n "${{ matrix.scale_factor }}" ]; then
-            opts="--opt scale-factor=${{ matrix.scale_factor }}"
-          fi
-
-          touch results.json
-
-          # Run datafusion-bench
-          if [ -n "$df_formats" ]; then
-            target/release_debug/datafusion-bench ${{ matrix.subcommand }} \
-              -d gh-json \
-              --formats "$df_formats" \
-              $opts \
-              -o df-results.json
-
-            cat df-results.json >> results.json
-          fi
-
-          # Run duckdb-bench
-          if [ -n "$ddb_formats" ]; then
-            target/release_debug/duckdb-bench ${{ matrix.subcommand }} \
-              -d gh-json \
-              --formats "$ddb_formats" \
-              $opts \
-              --delete-duckdb-database \
-              -o ddb-results.json
-
-            cat ddb-results.json >> results.json
-          fi
-
-          # Run lance-bench
-          if [ "$has_lance" = "true" ] && [ -f "target/release_debug/lance-bench" ]; then
-            target/release_debug/lance-bench ${{ matrix.subcommand }} \
-              -d gh-json \
-              $opts \
-              -o lance-results.json
-
-            cat lance-results.json >> results.json
-          fi
+          .github/scripts/run-sql-bench.sh "${{ matrix.subcommand }}" "${{ matrix.targets }}" \
+            ${{ matrix.scale_factor && format('--scale-factor {0}', matrix.scale_factor) || '' }}
 
       - name: Run ${{ matrix.name }} benchmark (remote)
         if: matrix.remote_storage != null && (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false)
@@ -252,53 +204,10 @@ jobs:
           OTEL_EXPORTER_OTLP_HEADERS: "${{ (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false) && secrets.OTEL_EXPORTER_OTLP_HEADERS || '' }}"
           OTEL_RESOURCE_ATTRIBUTES: "bench-name=${{ matrix.id }}"
         run: |
-          # Lance on remote storage is not supported. The infrastructure to generate and upload
-          # lance files to S3 does not exist. If you need lance on S3, you must first implement:
-          # 1. Lance data generation in data-gen (or a separate step)
-          # 2. Lance data upload to S3 before this step runs
-          if echo "${{ matrix.targets }}" | grep -q 'lance'; then
-            echo "ERROR: Lance format is not supported for remote storage benchmarks."
-            echo "Remove 'datafusion:lance' from targets for benchmark '${{ matrix.id }}'."
-            exit 1
-          fi
-
-          # Extract formats for each engine from the targets string.
-          # Example input: "datafusion:parquet,datafusion:vortex,duckdb:parquet"
-          #
-          # Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas
-          df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^datafusion:' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
-          ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^duckdb:' | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
-
-          # Build options string if scale_factor is set
-          opts="--opt remote-data-dir=${{ matrix.remote_storage }}"
-          if [ -n "${{ matrix.scale_factor }}" ]; then
-            opts="--opt scale-factor=${{ matrix.scale_factor }} ${opts}"
-          fi
-
-          touch results.json
-
-          # Run datafusion-bench with remote storage
-          if [ -n "$df_formats" ]; then
-            target/release_debug/datafusion-bench ${{ matrix.subcommand }} \
-              -d gh-json \
-              --formats "$df_formats" \
-              $opts \
-              -o df-results.json
-
-            cat df-results.json >> results.json
-          fi
-
-          # Run duckdb-bench with remote storage
-          if [ -n "$ddb_formats" ]; then
-            target/release_debug/duckdb-bench ${{ matrix.subcommand }} \
-              -d gh-json \
-              --formats "$ddb_formats" \
-              $opts \
-              --delete-duckdb-database \
-              -o ddb-results.json
-
-            cat ddb-results.json >> results.json
-          fi
+          .github/scripts/run-sql-bench.sh "${{ matrix.subcommand }}" "${{ matrix.targets }}" \
+            --remote-storage "${{ matrix.remote_storage }}" \
+            --benchmark-id "${{ matrix.id }}" \
+            ${{ matrix.scale_factor && format('--scale-factor {0}', matrix.scale_factor) || '' }}
 
       - name: Install uv
         if: inputs.mode == 'pr'