vortex-data
diff --git a/‎.github/scripts/run-sql-bench.sh‎
Lines changed: 129 additions & 0 deletions b/‎.github/scripts/run-sql-bench.sh‎
Lines changed: 129 additions & 0 deletions
diff --git a/‎.github/workflows/bench-pr.yml‎
Lines changed: 5 additions & 6 deletions b/‎.github/workflows/bench-pr.yml‎
Lines changed: 5 additions & 6 deletions
diff --git a/‎.github/workflows/bench.yml‎
Lines changed: 23 additions & 28 deletions b/‎.github/workflows/bench.yml‎
Lines changed: 23 additions & 28 deletions
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 8 additions & 6 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 8 additions & 6 deletions
diff --git a/‎.github/workflows/docs.yml‎
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/docs.yml‎
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,129 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright the Vortex contributors
+#
+# Runs SQL benchmarks (datafusion-bench, duckdb-bench, lance-bench) for the given targets.
+# This script is used by the sql-benchmarks.yml workflow.
+#
+# Usage:
+#   run-sql-bench.sh <subcommand> <targets> [options]
+#
+# Arguments:
+#   subcommand   The benchmark subcommand (e.g., tpch, clickbench, tpcds)
+#   targets      Comma-separated list of engine:format pairs
+#                (e.g., "datafusion:parquet,datafusion:vortex,duckdb:parquet")
+#
+# Options:
+#   --scale-factor <sf>       Scale factor for the benchmark (e.g., 1.0, 10.0)
+#   --remote-storage <url>    Remote storage URL (e.g., s3://bucket/path/)
+#                             If provided, runs in remote mode (no lance support).
+#   --benchmark-id <id>       Benchmark ID for error messages (e.g., tpch-s3)
+
+set -Eeu -o pipefail
+
+subcommand="${1:?usage: run-sql-bench.sh <subcommand> <targets> [options]}"  # mandatory positional
+targets="${2:?usage: run-sql-bench.sh <subcommand> <targets> [options]}"  # mandatory positional
+shift 2
+
+scale_factor=""    # empty: no scale-factor option is forwarded
+remote_storage=""  # empty: local mode
+benchmark_id=""    # only interpolated into error messages
+
+while [[ $# -gt 0 ]]; do  # every recognized option consumes exactly one value
+    case "$1" in
+        --scale-factor)
+            scale_factor="${2?--scale-factor requires a value}"  # '?' (not ':?') still accepts ""
+            shift 2
+            ;;
+        --remote-storage)
+            remote_storage="${2?--remote-storage requires a value}"
+            shift 2
+            ;;
+        --benchmark-id)
+            benchmark_id="${2?--benchmark-id requires a value}"
+            shift 2
+            ;;
+        *)
+            echo "Unknown option: $1" >&2
+            exit 1
+            ;;
+    esac
+done
+
+is_remote=false  # becomes true when --remote-storage was given a non-empty URL
+if [[ -n "$remote_storage" ]]; then
+    is_remote=true
+fi
+
+# Lance on remote storage is not supported. The infrastructure to generate and upload lance files
+# to S3 does not exist. If you need lance on S3, you must first implement:
+#   1. Lance data generation in data-gen (or a separate step)
+#   2. Lance data upload to S3 before this step runs
+if $is_remote && [[ "$targets" == *lance* ]]; then  # substring match: any target mentioning lance
+    echo "ERROR: Lance format is not supported for remote storage benchmarks." >&2
+    echo "Remove 'datafusion:lance' from targets for benchmark '${benchmark_id:-unknown}'." >&2
+    exit 1
+fi
+
+# Split the comma-separated targets string into per-engine format lists.
+# Example input: "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet"
+#   df_formats  -> "parquet,vortex"  (lance is dropped: it uses a separate binary, lance-bench)
+#   ddb_formats -> "parquet"
+#   has_lance   -> "true"            (substring match on 'datafusion:lance' anywhere in targets)
+#
+# Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas
+# The `|| true` is needed because some benchmarks don't use all engines (e.g., statpopgen only has
+# duckdb targets). grep returns exit code 1 when no matches are found. Both greps must be in the
+# subshell so that `|| true` covers the case where grep -v receives empty input.
+df_formats=$(echo "$targets" | tr ',' '\n' | (grep '^datafusion:' | grep -v ':lance$' || true) | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
+ddb_formats=$(echo "$targets" | tr ',' '\n' | (grep '^duckdb:' || true) | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
+has_lance=$(echo "$targets" | grep -q 'datafusion:lance' && echo "true" || echo "false")
+
+# Assemble the `--opt key=value` arguments shared by the bench invocations below.
+# Resulting order: scale-factor first (when set), then remote-data-dir (when remote).
+# Kept as a plain string because the invocations expand it unquoted and rely on word splitting.
+opts=""
+if [[ -n "$scale_factor" ]]; then
+    opts="--opt scale-factor=$scale_factor"
+fi
+
+if $is_remote; then
+    # ${opts:+$opts } contributes "<existing opts> " only when opts is already non-empty.
+    opts="${opts:+$opts }--opt remote-data-dir=$remote_storage"
+fi
+
+touch results.json  # make sure the combined output file exists even if no engine below runs
+
+if [[ -n "$df_formats" ]]; then  # at least one non-lance datafusion target was requested
+    # shellcheck disable=SC2086
+    target/release_debug/datafusion-bench "$subcommand" \
+        -d gh-json \
+        --formats "$df_formats" \
+        $opts \
+        -o df-results.json
+
+    cat df-results.json >> results.json  # append this engine's output to the combined file
+fi
+
+if [[ -n "$ddb_formats" ]]; then  # at least one duckdb target was requested
+    # shellcheck disable=SC2086
+    target/release_debug/duckdb-bench "$subcommand" \
+        -d gh-json \
+        --formats "$ddb_formats" \
+        $opts \
+        --delete-duckdb-database \
+        -o ddb-results.json
+
+    cat ddb-results.json >> results.json  # append this engine's output to the combined file
+fi
+
+# Lance-bench only runs for local benchmarks; it is skipped without error if its binary is absent.
+if ! $is_remote && [[ "$has_lance" == "true" ]] && [[ -f "target/release_debug/lance-bench" ]]; then
+    # shellcheck disable=SC2086
+    target/release_debug/lance-bench "$subcommand" \
+        -d gh-json \
+        $opts \
+        -o lance-results.json
+
+    cat lance-results.json >> results.json  # append this engine's output to the combined file
+fi
@@ -19,7 +19,6 @@ permissions:
   contents: read
   pull-requests: write  # for commenting on PRs
   id-token: write  # enables AWS-GitHub OIDC
-  deployments: write  # for Polar Signals profiling
 
 jobs:
   label_trigger:
@@ -45,9 +44,9 @@ jobs:
     strategy:
       matrix:
         benchmark:
-          - id: random_access
+          - id: random-access-bench
             name: Random Access
-          - id: compress
+          - id: compress-bench
             name: Compression
     if: ${{ contains(github.event.head_commit.message, '[benchmark]') || github.event.label.name == 'benchmark' && github.event_name == 'pull_request' }}
     steps:
@@ -73,7 +72,7 @@ jobs:
         env:
           RUSTFLAGS: "-C target-cpu=native -C force-frame-pointers=yes"
         run: |
-          cargo build --bin ${{ matrix.benchmark.id }} --package bench-vortex --profile release_debug
+          cargo build --package ${{ matrix.benchmark.id }} --profile release_debug
 
       - name: Setup Polar Signals
         if: github.event.pull_request.head.repo.fork == false
@@ -90,7 +89,7 @@ jobs:
         env:
           RUST_BACKTRACE: full
         run: |
-          target/release_debug/${{ matrix.benchmark.id }} -d gh-json -o ${{ matrix.benchmark.id }}.json
+          target/release_debug/${{ matrix.benchmark.id }} -d gh-json -o results.json
 
       - name: Setup AWS CLI
         if: github.event.pull_request.head.repo.fork == false
@@ -124,7 +123,7 @@ jobs:
 
           echo '# Benchmarks: ${{ matrix.benchmark.name }}' > comment.md
           echo '' >> comment.md
-          uv run --no-project scripts/compare-benchmark-jsons.py base.json ${{ matrix.benchmark.id }}.json "${{ matrix.benchmark.name }}" \
+          uv run --no-project scripts/compare-benchmark-jsons.py base.json results.json "${{ matrix.benchmark.name }}" \
             >> comment.md
 
       - name: Comment PR
 
@@ -10,7 +10,6 @@ permissions:
   id-token: write  # enables AWS-GitHub OIDC
   actions: read
   contents: write
-  deployments: write
 
 jobs:
   commit-metadata:
@@ -44,9 +43,9 @@ jobs:
     strategy:
       matrix:
         benchmark:
-          - id: random_access
+          - id: random-access-bench
             name: Random Access
-          - id: compress
+          - id: compress-bench
             name: Compression
     steps:
       - uses: runs-on/action@v2
@@ -67,9 +66,8 @@ jobs:
         shell: bash
         env:
           RUSTFLAGS: "-C target-cpu=native -C force-frame-pointers=yes"
-        # The main difference between this and `bench-pr.yml` is that we add the `lance` feature.
         run: |
-          cargo build --bin ${{ matrix.benchmark.id }} --package bench-vortex --profile release_debug --features lance
+          cargo build --bin ${{ matrix.benchmark.id }} --profile release_debug --features lance
 
       - name: Setup Polar Signals
         uses: polarsignals/[email protected]
@@ -85,7 +83,7 @@ jobs:
         env:
           RUST_BACKTRACE: full
         run: |
-          target/release_debug/${{ matrix.benchmark.id }} -d gh-json -o ${{ matrix.benchmark.id }}.json --formats parquet,lance,vortex
+          target/release_debug/${{ matrix.benchmark.id }} --formats parquet,lance,vortex -d gh-json -o results.json
 
       - name: Setup AWS CLI
         uses: aws-actions/configure-aws-credentials@v5
@@ -96,7 +94,8 @@ jobs:
       - name: Upload Benchmark Results
         shell: bash
         run: |
-          bash scripts/cat-s3.sh vortex-benchmark-results-database data.json.gz ${{ matrix.benchmark.id }}.json
+          bash scripts/cat-s3.sh vortex-benchmark-results-database data.json.gz results.json
+
   sql:
     uses: ./.github/workflows/sql-benchmarks.yml
     secrets: inherit
@@ -109,73 +108,69 @@ jobs:
             "subcommand": "clickbench",
             "name": "Clickbench on NVME",
             "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb",
-            "build_args": "--features lance"
+            "build_lance": true
           },
           {
             "id": "tpch-nvme",
             "subcommand": "tpch",
             "name": "TPC-H SF=1 on NVME",
             "targets": "datafusion:arrow,datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb",
-            "scale_factor": "--scale-factor 1.0",
-            "build_args": "--features lance"
+            "scale_factor": "1.0",
+            "build_lance": true
           },
           {
             "id": "tpch-s3",
             "subcommand": "tpch",
             "name": "TPC-H SF=1 on S3",
-            "local_dir": "bench-vortex/data/tpch/1.0",
+            "local_dir": "vortex-bench/data/tpch/1.0",
             "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/1.0/",
-            "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
-            "scale_factor": "--scale-factor 1.0",
-            "build_args": "--features lance"
+            "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
+            "scale_factor": "1.0"
           },
           {
             "id": "tpch-nvme-10",
             "subcommand": "tpch",
             "name": "TPC-H SF=10 on NVME",
             "targets": "datafusion:arrow,datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb",
-            "scale_factor": "--scale-factor 10.0",
-            "build_args": "--features lance"
+            "scale_factor": "10.0",
+            "build_lance": true
           },
           {
             "id": "tpch-s3-10",
             "subcommand": "tpch",
             "name": "TPC-H SF=10 on S3",
-            "local_dir": "bench-vortex/data/tpch/10.0",
+            "local_dir": "vortex-bench/data/tpch/10.0",
             "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/",
-            "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
-            "scale_factor": "--scale-factor 10.0",
-            "build_args": "--features lance"
+            "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
+            "scale_factor": "10.0"
           },
           {
             "id": "tpcds-nvme",
             "subcommand": "tpcds",
             "name": "TPC-DS SF=1 on NVME",
             "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb",
-            "scale_factor": "--scale-factor 1.0"
+            "scale_factor": "1.0"
           },
           {
             "id": "statpopgen",
             "subcommand": "statpopgen",
             "name": "Statistical and Population Genetics",
-            "local_dir": "bench-vortex/data/statpopgen",
+            "local_dir": "vortex-bench/data/statpopgen",
             "targets": "duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
-            "scale_factor": "--scale-factor 100"
+            "scale_factor": "100"
           },
           {
             "id": "fineweb",
             "subcommand": "fineweb",
             "name": "FineWeb NVMe",
-            "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
-            "scale_factor": "--scale-factor 100"
+            "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact"
           },
           {
             "id": "fineweb-s3",
             "subcommand": "fineweb",
             "name": "FineWeb S3",
-            "local_dir": "bench-vortex/data/fineweb",
+            "local_dir": "vortex-bench/data/fineweb",
             "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/fineweb/",
-            "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
-            "scale_factor": "--scale-factor 100"
+            "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact"
           },
         ]
@@ -399,7 +399,8 @@ jobs:
         if: ${{ matrix.suite == 'tpc-h' }}
         # We use i2 to ensure that restarting the duckdb connection succeeds
         run: |
-          cargo run --bin query_bench -- tpch -i2 --targets "datafusion:vortex,datafusion:vortex-compact,duckdb:vortex,duckdb:vortex-compact" --scale-factor 0.1
+          cargo run --bin datafusion-bench -- tpch -i2 --formats "vortex,vortex-compact" --opt scale-factor=0.1
+          cargo run --bin duckdb-bench -- tpch -i2 --formats "vortex,vortex-compact" --opt scale-factor=0.1
       - name: Run FFI Example
         if: ${{ matrix.suite == 'ffi' }}
         run: |
@@ -411,12 +412,12 @@ jobs:
         run: |
           grcov . --binary-path target/debug/ -s . -t lcov --llvm --ignore-not-existing \
             --threads $(nproc) \
-            --ignore '../*' --ignore '/*' --ignore 'fuzz/*' --ignore 'bench-vortex/*' \
+            --ignore '../*' --ignore '/*' --ignore 'fuzz/*' --ignore 'vortex-bench/*' \
             --ignore 'home/*' --ignore 'xtask/*' --ignore 'target/*' --ignore 'vortex-error/*' \
             --ignore 'vortex-python/*' --ignore 'vortex-jni/*' --ignore 'vortex-flatbuffers/*' \
             --ignore 'vortex-proto/*' --ignore 'vortex-tui/*' --ignore 'vortex-datafusion/examples/*' \
             --ignore 'vortex-ffi/examples/*' --ignore '*/arbitrary/*' --ignore '*/arbitrary.rs' --ignore 'vortex-cxx/*' \
-            --ignore 'vortex-gpu/*' \
+            --ignore 'vortex-gpu/*' --ignore 'benchmarks/*' \
             -o ${{ env.GRCOV_OUTPUT_FILE }}
       - name: Codecov
         uses: codecov/codecov-action@v5
@@ -528,10 +529,11 @@ jobs:
           tool: nextest
       - name: Rust Tests (Windows)
         if: matrix.os == 'windows-x64'
-        run: cargo nextest run --locked --workspace --all-features --no-fail-fast --exclude bench-vortex --exclude vortex-python --exclude vortex-duckdb --exclude vortex-fuzz
+        run: |
+          cargo nextest run --locked --workspace --all-features --no-fail-fast --exclude vortex-bench --exclude vortex-python --exclude vortex-duckdb --exclude vortex-fuzz --exclude duckdb-bench --exclude lance-bench --exclude datafusion-bench --exclude random-access-bench --exclude compress-bench
       - name: Rust Tests (Other)
         if: matrix.os != 'windows-x64'
-        run: cargo nextest run --locked --workspace --all-features --no-fail-fast --exclude bench-vortex --exclude vortex-duckdb
+        run: cargo nextest run --locked --workspace --all-features --no-fail-fast --exclude vortex-bench --exclude vortex-duckdb
 
   build-java:
     name: "Java"
@@ -593,7 +595,7 @@ jobs:
           RUSTFLAGS: "-C target-feature=+avx2 -C debug-assertions=yes"
         run: cargo codspeed build ${{ matrix.features }} $(printf -- '-p %s ' ${{ matrix.packages }}) --profile bench
       - name: Run benchmarks
-        uses: CodSpeedHQ/action@346a2d8a8d9d38909abd0bc3d23f773110f076ad
+        uses: CodSpeedHQ/action@972e3437949c89e1357ebd1a2dbc852fcbc57245
         with:
           run: cargo codspeed run
           token: ${{ secrets.CODSPEED_TOKEN }}
 
@@ -51,6 +51,8 @@ jobs:
           path: docs/_build/html
 
   deploy:
+    permissions:
+      deployments: write
     environment:
       name: github-pages
       url: ${{ steps.deployment.outputs.page_url }}