Skip to content

Commit dc65ef8

Browse files
authored
Merge branch 'develop' into db/failing-stream
2 parents 475e3aa + da5d463 commit dc65ef8

File tree

423 files changed

+10971
-7795
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

423 files changed

+10971
-7795
lines changed

.github/scripts/run-sql-bench.sh

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright the Vortex contributors
#
# Runs SQL benchmarks (datafusion-bench, duckdb-bench, lance-bench) for the given targets.
# This script is used by the sql-benchmarks.yml workflow.
#
# Usage:
#   run-sql-bench.sh <subcommand> <targets> [options]
#
# Arguments:
#   subcommand  The benchmark subcommand (e.g., tpch, clickbench, tpcds)
#   targets     Comma-separated list of engine:format pairs
#               (e.g., "datafusion:parquet,datafusion:vortex,duckdb:parquet")
#
# Options:
#   --scale-factor <sf>     Scale factor for the benchmark (e.g., 1.0, 10.0)
#   --remote-storage <url>  Remote storage URL (e.g., s3://bucket/path/)
#                           If provided, runs in remote mode (no lance support).
#   --benchmark-id <id>     Benchmark ID for error messages (e.g., tpch-s3)
#
# Outputs (written to the current working directory):
#   results.json            Concatenation of the per-engine result files below.
#   df-results.json         Written when at least one non-lance datafusion target is given.
#   ddb-results.json        Written when at least one duckdb target is given.
#   lance-results.json      Written when datafusion:lance is a target (local mode only).
#
# Exit status:
#   Non-zero on usage errors, on lance targets combined with --remote-storage,
#   or when any benchmark binary fails.

set -Eeu -o pipefail

# Directory the benchmark binaries are run from (relative to the working directory).
readonly BIN_DIR="target/release_debug"

# Prints the command-line synopsis to stderr.
usage() {
  cat >&2 <<'EOF'
Usage: run-sql-bench.sh <subcommand> <targets> [options]

Options:
  --scale-factor <sf>     Scale factor for the benchmark (e.g., 1.0, 10.0)
  --remote-storage <url>  Remote storage URL (e.g., s3://bucket/path/)
  --benchmark-id <id>     Benchmark ID for error messages (e.g., tpch-s3)
EOF
}

# Prints all arguments as one message on stderr and exits with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Fails with a readable message when option "$1" has no value following it.
# Call as: require_value "$@" from inside the option-parsing loop.
require_value() {
  [[ $# -ge 2 ]] || die "Option $1 requires a value"
}

#######################################
# Prints the comma-joined formats requested for one engine.
# Arguments:
#   $1 - comma-separated engine:format targets
#   $2 - engine name (e.g., datafusion, duckdb)
# Outputs:
#   The formats for that engine joined by commas (nothing if there are none).
#   For datafusion, targets ending in ":lance" are left out because lance uses
#   a separate binary (lance-bench).
#######################################
formats_for_engine() {
  local all_targets_csv=$1
  local engine=$2
  local -a all_targets=()
  local -a formats=()
  local target

  IFS=',' read -r -a all_targets <<<"$all_targets_csv"

  # The ${arr[@]+...} form keeps empty arrays from tripping `set -u` on older bash.
  for target in ${all_targets[@]+"${all_targets[@]}"}; do
    [[ "$target" == "$engine":* ]] || continue
    if [[ "$engine" == "datafusion" && "$target" == *:lance ]]; then
      continue
    fi
    formats+=("${target#"$engine":}")
  done

  if [[ ${#formats[@]} -gt 0 ]]; then
    local IFS=','
    printf '%s' "${formats[*]}"
  fi
}

main() {
  if [[ $# -lt 2 ]]; then
    usage
    exit 2
  fi

  local subcommand=$1
  local targets=$2
  shift 2

  local scale_factor=""
  local remote_storage=""
  local benchmark_id=""

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --scale-factor)
        require_value "$@"
        scale_factor=$2
        shift 2
        ;;
      --remote-storage)
        require_value "$@"
        remote_storage=$2
        shift 2
        ;;
      --benchmark-id)
        require_value "$@"
        benchmark_id=$2
        shift 2
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done

  local is_remote=false
  if [[ -n "$remote_storage" ]]; then
    is_remote=true
  fi

  # Lance on remote storage is not supported. The infrastructure to generate and upload lance
  # files to S3 does not exist. If you need lance on S3, you must first implement:
  #   1. Lance data generation in data-gen (or a separate step)
  #   2. Lance data upload to S3 before this step runs
  # The match is deliberately broad: any mention of lance in the targets is rejected.
  if [[ "$is_remote" == "true" && "$targets" == *lance* ]]; then
    echo "ERROR: Lance format is not supported for remote storage benchmarks." >&2
    echo "Remove 'datafusion:lance' from targets for benchmark '${benchmark_id:-unknown}'." >&2
    exit 1
  fi

  # Extract the formats for each engine from the targets string.
  # Example input: "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet"
  # Some benchmarks don't use all engines (e.g., statpopgen only has duckdb targets), in which
  # case the corresponding list is empty and that engine is skipped below.
  local df_formats ddb_formats
  df_formats=$(formats_for_engine "$targets" datafusion)
  ddb_formats=$(formats_for_engine "$targets" duckdb)

  # Exact element match, consistent with the ":lance" filter applied to df_formats.
  local has_lance=false
  if [[ ",$targets," == *",datafusion:lance,"* ]]; then
    has_lance=true
  fi

  # Build the shared options as an array so values are passed through verbatim
  # (no word splitting or globbing). Scale factor comes first, then the remote data dir.
  local -a opts=()
  if [[ -n "$scale_factor" ]]; then
    opts+=(--opt "scale-factor=$scale_factor")
  fi
  if [[ "$is_remote" == "true" ]]; then
    opts+=(--opt "remote-data-dir=$remote_storage")
  fi

  # Make sure results.json exists even if no engine ends up running;
  # each engine's output is appended to it.
  touch results.json

  if [[ -n "$df_formats" ]]; then
    "$BIN_DIR/datafusion-bench" "$subcommand" \
      -d gh-json \
      --formats "$df_formats" \
      ${opts[@]+"${opts[@]}"} \
      -o df-results.json

    cat df-results.json >>results.json
  fi

  if [[ -n "$ddb_formats" ]]; then
    "$BIN_DIR/duckdb-bench" "$subcommand" \
      -d gh-json \
      --formats "$ddb_formats" \
      ${opts[@]+"${opts[@]}"} \
      --delete-duckdb-database \
      -o ddb-results.json

    cat ddb-results.json >>results.json
  fi

  # Lance-bench only runs for local benchmarks.
  if [[ "$is_remote" == "false" && "$has_lance" == "true" ]]; then
    if [[ -f "$BIN_DIR/lance-bench" ]]; then
      "$BIN_DIR/lance-bench" "$subcommand" \
        -d gh-json \
        ${opts[@]+"${opts[@]}"} \
        -o lance-results.json

      cat lance-results.json >>results.json
    else
      # Still best-effort (no failure), but no longer silent.
      echo "WARNING: datafusion:lance requested but $BIN_DIR/lance-bench was not found; skipping." >&2
    fi
  fi
}

main "$@"

.github/workflows/bench-pr.yml

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ permissions:
1919
contents: read
2020
pull-requests: write # for commenting on PRs
2121
id-token: write # enables AWS-GitHub OIDC
22-
deployments: write # for Polar Signals profiling
2322

2423
jobs:
2524
label_trigger:
@@ -45,9 +44,9 @@ jobs:
4544
strategy:
4645
matrix:
4746
benchmark:
48-
- id: random_access
47+
- id: random-access-bench
4948
name: Random Access
50-
- id: compress
49+
- id: compress-bench
5150
name: Compression
5251
if: ${{ contains(github.event.head_commit.message, '[benchmark]') || github.event.label.name == 'benchmark' && github.event_name == 'pull_request' }}
5352
steps:
@@ -73,7 +72,7 @@ jobs:
7372
env:
7473
RUSTFLAGS: "-C target-cpu=native -C force-frame-pointers=yes"
7574
run: |
76-
cargo build --bin ${{ matrix.benchmark.id }} --package bench-vortex --profile release_debug
75+
cargo build --package ${{ matrix.benchmark.id }} --profile release_debug
7776
7877
- name: Setup Polar Signals
7978
if: github.event.pull_request.head.repo.fork == false
@@ -90,7 +89,7 @@ jobs:
9089
env:
9190
RUST_BACKTRACE: full
9291
run: |
93-
target/release_debug/${{ matrix.benchmark.id }} -d gh-json -o ${{ matrix.benchmark.id }}.json
92+
target/release_debug/${{ matrix.benchmark.id }} -d gh-json -o results.json
9493
9594
- name: Setup AWS CLI
9695
if: github.event.pull_request.head.repo.fork == false
@@ -124,7 +123,7 @@ jobs:
124123
125124
echo '# Benchmarks: ${{ matrix.benchmark.name }}' > comment.md
126125
echo '' >> comment.md
127-
uv run --no-project scripts/compare-benchmark-jsons.py base.json ${{ matrix.benchmark.id }}.json "${{ matrix.benchmark.name }}" \
126+
uv run --no-project scripts/compare-benchmark-jsons.py base.json results.json "${{ matrix.benchmark.name }}" \
128127
>> comment.md
129128
130129
- name: Comment PR

.github/workflows/bench.yml

Lines changed: 23 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ permissions:
1010
id-token: write # enables AWS-GitHub OIDC
1111
actions: read
1212
contents: write
13-
deployments: write
1413

1514
jobs:
1615
commit-metadata:
@@ -44,9 +43,9 @@ jobs:
4443
strategy:
4544
matrix:
4645
benchmark:
47-
- id: random_access
46+
- id: random-access-bench
4847
name: Random Access
49-
- id: compress
48+
- id: compress-bench
5049
name: Compression
5150
steps:
5251
- uses: runs-on/action@v2
@@ -67,9 +66,8 @@ jobs:
6766
shell: bash
6867
env:
6968
RUSTFLAGS: "-C target-cpu=native -C force-frame-pointers=yes"
70-
# The main difference between this and `bench-pr.yml` is that we add the `lance` feature.
7169
run: |
72-
cargo build --bin ${{ matrix.benchmark.id }} --package bench-vortex --profile release_debug --features lance
70+
cargo build --bin ${{ matrix.benchmark.id }} --profile release_debug --features lance
7371
7472
- name: Setup Polar Signals
7573
uses: polarsignals/[email protected]
@@ -85,7 +83,7 @@ jobs:
8583
env:
8684
RUST_BACKTRACE: full
8785
run: |
88-
target/release_debug/${{ matrix.benchmark.id }} -d gh-json -o ${{ matrix.benchmark.id }}.json --formats parquet,lance,vortex
86+
target/release_debug/${{ matrix.benchmark.id }} --formats parquet,lance,vortex -d gh-json -o results.json
8987
9088
- name: Setup AWS CLI
9189
uses: aws-actions/configure-aws-credentials@v5
@@ -96,7 +94,8 @@ jobs:
9694
- name: Upload Benchmark Results
9795
shell: bash
9896
run: |
99-
bash scripts/cat-s3.sh vortex-benchmark-results-database data.json.gz ${{ matrix.benchmark.id }}.json
97+
bash scripts/cat-s3.sh vortex-benchmark-results-database data.json.gz results.json
98+
10099
sql:
101100
uses: ./.github/workflows/sql-benchmarks.yml
102101
secrets: inherit
@@ -109,73 +108,69 @@ jobs:
109108
"subcommand": "clickbench",
110109
"name": "Clickbench on NVME",
111110
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb",
112-
"build_args": "--features lance"
111+
"build_lance": true
113112
},
114113
{
115114
"id": "tpch-nvme",
116115
"subcommand": "tpch",
117116
"name": "TPC-H SF=1 on NVME",
118117
"targets": "datafusion:arrow,datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb",
119-
"scale_factor": "--scale-factor 1.0",
120-
"build_args": "--features lance"
118+
"scale_factor": "1.0",
119+
"build_lance": true
121120
},
122121
{
123122
"id": "tpch-s3",
124123
"subcommand": "tpch",
125124
"name": "TPC-H SF=1 on S3",
126-
"local_dir": "bench-vortex/data/tpch/1.0",
125+
"local_dir": "vortex-bench/data/tpch/1.0",
127126
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/1.0/",
128-
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
129-
"scale_factor": "--scale-factor 1.0",
130-
"build_args": "--features lance"
127+
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
128+
"scale_factor": "1.0"
131129
},
132130
{
133131
"id": "tpch-nvme-10",
134132
"subcommand": "tpch",
135133
"name": "TPC-H SF=10 on NVME",
136134
"targets": "datafusion:arrow,datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb",
137-
"scale_factor": "--scale-factor 10.0",
138-
"build_args": "--features lance"
135+
"scale_factor": "10.0",
136+
"build_lance": true
139137
},
140138
{
141139
"id": "tpch-s3-10",
142140
"subcommand": "tpch",
143141
"name": "TPC-H SF=10 on S3",
144-
"local_dir": "bench-vortex/data/tpch/10.0",
142+
"local_dir": "vortex-bench/data/tpch/10.0",
145143
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/",
146-
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
147-
"scale_factor": "--scale-factor 10.0",
148-
"build_args": "--features lance"
144+
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
145+
"scale_factor": "10.0"
149146
},
150147
{
151148
"id": "tpcds-nvme",
152149
"subcommand": "tpcds",
153150
"name": "TPC-DS SF=1 on NVME",
154151
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb",
155-
"scale_factor": "--scale-factor 1.0"
152+
"scale_factor": "1.0"
156153
},
157154
{
158155
"id": "statpopgen",
159156
"subcommand": "statpopgen",
160157
"name": "Statistical and Population Genetics",
161-
"local_dir": "bench-vortex/data/statpopgen",
158+
"local_dir": "vortex-bench/data/statpopgen",
162159
"targets": "duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
163-
"scale_factor": "--scale-factor 100"
160+
"scale_factor": "100"
164161
},
165162
{
166163
"id": "fineweb",
167164
"subcommand": "fineweb",
168165
"name": "FineWeb NVMe",
169-
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
170-
"scale_factor": "--scale-factor 100"
166+
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact"
171167
},
172168
{
173169
"id": "fineweb-s3",
174170
"subcommand": "fineweb",
175171
"name": "FineWeb S3",
176-
"local_dir": "bench-vortex/data/fineweb",
172+
"local_dir": "vortex-bench/data/fineweb",
177173
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/fineweb/",
178-
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
179-
"scale_factor": "--scale-factor 100"
174+
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact"
180175
},
181176
]

.github/workflows/ci.yml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,8 @@ jobs:
399399
if: ${{ matrix.suite == 'tpc-h' }}
400400
# We use i2 to ensure that restarting the duckdb connection succeeds
401401
run: |
402-
cargo run --bin query_bench -- tpch -i2 --targets "datafusion:vortex,datafusion:vortex-compact,duckdb:vortex,duckdb:vortex-compact" --scale-factor 0.1
402+
cargo run --bin datafusion-bench -- tpch -i2 --formats "vortex,vortex-compact" --opt scale-factor=0.1
403+
cargo run --bin duckdb-bench -- tpch -i2 --formats "vortex,vortex-compact" --opt scale-factor=0.1
403404
- name: Run FFI Example
404405
if: ${{ matrix.suite == 'ffi' }}
405406
run: |
@@ -411,12 +412,12 @@ jobs:
411412
run: |
412413
grcov . --binary-path target/debug/ -s . -t lcov --llvm --ignore-not-existing \
413414
--threads $(nproc) \
414-
--ignore '../*' --ignore '/*' --ignore 'fuzz/*' --ignore 'bench-vortex/*' \
415+
--ignore '../*' --ignore '/*' --ignore 'fuzz/*' --ignore 'vortex-bench/*' \
415416
--ignore 'home/*' --ignore 'xtask/*' --ignore 'target/*' --ignore 'vortex-error/*' \
416417
--ignore 'vortex-python/*' --ignore 'vortex-jni/*' --ignore 'vortex-flatbuffers/*' \
417418
--ignore 'vortex-proto/*' --ignore 'vortex-tui/*' --ignore 'vortex-datafusion/examples/*' \
418419
--ignore 'vortex-ffi/examples/*' --ignore '*/arbitrary/*' --ignore '*/arbitrary.rs' --ignore 'vortex-cxx/*' \
419-
--ignore 'vortex-gpu/*' \
420+
--ignore 'vortex-gpu/*' --ignore 'benchmarks/*' \
420421
-o ${{ env.GRCOV_OUTPUT_FILE }}
421422
- name: Codecov
422423
uses: codecov/codecov-action@v5
@@ -528,10 +529,11 @@ jobs:
528529
tool: nextest
529530
- name: Rust Tests (Windows)
530531
if: matrix.os == 'windows-x64'
531-
run: cargo nextest run --locked --workspace --all-features --no-fail-fast --exclude bench-vortex --exclude vortex-python --exclude vortex-duckdb --exclude vortex-fuzz
532+
run: |
533+
cargo nextest run --locked --workspace --all-features --no-fail-fast --exclude vortex-bench --exclude vortex-python --exclude vortex-duckdb --exclude vortex-fuzz --exclude duckdb-bench --exclude lance-bench --exclude datafusion-bench --exclude random-access-bench --exclude compress-bench
532534
- name: Rust Tests (Other)
533535
if: matrix.os != 'windows-x64'
534-
run: cargo nextest run --locked --workspace --all-features --no-fail-fast --exclude bench-vortex --exclude vortex-duckdb
536+
run: cargo nextest run --locked --workspace --all-features --no-fail-fast --exclude vortex-bench --exclude vortex-duckdb
535537

536538
build-java:
537539
name: "Java"
@@ -593,7 +595,7 @@ jobs:
593595
RUSTFLAGS: "-C target-feature=+avx2 -C debug-assertions=yes"
594596
run: cargo codspeed build ${{ matrix.features }} $(printf -- '-p %s ' ${{ matrix.packages }}) --profile bench
595597
- name: Run benchmarks
596-
uses: CodSpeedHQ/action@346a2d8a8d9d38909abd0bc3d23f773110f076ad
598+
uses: CodSpeedHQ/action@972e3437949c89e1357ebd1a2dbc852fcbc57245
597599
with:
598600
run: cargo codspeed run
599601
token: ${{ secrets.CODSPEED_TOKEN }}

.github/workflows/docs.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ jobs:
5151
path: docs/_build/html
5252

5353
deploy:
54+
permissions:
55+
deployments: write
5456
environment:
5557
name: github-pages
5658
url: ${{ steps.deployment.outputs.page_url }}

0 commit comments

Comments
 (0)