Skip to content

Commit d67c02c

Browse files
authored
Fix: CI sql script (#5838)
Fixes a issue from the previous PR where I removed the `|| true` by accident (comments explain), and also extracts and deduplicates the inlined bash scripts in the `sql-benchmarks.yml` file. --------- Signed-off-by: Connor Tsui <[email protected]>
1 parent a4368bf commit d67c02c

File tree

2 files changed

+134
-97
lines changed

2 files changed

+134
-97
lines changed

.github/scripts/run-sql-bench.sh

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
#!/bin/bash
2+
# SPDX-License-Identifier: Apache-2.0
3+
# SPDX-FileCopyrightText: Copyright the Vortex contributors
4+
#
5+
# Runs SQL benchmarks (datafusion-bench, duckdb-bench, lance-bench) for the given targets.
6+
# This script is used by the sql-benchmarks.yml workflow.
7+
#
8+
# Usage:
9+
# run-sql-bench.sh <subcommand> <targets> [options]
10+
#
11+
# Arguments:
12+
# subcommand The benchmark subcommand (e.g., tpch, clickbench, tpcds)
13+
# targets Comma-separated list of engine:format pairs
14+
# (e.g., "datafusion:parquet,datafusion:vortex,duckdb:parquet")
15+
#
16+
# Options:
17+
# --scale-factor <sf> Scale factor for the benchmark (e.g., 1.0, 10.0)
18+
# --remote-storage <url> Remote storage URL (e.g., s3://bucket/path/)
19+
# If provided, runs in remote mode (no lance support).
20+
# --benchmark-id <id> Benchmark ID for error messages (e.g., tpch-s3)
21+
22+
set -Eeu -o pipefail
23+
24+
subcommand="$1"
25+
targets="$2"
26+
shift 2
27+
28+
scale_factor=""
29+
remote_storage=""
30+
benchmark_id=""
31+
32+
while [[ $# -gt 0 ]]; do
33+
case "$1" in
34+
--scale-factor)
35+
scale_factor="$2"
36+
shift 2
37+
;;
38+
--remote-storage)
39+
remote_storage="$2"
40+
shift 2
41+
;;
42+
--benchmark-id)
43+
benchmark_id="$2"
44+
shift 2
45+
;;
46+
*)
47+
echo "Unknown option: $1" >&2
48+
exit 1
49+
;;
50+
esac
51+
done
52+
53+
is_remote=false
54+
if [[ -n "$remote_storage" ]]; then
55+
is_remote=true
56+
fi
57+
58+
# Lance on remote storage is not supported. The infrastructure to generate and upload lance files
59+
# to S3 does not exist. If you need lance on S3, you must first implement:
60+
# 1. Lance data generation in data-gen (or a separate step)
61+
# 2. Lance data upload to S3 before this step runs
62+
if $is_remote && echo "$targets" | grep -q 'lance'; then
63+
echo "ERROR: Lance format is not supported for remote storage benchmarks."
64+
echo "Remove 'datafusion:lance' from targets for benchmark '${benchmark_id:-unknown}'."
65+
exit 1
66+
fi
67+
68+
# Extract formats for each engine from the targets string.
69+
# Example input: "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet"
70+
#
71+
# Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas
72+
#
73+
# Lance is filtered out of df_formats because it uses a separate binary (lance-bench).
74+
#
75+
# The `|| true` is needed because some benchmarks don't use all engines (e.g., statpopgen only has
76+
# duckdb targets). grep returns exit code 1 when no matches are found.
77+
df_formats=$(echo "$targets" | tr ',' '\n' | (grep '^datafusion:' || true) | grep -v ':lance$' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
78+
ddb_formats=$(echo "$targets" | tr ',' '\n' | (grep '^duckdb:' || true) | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
79+
has_lance=$(echo "$targets" | grep -q 'datafusion:lance' && echo "true" || echo "false")
80+
81+
# Build options string.
82+
opts=""
83+
if $is_remote; then
84+
opts="--opt remote-data-dir=$remote_storage"
85+
fi
86+
if [[ -n "$scale_factor" ]]; then
87+
if [[ -n "$opts" ]]; then
88+
opts="--opt scale-factor=$scale_factor $opts"
89+
else
90+
opts="--opt scale-factor=$scale_factor"
91+
fi
92+
fi
93+
94+
touch results.json
95+
96+
if [[ -n "$df_formats" ]]; then
97+
# shellcheck disable=SC2086
98+
target/release_debug/datafusion-bench "$subcommand" \
99+
-d gh-json \
100+
--formats "$df_formats" \
101+
$opts \
102+
-o df-results.json
103+
104+
cat df-results.json >> results.json
105+
fi
106+
107+
if [[ -n "$ddb_formats" ]]; then
108+
# shellcheck disable=SC2086
109+
target/release_debug/duckdb-bench "$subcommand" \
110+
-d gh-json \
111+
--formats "$ddb_formats" \
112+
$opts \
113+
--delete-duckdb-database \
114+
-o ddb-results.json
115+
116+
cat ddb-results.json >> results.json
117+
fi
118+
119+
# Lance-bench only runs for local benchmarks.
120+
if ! $is_remote && [[ "$has_lance" == "true" ]] && [[ -f "target/release_debug/lance-bench" ]]; then
121+
# shellcheck disable=SC2086
122+
target/release_debug/lance-bench "$subcommand" \
123+
-d gh-json \
124+
$opts \
125+
-o lance-results.json
126+
127+
cat lance-results.json >> results.json
128+
fi

.github/workflows/sql-benchmarks.yml

Lines changed: 6 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -190,56 +190,8 @@ jobs:
190190
OTEL_EXPORTER_OTLP_HEADERS: "${{ (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false) && secrets.OTEL_EXPORTER_OTLP_HEADERS || '' }}"
191191
OTEL_RESOURCE_ATTRIBUTES: "bench-name=${{ matrix.id }}"
192192
run: |
193-
# Extract formats for each engine from the targets string.
194-
# Example input: "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet"
195-
#
196-
# Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas
197-
198-
# Lance is filtered out of df_formats because it uses a separate binary (lance-bench).
199-
df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^datafusion:' | grep -v ':lance$' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
200-
ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^duckdb:' | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
201-
has_lance=$(echo "${{ matrix.targets }}" | grep -q 'datafusion:lance' && echo "true" || echo "false")
202-
203-
# Build options string if scale_factor is set
204-
opts=""
205-
if [ -n "${{ matrix.scale_factor }}" ]; then
206-
opts="--opt scale-factor=${{ matrix.scale_factor }}"
207-
fi
208-
209-
touch results.json
210-
211-
# Run datafusion-bench
212-
if [ -n "$df_formats" ]; then
213-
target/release_debug/datafusion-bench ${{ matrix.subcommand }} \
214-
-d gh-json \
215-
--formats "$df_formats" \
216-
$opts \
217-
-o df-results.json
218-
219-
cat df-results.json >> results.json
220-
fi
221-
222-
# Run duckdb-bench
223-
if [ -n "$ddb_formats" ]; then
224-
target/release_debug/duckdb-bench ${{ matrix.subcommand }} \
225-
-d gh-json \
226-
--formats "$ddb_formats" \
227-
$opts \
228-
--delete-duckdb-database \
229-
-o ddb-results.json
230-
231-
cat ddb-results.json >> results.json
232-
fi
233-
234-
# Run lance-bench
235-
if [ "$has_lance" = "true" ] && [ -f "target/release_debug/lance-bench" ]; then
236-
target/release_debug/lance-bench ${{ matrix.subcommand }} \
237-
-d gh-json \
238-
$opts \
239-
-o lance-results.json
240-
241-
cat lance-results.json >> results.json
242-
fi
193+
.github/scripts/run-sql-bench.sh "${{ matrix.subcommand }}" "${{ matrix.targets }}" \
194+
${{ matrix.scale_factor && format('--scale-factor {0}', matrix.scale_factor) || '' }}
243195
244196
- name: Run ${{ matrix.name }} benchmark (remote)
245197
if: matrix.remote_storage != null && (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false)
@@ -252,53 +204,10 @@ jobs:
252204
OTEL_EXPORTER_OTLP_HEADERS: "${{ (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false) && secrets.OTEL_EXPORTER_OTLP_HEADERS || '' }}"
253205
OTEL_RESOURCE_ATTRIBUTES: "bench-name=${{ matrix.id }}"
254206
run: |
255-
# Lance on remote storage is not supported. The infrastructure to generate and upload
256-
# lance files to S3 does not exist. If you need lance on S3, you must first implement:
257-
# 1. Lance data generation in data-gen (or a separate step)
258-
# 2. Lance data upload to S3 before this step runs
259-
if echo "${{ matrix.targets }}" | grep -q 'lance'; then
260-
echo "ERROR: Lance format is not supported for remote storage benchmarks."
261-
echo "Remove 'datafusion:lance' from targets for benchmark '${{ matrix.id }}'."
262-
exit 1
263-
fi
264-
265-
# Extract formats for each engine from the targets string.
266-
# Example input: "datafusion:parquet,datafusion:vortex,duckdb:parquet"
267-
#
268-
# Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas
269-
df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^datafusion:' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
270-
ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^duckdb:' | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
271-
272-
# Build options string if scale_factor is set
273-
opts="--opt remote-data-dir=${{ matrix.remote_storage }}"
274-
if [ -n "${{ matrix.scale_factor }}" ]; then
275-
opts="--opt scale-factor=${{ matrix.scale_factor }} ${opts}"
276-
fi
277-
278-
touch results.json
279-
280-
# Run datafusion-bench with remote storage
281-
if [ -n "$df_formats" ]; then
282-
target/release_debug/datafusion-bench ${{ matrix.subcommand }} \
283-
-d gh-json \
284-
--formats "$df_formats" \
285-
$opts \
286-
-o df-results.json
287-
288-
cat df-results.json >> results.json
289-
fi
290-
291-
# Run duckdb-bench with remote storage
292-
if [ -n "$ddb_formats" ]; then
293-
target/release_debug/duckdb-bench ${{ matrix.subcommand }} \
294-
-d gh-json \
295-
--formats "$ddb_formats" \
296-
$opts \
297-
--delete-duckdb-database \
298-
-o ddb-results.json
299-
300-
cat ddb-results.json >> results.json
301-
fi
207+
.github/scripts/run-sql-bench.sh "${{ matrix.subcommand }}" "${{ matrix.targets }}" \
208+
--remote-storage "${{ matrix.remote_storage }}" \
209+
--benchmark-id "${{ matrix.id }}" \
210+
${{ matrix.scale_factor && format('--scale-factor {0}', matrix.scale_factor) || '' }}
302211
303212
- name: Install uv
304213
if: inputs.mode == 'pr'

0 commit comments

Comments
 (0)