Skip to content

Commit a4368bf

Browse files
authored
Fix: CI SQL Remote (#5836)
Removes lance from the SQL remote benchmarks on S3 and cleans up some commands. Signed-off-by: Connor Tsui <[email protected]>
1 parent dde9c0b commit a4368bf

File tree

4 files changed

+34
-17
lines changed

4 files changed

+34
-17
lines changed

.github/workflows/bench.yml

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -124,9 +124,8 @@ jobs:
124124
"name": "TPC-H SF=1 on S3",
125125
"local_dir": "vortex-bench/data/tpch/1.0",
126126
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/1.0/",
127-
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
128-
"scale_factor": "1.0",
129-
"build_lance": true
127+
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
128+
"scale_factor": "1.0"
130129
},
131130
{
132131
"id": "tpch-nvme-10",
@@ -142,9 +141,8 @@ jobs:
142141
"name": "TPC-H SF=10 on S3",
143142
"local_dir": "vortex-bench/data/tpch/10.0",
144143
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/",
145-
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
146-
"scale_factor": "10.0",
147-
"build_lance": true
144+
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
145+
"scale_factor": "10.0"
148146
},
149147
{
150148
"id": "tpcds-nvme",

.github/workflows/nightly-bench.yml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,25 +31,24 @@ jobs:
3131
"subcommand": "clickbench",
3232
"name": "Clickbench on NVME",
3333
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb",
34-
"build_args": "--features lance"
34+
"build_lance": true
3535
},
3636
{
3737
"id": "tpch-nvme",
3838
"subcommand": "tpch",
3939
"name": "TPC-H on NVME",
4040
"targets": "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:duckdb",
4141
"scale_factor": "10.0",
42-
"build_args": "--features lance"
42+
"build_lance": true
4343
},
4444
{
4545
"id": "tpch-s3",
4646
"subcommand": "tpch",
4747
"name": "TPC-H on S3",
4848
"local_dir": "vortex-bench/data/tpch/10.0",
4949
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/",
50-
"targets": "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet,duckdb:vortex",
51-
"scale_factor": "10.0",
52-
"build_args": "--features lance"
50+
"targets": "datafusion:parquet,datafusion:vortex,duckdb:parquet,duckdb:vortex",
51+
"scale_factor": "10.0"
5352
},
5453
{
5554
"id": "tpch-nvme",

.github/workflows/sql-benchmarks.yml

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -190,9 +190,14 @@ jobs:
190190
OTEL_EXPORTER_OTLP_HEADERS: "${{ (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false) && secrets.OTEL_EXPORTER_OTLP_HEADERS || '' }}"
191191
OTEL_RESOURCE_ATTRIBUTES: "bench-name=${{ matrix.id }}"
192192
run: |
193-
# Extract formats for each engine (filter out lance from df_formats)
194-
df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | (grep '^datafusion:' | grep -v ':lance$' || true) | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
195-
ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | (grep '^duckdb:' || true) | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
193+
# Extract formats for each engine from the targets string.
194+
# Example input: "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet"
195+
#
196+
# Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas
197+
198+
# Lance is filtered out of df_formats because it uses a separate binary (lance-bench).
199+
df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^datafusion:' | grep -v ':lance$' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
200+
ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^duckdb:' | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
196201
has_lance=$(echo "${{ matrix.targets }}" | grep -q 'datafusion:lance' && echo "true" || echo "false")
197202
198203
# Build options string if scale_factor is set
@@ -247,9 +252,22 @@ jobs:
247252
OTEL_EXPORTER_OTLP_HEADERS: "${{ (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false) && secrets.OTEL_EXPORTER_OTLP_HEADERS || '' }}"
248253
OTEL_RESOURCE_ATTRIBUTES: "bench-name=${{ matrix.id }}"
249254
run: |
250-
# Extract formats for each engine
251-
ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^duckdb:' | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
255+
# Lance on remote storage is not supported. The infrastructure to generate and upload
256+
# lance files to S3 does not exist. If you need lance on S3, you must first implement:
257+
# 1. Lance data generation in data-gen (or a separate step)
258+
# 2. Lance data upload to S3 before this step runs
259+
if echo "${{ matrix.targets }}" | grep -q 'lance'; then
260+
echo "ERROR: Lance format is not supported for remote storage benchmarks."
261+
echo "Remove 'datafusion:lance' from targets for benchmark '${{ matrix.id }}'."
262+
exit 1
263+
fi
264+
265+
# Extract formats for each engine from the targets string.
266+
# Example input: "datafusion:parquet,datafusion:vortex,duckdb:parquet"
267+
#
268+
# Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas
252269
df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^datafusion:' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
270+
ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^duckdb:' | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
253271
254272
# Build options string if scale_factor is set
255273
opts="--opt remote-data-dir=${{ matrix.remote_storage }}"

benchmarks/datafusion-bench/src/lib.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,8 @@ pub fn format_to_df_format(format: Format) -> Arc<dyn FileFormat> {
107107
Format::OnDiskVortex | Format::VortexCompact => {
108108
Arc::new(VortexFormat::new(SESSION.clone()))
109109
}
110-
_ => unimplemented!(),
110+
Format::OnDiskDuckDB | Format::Lance => {
111+
unimplemented!("Format {format} cannot be turned into a DataFusion `FileFormat`")
112+
}
111113
}
112114
}

0 commit comments

Comments
 (0)