Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,8 @@ jobs:
"name": "TPC-H SF=1 on S3",
"local_dir": "vortex-bench/data/tpch/1.0",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/1.0/",
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
"scale_factor": "1.0",
"build_lance": true
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
"scale_factor": "1.0"
},
{
"id": "tpch-nvme-10",
Expand All @@ -142,9 +141,8 @@ jobs:
"name": "TPC-H SF=10 on S3",
"local_dir": "vortex-bench/data/tpch/10.0",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/",
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
"scale_factor": "10.0",
"build_lance": true
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
"scale_factor": "10.0"
},
{
"id": "tpcds-nvme",
Expand Down
9 changes: 4 additions & 5 deletions .github/workflows/nightly-bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,25 +31,24 @@ jobs:
"subcommand": "clickbench",
"name": "Clickbench on NVME",
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb",
"build_args": "--features lance"
"build_lance": true
},
{
"id": "tpch-nvme",
"subcommand": "tpch",
"name": "TPC-H on NVME",
"targets": "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:duckdb",
"scale_factor": "10.0",
"build_args": "--features lance"
"build_lance": true
},
{
"id": "tpch-s3",
"subcommand": "tpch",
"name": "TPC-H on S3",
"local_dir": "vortex-bench/data/tpch/10.0",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/",
"targets": "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet,duckdb:vortex",
"scale_factor": "10.0",
"build_args": "--features lance"
"targets": "datafusion:parquet,datafusion:vortex,duckdb:parquet,duckdb:vortex",
"scale_factor": "10.0"
},
{
"id": "tpch-nvme",
Expand Down
28 changes: 23 additions & 5 deletions .github/workflows/sql-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -190,9 +190,14 @@ jobs:
OTEL_EXPORTER_OTLP_HEADERS: "${{ (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false) && secrets.OTEL_EXPORTER_OTLP_HEADERS || '' }}"
OTEL_RESOURCE_ATTRIBUTES: "bench-name=${{ matrix.id }}"
run: |
# Extract formats for each engine (filter out lance from df_formats)
df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | (grep '^datafusion:' | grep -v ':lance$' || true) | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | (grep '^duckdb:' || true) | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
# Extract formats for each engine from the targets string.
# Example input: "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet"
#
# Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas

# Lance is filtered out of df_formats because it uses a separate binary (lance-bench).
df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^datafusion:' | grep -v ':lance$' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^duckdb:' | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
has_lance=$(echo "${{ matrix.targets }}" | grep -q 'datafusion:lance' && echo "true" || echo "false")

# Build options string if scale_factor is set
Expand Down Expand Up @@ -247,9 +252,22 @@ jobs:
OTEL_EXPORTER_OTLP_HEADERS: "${{ (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false) && secrets.OTEL_EXPORTER_OTLP_HEADERS || '' }}"
OTEL_RESOURCE_ATTRIBUTES: "bench-name=${{ matrix.id }}"
run: |
# Extract formats for each engine
ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^duckdb:' | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
# Lance on remote storage is not supported. The infrastructure to generate and upload
# lance files to S3 does not exist. If you need lance on S3, you must first implement:
# 1. Lance data generation in data-gen (or a separate step)
# 2. Lance data upload to S3 before this step runs
if echo "${{ matrix.targets }}" | grep -q 'lance'; then
echo "ERROR: Lance format is not supported for remote storage benchmarks."
echo "Remove 'datafusion:lance' from targets for benchmark '${{ matrix.id }}'."
exit 1
fi

# Extract formats for each engine from the targets string.
# Example input: "datafusion:parquet,datafusion:vortex,duckdb:parquet"
#
# Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas
df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^datafusion:' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^duckdb:' | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')

# Build options string if scale_factor is set
opts="--opt remote-data-dir=${{ matrix.remote_storage }}"
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/datafusion-bench/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ pub fn format_to_df_format(format: Format) -> Arc<dyn FileFormat> {
Format::OnDiskVortex | Format::VortexCompact => {
Arc::new(VortexFormat::new(SESSION.clone()))
}
_ => unimplemented!(),
Format::OnDiskDuckDB | Format::Lance => {
unimplemented!("Format {format} cannot be turned into a DataFusion `FileFormat`")
}
}
}
Loading