Skip to content

Commit e79b230

Browse files
authored
duckdb: switch to v2 extension for clickbench benchmark (#3585)
Signed-off-by: Alexander Droste <[email protected]>
1 parent 403729e commit e79b230

File tree

15 files changed

+301
-93
lines changed

15 files changed

+301
-93
lines changed

.cargo/config.toml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
1-
[target.'(target_familiy="unix")']
2-
rustflags = ["-C", "force-frame-pointers=yes"]
1+
[target.'cfg(target_family="unix")']
2+
rustflags = [
3+
"-C", "force-frame-pointers=yes",
4+
# Add dynamic DuckDB library directory to runtime search path.
5+
"-C", "link-arg=-Wl,-rpath,$ORIGIN/../duckdb-v1.3.0",
6+
"-C", "link-arg=-Wl,-rpath,@executable_path/../duckdb-v1.3.0"
7+
]
8+
39
[target.wasm32-unknown-unknown]
410
rustflags = ['--cfg', 'getrandom_backend="wasm_js"', '-C', 'target-feature=+atomics']
511

.github/workflows/sql-benchmarks.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ jobs:
140140
shell: bash
141141
env:
142142
RUST_BACKTRACE: full
143+
LD_LIBRARY_PATH: target/duckdb-v1.3.0
143144
run: |
144145
# Generate data, running each query once to make sure they don't panic.
145146
target/release_debug/${{ matrix.binary_name }} --targets datafusion:parquet -i1 -d gh-json --skip-duckdb-build
@@ -171,6 +172,7 @@ jobs:
171172
OTEL_EXPORTER_OTLP_ENDPOINT: '${{ secrets.OTEL_EXPORTER_OTLP_ENDPOINT }}'
172173
OTEL_EXPORTER_OTLP_HEADERS: '${{ secrets.OTEL_EXPORTER_OTLP_HEADERS }}'
173174
OTEL_RESOURCE_ATTRIBUTES: 'bench-name=${{ matrix.id }}'
175+
LD_LIBRARY_PATH: target/duckdb-v1.3.0
174176
run: |
175177
target/release_debug/${{ matrix.binary_name }} \
176178
-d gh-json \
@@ -189,6 +191,7 @@ jobs:
189191
OTEL_EXPORTER_OTLP_ENDPOINT: '${{ secrets.OTEL_EXPORTER_OTLP_ENDPOINT }}'
190192
OTEL_EXPORTER_OTLP_HEADERS: '${{ secrets.OTEL_EXPORTER_OTLP_HEADERS }}'
191193
OTEL_RESOURCE_ATTRIBUTES: 'bench-name=${{ matrix.id }}'
194+
LD_LIBRARY_PATH: target/duckdb-v1.3.0
192195
run: |
193196
target/release_debug/${{ matrix.binary_name }} \
194197
--use-remote-data-dir ${{ matrix.remote_storage }} \

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,7 @@ vortex-zstd = { version = "0.1.0", path = "./encodings/zstd", default-features =
213213
# END crates published by this project
214214

215215
# No version constraints for unpublished crates.
216+
vortex-duckdb-ext = { path = "./vortex-duckdb-ext", default-features = false }
216217
vortex-duckdb = { path = "./vortex-duckdb", default-features = false }
217218
vortex-ffi = { path = "./vortex-ffi", default-features = false }
218219

@@ -275,7 +276,8 @@ use_debug = "deny"
275276

276277
[profile.release]
277278
codegen-units = 1
278-
lto = "thin" # attempts to perform optimizations across all crates within the dependency graph
279+
# Turn LTO off, as it breaks when vortex-duckdb-ext is linked.
280+
lto = "off"
279281

280282
[profile.release_debug]
281283
debug = "full"

bench-vortex/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ url = { workspace = true }
6464
uuid = { workspace = true, features = ["v4"] }
6565
vortex = { workspace = true, features = ["object_store", "parquet", "files"] }
6666
vortex-datafusion = { workspace = true }
67+
vortex-duckdb-ext = { workspace = true }
6768
xshell = { workspace = true }
6869

6970
[features]

bench-vortex/src/bin/clickbench.rs

Lines changed: 14 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@ use std::sync::Arc;
44
use std::time::{Duration, Instant};
55

66
use bench_vortex::clickbench::{Flavor, clickbench_queries};
7-
use bench_vortex::ddb::{DuckDBExecutor, register_tables};
87
use bench_vortex::display::{DisplayFormat, print_measurements_json, render_table};
8+
use bench_vortex::engines::ddb2;
99
use bench_vortex::measurements::QueryMeasurement;
1010
use bench_vortex::metrics::{MetricsSetExt, export_plan_spans};
1111
use bench_vortex::utils::constants::{CLICKBENCH_DATASET, STORAGE_NVME};
1212
use bench_vortex::utils::new_tokio_runtime;
1313
use bench_vortex::{
14-
BenchmarkDataset, Engine, Format, IdempotentPath, Target, ddb, default_env_filter, df,
14+
BenchmarkDataset, Engine, Format, IdempotentPath, Target, default_env_filter, df,
1515
};
1616
use clap::{Parser, value_parser};
1717
use datafusion::prelude;
@@ -86,21 +86,9 @@ struct DataFusionCtx {
8686
emit_plan: bool,
8787
}
8888

89-
struct DuckDBCtx {
90-
duckdb_path: PathBuf,
91-
}
92-
93-
impl DuckDBCtx {
94-
pub fn duckdb_file(&self, format: Format) -> PathBuf {
95-
let dir = format!("clickbench_partitioned/{}", format.name()).to_data_path();
96-
std::fs::create_dir_all(&dir).vortex_expect("failed to create duckdb data dir");
97-
dir.join("hits.db")
98-
}
99-
}
100-
10189
enum EngineCtx {
10290
DataFusion(DataFusionCtx),
103-
DuckDB(DuckDBCtx),
91+
DuckDB(ddb2::DuckDBCtx),
10492
}
10593

10694
impl EngineCtx {
@@ -113,10 +101,8 @@ impl EngineCtx {
113101
})
114102
}
115103

116-
fn new_with_duckdb(duckdb_path: &Path) -> Self {
117-
EngineCtx::DuckDB(DuckDBCtx {
118-
duckdb_path: duckdb_path.to_path_buf(),
119-
})
104+
fn new_with_duckdb() -> anyhow::Result<Self> {
105+
Ok(EngineCtx::DuckDB(ddb2::DuckDBCtx::new()?))
120106
}
121107

122108
fn to_engine(&self) -> Engine {
@@ -202,19 +188,6 @@ fn main() -> anyhow::Result<()> {
202188

203189
let mut query_measurements = Vec::new();
204190

205-
let resolved_path = args
206-
.targets
207-
.iter()
208-
.any(|t| t.engine() == Engine::DuckDB)
209-
.then(|| {
210-
let path = ddb::duckdb_executable_path(&args.duckdb_path);
211-
// If the path is to the duckdb-vortex extension, try to rebuild
212-
if args.duckdb_path.is_none() && !args.skip_duckdb_build {
213-
ddb::build_vortex_duckdb();
214-
}
215-
path
216-
});
217-
218191
for target in args.targets.iter() {
219192
let engine = target.engine();
220193
let file_format = target.format();
@@ -227,9 +200,7 @@ fn main() -> anyhow::Result<()> {
227200

228201
EngineCtx::new_with_datafusion(session_ctx, args.emit_plan)
229202
}
230-
Engine::DuckDB => EngineCtx::new_with_duckdb(
231-
resolved_path.as_ref().vortex_expect("path resolved above"),
232-
),
203+
Engine::DuckDB => EngineCtx::new_with_duckdb()?,
233204
_ => unreachable!("engine not supported"),
234205
};
235206

@@ -391,12 +362,9 @@ async fn init_data_source(
391362
}
392363
},
393364
EngineCtx::DuckDB(ctx) => match file_format {
394-
Format::Parquet | Format::OnDiskVortex | Format::OnDiskDuckDB => register_tables(
395-
&DuckDBExecutor::new(ctx.duckdb_path.clone(), ctx.duckdb_file(file_format)),
396-
base_url,
397-
file_format,
398-
dataset,
399-
)?,
365+
Format::Parquet | Format::OnDiskVortex | Format::OnDiskDuckDB => {
366+
ctx.register_tables(base_url, file_format, dataset)?;
367+
}
400368
_ => {
401369
vortex_panic!(
402370
"Engine {} Format {file_format} isn't supported on ClickBench",
@@ -469,14 +437,8 @@ fn execute_queries(
469437
dataset: CLICKBENCH_DATASET.to_owned(),
470438
});
471439
}
472-
473-
EngineCtx::DuckDB(args) => {
474-
let fastest_run = benchmark_duckdb_query(
475-
query_idx,
476-
query_string,
477-
iterations,
478-
&DuckDBExecutor::new(args.duckdb_path.clone(), args.duckdb_file(file_format)),
479-
);
440+
EngineCtx::DuckDB(ctx) => {
441+
let fastest_run = benchmark_duckdb_query(query_idx, query_string, iterations, ctx);
480442

481443
query_measurements.push(QueryMeasurement {
482444
query_idx,
@@ -567,10 +529,11 @@ fn benchmark_duckdb_query(
567529
query_idx: usize,
568530
query_string: &str,
569531
iterations: usize,
570-
duckdb_executor: &DuckDBExecutor,
532+
duckdb_ctx: &ddb2::DuckDBCtx,
571533
) -> Duration {
572534
(0..iterations).fold(Duration::from_millis(u64::MAX), |fastest, _| {
573-
let duration = ddb::execute_clickbench_query(query_string, duckdb_executor)
535+
let duration = duckdb_ctx
536+
.execute_query(query_string)
574537
.unwrap_or_else(|err| vortex_panic!("query: {query_idx} failed with: {err}"));
575538

576539
fastest.min(duration)
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
use std::time::{Duration, Instant};
2+
3+
use anyhow::Result;
4+
use log::trace;
5+
use url::Url;
6+
use vortex_duckdb_ext::duckdb::{Connection, Database};
7+
8+
use crate::{BenchmarkDataset, Format};
9+
10+
// TODO: handle S3
11+
12+
/// Kind of SQL relation created when registering benchmark data in DuckDB.
#[derive(Debug, Clone)]
enum DuckDBObject {
    Table,
    View,
}

impl DuckDBObject {
    /// The SQL keyword for this relation kind, as spliced into
    /// `CREATE <kind> IF NOT EXISTS ...` statements.
    fn to_str(&self) -> &str {
        match self {
            Self::Table => "TABLE",
            Self::View => "VIEW",
        }
    }
}
26+
27+
/// DuckDB context for benchmarks.
///
/// Construct via [`DuckDBCtx::new`], which opens an in-memory database,
/// connects to it, and initializes the vortex extension on the connection.
pub struct DuckDBCtx {
    // Backing database handle; kept alongside the connection derived from it.
    pub db: Database,
    // Connection used for all query execution and table registration.
    pub connection: Connection,
}
32+
33+
impl DuckDBCtx {
34+
pub fn new() -> Result<Self> {
35+
let db = Database::open_in_memory()?;
36+
let connection = db.connect()?;
37+
vortex_duckdb_ext::init(&connection)?;
38+
Ok(Self { db, connection })
39+
}
40+
41+
/// Execute DuckDB queries for benchmarks using the internal connection
42+
pub fn execute_query(&self, query: &str) -> Result<Duration> {
43+
// TODO: handle multiple queries
44+
trace!("execute duckdb query: {}", query);
45+
let time_instant = Instant::now();
46+
self.connection.execute(query)?;
47+
let query_time = time_instant.elapsed();
48+
trace!("query completed in {:.3}s", query_time.as_secs_f64());
49+
50+
Ok(query_time)
51+
}
52+
53+
/// Register tables for benchmarks using the internal connection
54+
pub fn register_tables(
55+
&self,
56+
base_url: &Url,
57+
file_format: Format,
58+
dataset: BenchmarkDataset,
59+
) -> Result<()> {
60+
let object = match file_format {
61+
Format::Parquet | Format::OnDiskVortex => DuckDBObject::View,
62+
Format::OnDiskDuckDB => DuckDBObject::Table,
63+
format => anyhow::bail!("Format {format} isn't supported for DuckDB"),
64+
};
65+
66+
let load_format = match file_format {
67+
// Duckdb loads values from parquet to duckdb
68+
Format::Parquet | Format::OnDiskDuckDB => Format::Parquet,
69+
f => f,
70+
};
71+
72+
let effective_url = self.resolve_storage_url(base_url, load_format, dataset)?;
73+
let extension = match load_format {
74+
Format::Parquet => "parquet",
75+
Format::OnDiskVortex => "vortex",
76+
other => anyhow::bail!("Format {other} isn't supported for DuckDB"),
77+
};
78+
79+
// Generate and execute table registration commands
80+
let commands = self.generate_table_commands(&effective_url, extension, dataset, object);
81+
self.execute_query(&commands)?;
82+
trace!("Executing table registration commands: {}", commands);
83+
84+
Ok(())
85+
}
86+
87+
/// Resolves the storage URL based on dataset and format requirements
88+
fn resolve_storage_url(
89+
&self,
90+
base_url: &Url,
91+
file_format: Format,
92+
dataset: BenchmarkDataset,
93+
) -> Result<Url> {
94+
if file_format == Format::OnDiskVortex {
95+
match dataset.vortex_path(base_url) {
96+
Ok(vortex_url) => {
97+
// Check if the directory exists (for file:// URLs)
98+
if vortex_url.scheme() == "file" {
99+
let path = std::path::Path::new(vortex_url.path());
100+
if !path.exists() {
101+
log::warn!(
102+
"Vortex directory doesn't exist at: {}. Run with DataFusion engine first to generate Vortex files.",
103+
path.display()
104+
);
105+
}
106+
}
107+
Ok(vortex_url)
108+
}
109+
Err(_) => Ok(base_url.clone()),
110+
}
111+
} else if file_format == Format::Parquet {
112+
match dataset.parquet_path(base_url) {
113+
Ok(parquet_url) => Ok(parquet_url),
114+
Err(_) => Ok(base_url.clone()),
115+
}
116+
} else {
117+
Ok(base_url.clone())
118+
}
119+
}
120+
121+
/// Generate SQL commands for table registration.
122+
fn generate_table_commands(
123+
&self,
124+
base_url: &Url,
125+
extension: &str,
126+
dataset: BenchmarkDataset,
127+
duckdb_object: DuckDBObject,
128+
) -> String {
129+
// Base path contains trailing /.
130+
let base_dir = base_url.as_str();
131+
let base_dir = base_dir.strip_prefix("file://").unwrap_or(base_dir);
132+
133+
match dataset {
134+
BenchmarkDataset::TpcH => {
135+
let mut commands = String::new();
136+
let tables = [
137+
"customer", "lineitem", "nation", "orders", "part", "partsupp", "region",
138+
"supplier",
139+
];
140+
141+
for table_name in &tables {
142+
let table_path = format!("{base_dir}{table_name}.{extension}");
143+
commands.push_str(&format!(
144+
"CREATE {} IF NOT EXISTS {table_name} AS SELECT * FROM read_{extension}('{table_path}');\n",
145+
duckdb_object.to_str(),
146+
));
147+
}
148+
commands
149+
}
150+
BenchmarkDataset::ClickBench { single_file } => {
151+
let file_glob = if single_file {
152+
format!("{base_dir}hits.{extension}")
153+
} else {
154+
format!("{base_dir}*.{extension}")
155+
};
156+
157+
format!(
158+
"CREATE {} IF NOT EXISTS hits AS SELECT * FROM read_{extension}('{file_glob}');",
159+
duckdb_object.to_str()
160+
)
161+
}
162+
BenchmarkDataset::TpcDS => {
163+
let mut commands = String::new();
164+
let tables = BenchmarkDataset::TpcDS.tables();
165+
166+
for table_name in tables {
167+
let table_path = format!("{base_dir}{table_name}.{extension}");
168+
commands.push_str(&format!(
169+
"CREATE {} IF NOT EXISTS {table_name} AS SELECT * FROM read_{extension}('{table_path}');\n",
170+
duckdb_object.to_str(),
171+
));
172+
}
173+
commands
174+
}
175+
}
176+
}
177+
}

bench-vortex/src/engines/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
pub mod ddb;
2+
pub mod ddb2;
23
pub mod df;

bench-vortex/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ pub mod tpch;
2525
pub mod utils;
2626

2727
pub use datasets::{BenchmarkDataset, file};
28-
pub use engines::{ddb, df};
28+
pub use engines::{ddb, ddb2, df};
2929
pub use vortex::error::vortex_panic;
3030

3131
// All benchmarks run with mimalloc for consistency.

0 commit comments

Comments
 (0)