Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions .cargo/config.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
[target.'(target_familiy="unix")']
rustflags = ["-C", "force-frame-pointers=yes"]
[target.'cfg(target_family="unix")']
rustflags = [
"-C", "force-frame-pointers=yes",
# Add dynamic DuckDB library directory to runtime search path.
"-C", "link-arg=-Wl,-rpath,$ORIGIN/../duckdb-v1.3.0",
"-C", "link-arg=-Wl,-rpath,@executable_path/../duckdb-v1.3.0"
]

[target.wasm32-unknown-unknown]
rustflags = ['--cfg', 'getrandom_backend="wasm_js"', '-C', 'target-feature=+atomics']

Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/sql-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ jobs:
shell: bash
env:
RUST_BACKTRACE: full
LD_LIBRARY_PATH: target/duckdb-v1.3.0
run: |
# Generate data, running each query once to make sure they don't panic.
target/release_debug/${{ matrix.binary_name }} --targets datafusion:parquet -i1 -d gh-json --skip-duckdb-build
Expand Down Expand Up @@ -171,6 +172,7 @@ jobs:
OTEL_EXPORTER_OTLP_ENDPOINT: '${{ secrets.OTEL_EXPORTER_OTLP_ENDPOINT }}'
OTEL_EXPORTER_OTLP_HEADERS: '${{ secrets.OTEL_EXPORTER_OTLP_HEADERS }}'
OTEL_RESOURCE_ATTRIBUTES: 'bench-name=${{ matrix.id }}'
LD_LIBRARY_PATH: target/duckdb-v1.3.0
run: |
target/release_debug/${{ matrix.binary_name }} \
-d gh-json \
Expand All @@ -189,6 +191,7 @@ jobs:
OTEL_EXPORTER_OTLP_ENDPOINT: '${{ secrets.OTEL_EXPORTER_OTLP_ENDPOINT }}'
OTEL_EXPORTER_OTLP_HEADERS: '${{ secrets.OTEL_EXPORTER_OTLP_HEADERS }}'
OTEL_RESOURCE_ATTRIBUTES: 'bench-name=${{ matrix.id }}'
LD_LIBRARY_PATH: target/duckdb-v1.3.0
run: |
target/release_debug/${{ matrix.binary_name }} \
--use-remote-data-dir ${{ matrix.remote_storage }} \
Expand Down
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ vortex-zstd = { version = "0.1.0", path = "./encodings/zstd", default-features =
# END crates published by this project

# No version constraints for unpublished crates.
vortex-duckdb-ext = { path = "./vortex-duckdb-ext", default-features = false }
vortex-duckdb = { path = "./vortex-duckdb", default-features = false }
vortex-ffi = { path = "./vortex-ffi", default-features = false }

Expand Down Expand Up @@ -275,7 +276,8 @@ use_debug = "deny"

[profile.release]
codegen-units = 1
lto = "thin" # attempts to perform optimizations across all crates within the dependency graph
# Turn LTO off, as it breaks when vortex-duckdb-ext is linked.
lto = "off"

[profile.release_debug]
debug = "full"
Expand Down
1 change: 1 addition & 0 deletions bench-vortex/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ url = { workspace = true }
uuid = { workspace = true, features = ["v4"] }
vortex = { workspace = true, features = ["object_store", "parquet", "files"] }
vortex-datafusion = { workspace = true }
vortex-duckdb-ext = { workspace = true }
xshell = { workspace = true }

[features]
Expand Down
65 changes: 14 additions & 51 deletions bench-vortex/src/bin/clickbench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@ use std::sync::Arc;
use std::time::{Duration, Instant};

use bench_vortex::clickbench::{Flavor, clickbench_queries};
use bench_vortex::ddb::{DuckDBExecutor, register_tables};
use bench_vortex::display::{DisplayFormat, print_measurements_json, render_table};
use bench_vortex::engines::ddb2;
use bench_vortex::measurements::QueryMeasurement;
use bench_vortex::metrics::{MetricsSetExt, export_plan_spans};
use bench_vortex::utils::constants::{CLICKBENCH_DATASET, STORAGE_NVME};
use bench_vortex::utils::new_tokio_runtime;
use bench_vortex::{
BenchmarkDataset, Engine, Format, IdempotentPath, Target, ddb, default_env_filter, df,
BenchmarkDataset, Engine, Format, IdempotentPath, Target, default_env_filter, df,
};
use clap::{Parser, value_parser};
use datafusion::prelude;
Expand Down Expand Up @@ -86,21 +86,9 @@ struct DataFusionCtx {
emit_plan: bool,
}

struct DuckDBCtx {
duckdb_path: PathBuf,
}

impl DuckDBCtx {
pub fn duckdb_file(&self, format: Format) -> PathBuf {
let dir = format!("clickbench_partitioned/{}", format.name()).to_data_path();
std::fs::create_dir_all(&dir).vortex_expect("failed to create duckdb data dir");
dir.join("hits.db")
}
}

enum EngineCtx {
DataFusion(DataFusionCtx),
DuckDB(DuckDBCtx),
DuckDB(ddb2::DuckDBCtx),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The 2 suffix will go away once TPC-H is moved over too.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dont forget tpc-ds 🙈

}

impl EngineCtx {
Expand All @@ -113,10 +101,8 @@ impl EngineCtx {
})
}

fn new_with_duckdb(duckdb_path: &Path) -> Self {
EngineCtx::DuckDB(DuckDBCtx {
duckdb_path: duckdb_path.to_path_buf(),
})
fn new_with_duckdb() -> anyhow::Result<Self> {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🥳

Ok(EngineCtx::DuckDB(ddb2::DuckDBCtx::new()?))
}

fn to_engine(&self) -> Engine {
Expand Down Expand Up @@ -202,19 +188,6 @@ fn main() -> anyhow::Result<()> {

let mut query_measurements = Vec::new();

let resolved_path = args
.targets
.iter()
.any(|t| t.engine() == Engine::DuckDB)
.then(|| {
let path = ddb::duckdb_executable_path(&args.duckdb_path);
// If the path is to the duckdb-vortex extension, try to rebuild
if args.duckdb_path.is_none() && !args.skip_duckdb_build {
ddb::build_vortex_duckdb();
}
path
});

for target in args.targets.iter() {
let engine = target.engine();
let file_format = target.format();
Expand All @@ -227,9 +200,7 @@ fn main() -> anyhow::Result<()> {

EngineCtx::new_with_datafusion(session_ctx, args.emit_plan)
}
Engine::DuckDB => EngineCtx::new_with_duckdb(
resolved_path.as_ref().vortex_expect("path resolved above"),
),
Engine::DuckDB => EngineCtx::new_with_duckdb()?,
_ => unreachable!("engine not supported"),
};

Expand Down Expand Up @@ -391,12 +362,9 @@ async fn init_data_source(
}
},
EngineCtx::DuckDB(ctx) => match file_format {
Format::Parquet | Format::OnDiskVortex | Format::OnDiskDuckDB => register_tables(
&DuckDBExecutor::new(ctx.duckdb_path.clone(), ctx.duckdb_file(file_format)),
base_url,
file_format,
dataset,
)?,
Format::Parquet | Format::OnDiskVortex | Format::OnDiskDuckDB => {
ctx.register_tables(base_url, file_format, dataset)?;
}
_ => {
vortex_panic!(
"Engine {} Format {file_format} isn't supported on ClickBench",
Expand Down Expand Up @@ -469,14 +437,8 @@ fn execute_queries(
dataset: CLICKBENCH_DATASET.to_owned(),
});
}

EngineCtx::DuckDB(args) => {
let fastest_run = benchmark_duckdb_query(
query_idx,
query_string,
iterations,
&DuckDBExecutor::new(args.duckdb_path.clone(), args.duckdb_file(file_format)),
);
EngineCtx::DuckDB(ctx) => {
let fastest_run = benchmark_duckdb_query(query_idx, query_string, iterations, ctx);

query_measurements.push(QueryMeasurement {
query_idx,
Expand Down Expand Up @@ -567,10 +529,11 @@ fn benchmark_duckdb_query(
query_idx: usize,
query_string: &str,
iterations: usize,
duckdb_executor: &DuckDBExecutor,
duckdb_ctx: &ddb2::DuckDBCtx,
) -> Duration {
(0..iterations).fold(Duration::from_millis(u64::MAX), |fastest, _| {
let duration = ddb::execute_clickbench_query(query_string, duckdb_executor)
let duration = duckdb_ctx
.execute_query(query_string)
.unwrap_or_else(|err| vortex_panic!("query: {query_idx} failed with: {err}"));

fastest.min(duration)
Expand Down
177 changes: 177 additions & 0 deletions bench-vortex/src/engines/ddb2/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
use std::time::{Duration, Instant};

use anyhow::Result;
use log::trace;
use url::Url;
use vortex_duckdb_ext::duckdb::{Connection, Database};

use crate::{BenchmarkDataset, Format};

// TODO: handle S3

/// Kind of catalog object created when registering benchmark data in DuckDB.
#[derive(Debug, Clone)]
enum DuckDBObject {
    // `CREATE TABLE ... AS SELECT` — data is materialized into DuckDB storage.
    Table,
    // `CREATE VIEW ... AS SELECT` — a view over the external source files.
    View,
}

impl DuckDBObject {
fn to_str(&self) -> &str {
match self {
DuckDBObject::Table => "TABLE",
DuckDBObject::View => "VIEW",
}
}
}

/// DuckDB context for benchmarks.
///
/// Holds an in-memory database together with a single connection on which the
/// Vortex extension has been initialized (see `DuckDBCtx::new`).
pub struct DuckDBCtx {
    // The owning database handle; `connection` is created from it.
    pub db: Database,
    // Connection used for all query execution and table registration.
    pub connection: Connection,
}

impl DuckDBCtx {
    /// Open an in-memory DuckDB database, connect to it, and initialize the
    /// Vortex extension on that connection.
    pub fn new() -> Result<Self> {
        let db = Database::open_in_memory()?;
        let connection = db.connect()?;
        vortex_duckdb_ext::init(&connection)?;
        Ok(Self { db, connection })
    }

    /// Execute DuckDB queries for benchmarks using the internal connection.
    ///
    /// Returns the wall-clock duration of the execution.
    pub fn execute_query(&self, query: &str) -> Result<Duration> {
        // TODO: handle multiple queries
        trace!("execute duckdb query: {}", query);
        let time_instant = Instant::now();
        self.connection.execute(query)?;
        let query_time = time_instant.elapsed();
        trace!("query completed in {:.3}s", query_time.as_secs_f64());

        Ok(query_time)
    }

    /// Register tables for benchmarks using the internal connection.
    ///
    /// `Parquet` and `OnDiskVortex` are exposed as views over the source
    /// files; `OnDiskDuckDB` materializes a native table (populated from
    /// Parquet).
    pub fn register_tables(
        &self,
        base_url: &Url,
        file_format: Format,
        dataset: BenchmarkDataset,
    ) -> Result<()> {
        let object = match file_format {
            Format::Parquet | Format::OnDiskVortex => DuckDBObject::View,
            Format::OnDiskDuckDB => DuckDBObject::Table,
            format => anyhow::bail!("Format {format} isn't supported for DuckDB"),
        };

        let load_format = match file_format {
            // Native DuckDB tables are populated by reading the Parquet copy.
            Format::Parquet | Format::OnDiskDuckDB => Format::Parquet,
            f => f,
        };

        let effective_url = self.resolve_storage_url(base_url, load_format, dataset)?;
        let extension = match load_format {
            Format::Parquet => "parquet",
            Format::OnDiskVortex => "vortex",
            other => anyhow::bail!("Format {other} isn't supported for DuckDB"),
        };

        // Generate and execute table registration commands. Log *before*
        // executing so the commands are visible even when execution fails
        // (previously this was logged only after a successful run).
        let commands = self.generate_table_commands(&effective_url, extension, dataset, object);
        trace!("Executing table registration commands: {}", commands);
        self.execute_query(&commands)?;

        Ok(())
    }

    /// Resolves the storage URL based on dataset and format requirements.
    ///
    /// Falls back to `base_url` whenever the dataset cannot produce a
    /// format-specific path.
    fn resolve_storage_url(
        &self,
        base_url: &Url,
        file_format: Format,
        dataset: BenchmarkDataset,
    ) -> Result<Url> {
        match file_format {
            Format::OnDiskVortex => {
                let Ok(vortex_url) = dataset.vortex_path(base_url) else {
                    return Ok(base_url.clone());
                };
                // Check if the directory exists (for file:// URLs) and warn
                // early, so a later "no such file" failure is easy to diagnose.
                if vortex_url.scheme() == "file" {
                    let path = std::path::Path::new(vortex_url.path());
                    if !path.exists() {
                        log::warn!(
                            "Vortex directory doesn't exist at: {}. Run with DataFusion engine first to generate Vortex files.",
                            path.display()
                        );
                    }
                }
                Ok(vortex_url)
            }
            Format::Parquet => Ok(dataset
                .parquet_path(base_url)
                .unwrap_or_else(|_| base_url.clone())),
            _ => Ok(base_url.clone()),
        }
    }

    /// Generate SQL commands for table registration.
    fn generate_table_commands(
        &self,
        base_url: &Url,
        extension: &str,
        dataset: BenchmarkDataset,
        duckdb_object: DuckDBObject,
    ) -> String {
        // Base path contains trailing /.
        let base_dir = base_url.as_str();
        let base_dir = base_dir.strip_prefix("file://").unwrap_or(base_dir);

        // Shared statement builder used by the per-table datasets (TPC-H and
        // TPC-DS previously duplicated this loop body verbatim).
        let push_create = |commands: &mut String, table_name: &str| {
            let table_path = format!("{base_dir}{table_name}.{extension}");
            commands.push_str(&format!(
                "CREATE {} IF NOT EXISTS {table_name} AS SELECT * FROM read_{extension}('{table_path}');\n",
                duckdb_object.to_str(),
            ));
        };

        match dataset {
            BenchmarkDataset::TpcH => {
                let tables = [
                    "customer", "lineitem", "nation", "orders", "part", "partsupp", "region",
                    "supplier",
                ];
                let mut commands = String::new();
                for table_name in tables {
                    push_create(&mut commands, table_name);
                }
                commands
            }
            BenchmarkDataset::ClickBench { single_file } => {
                let file_glob = if single_file {
                    format!("{base_dir}hits.{extension}")
                } else {
                    format!("{base_dir}*.{extension}")
                };

                format!(
                    "CREATE {} IF NOT EXISTS hits AS SELECT * FROM read_{extension}('{file_glob}');",
                    duckdb_object.to_str()
                )
            }
            BenchmarkDataset::TpcDS => {
                let mut commands = String::new();
                // NOTE(review): `as_ref` tolerates `tables()` yielding either
                // `&str` or `String` items — confirm against its declaration.
                for table_name in BenchmarkDataset::TpcDS.tables() {
                    push_create(&mut commands, table_name.as_ref());
                }
                commands
            }
        }
    }
}
1 change: 1 addition & 0 deletions bench-vortex/src/engines/mod.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pub mod ddb;
pub mod ddb2;
pub mod df;
2 changes: 1 addition & 1 deletion bench-vortex/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ pub mod tpch;
pub mod utils;

pub use datasets::{BenchmarkDataset, file};
pub use engines::{ddb, df};
pub use engines::{ddb, ddb2, df};
pub use vortex::error::vortex_panic;

// All benchmarks run with mimalloc for consistency.
Expand Down
Loading
Loading