Skip to content

Commit 9d30010

Browse files
Add TPC-H and TPC-DS benchmark comparing DuckDB-DuckLake vs DataFusion-DuckLake (#41)
1 parent c103573 commit 9d30010

File tree

13 files changed

+1625
-2
lines changed

13 files changed

+1625
-2
lines changed

.gitignore

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,6 @@ target
44
/.idea/
55
*.iml
66

7-
# Test data (generated by tests/setup_test_data_v2.sql)
8-
tests/test_data/
7+
# Benchmark data and results
8+
benchmark/data/
9+
benchmark/results/

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
[workspace]
2+
members = [".", "benchmark"]
3+
14
[package]
25
name = "datafusion-ducklake"
36
version = "0.0.4"

benchmark/Cargo.toml

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
[package]
2+
name = "ducklake-benchmark"
3+
version = "0.1.0"
4+
edition = "2024"
5+
description = "Benchmark comparing DuckDB-DuckLake vs DataFusion-DuckLake performance"
6+
publish = false
7+
8+
[[bin]]
9+
name = "ducklake-benchmark"
10+
path = "src/main.rs"
11+
12+
[[bin]]
13+
name = "generate-tpch"
14+
path = "src/bin/generate_tpch.rs"
15+
16+
[[bin]]
17+
name = "generate-tpcds"
18+
path = "src/bin/generate_tpcds.rs"
19+
20+
[dependencies]
21+
# DataFusion-DuckLake: Uncomment ONE of the following options
22+
23+
# Option 1: Local development (default)
24+
datafusion-ducklake = { path = "..", features = ["metadata-duckdb"] }
25+
26+
# Option 2: Specific git commit
27+
# datafusion-ducklake = { git = "https://github.com/hotdata-dev/datafusion-ducklake", rev = "COMMIT_HASH", features = ["metadata-duckdb"] }
28+
29+
# Option 3: Git branch
30+
# datafusion-ducklake = { git = "https://github.com/hotdata-dev/datafusion-ducklake", branch = "main", features = ["metadata-duckdb"] }
31+
32+
# Option 4: Published crates.io version
33+
# datafusion-ducklake = { version = "0.0.4", features = ["metadata-duckdb"] }
34+
35+
# Core dependencies
36+
datafusion = "50.1.0"
37+
duckdb = { version = "1.4.1", features = ["bundled"] }
38+
tokio = { version = "1", features = ["full"] }
39+
futures = "0.3"
40+
41+
# CLI and reporting
42+
clap = { version = "4", features = ["derive"] }
43+
serde = { version = "1", features = ["derive"] }
44+
serde_json = "1"
45+
csv = "1.3"
46+
chrono = "0.4"
47+
48+
# Utilities
49+
anyhow = "1"
50+
regex = "1"
51+
walkdir = "2"
benchmark/src/bin/generate_tpcds.rs

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
use anyhow::Result;
2+
use clap::Parser;
3+
use duckdb::Connection;
4+
use std::path::PathBuf;
5+
6+
// Command-line arguments for the TPC-DS generator, parsed via clap's derive API.
// NOTE: the `///` comments on the fields below double as the `--help` text that
// clap shows at runtime, so they are user-visible behavior — do not edit casually.
#[derive(Parser)]
#[command(name = "generate-tpcds")]
#[command(about = "Generate TPC-DS data in DuckLake format")]
struct Args {
    /// Path for the DuckLake catalog database
    // Short flag: -c
    #[arg(short, long, default_value = "benchmark/data/tpcds.ducklake")]
    catalog: PathBuf,

    /// Path for data files (Parquet storage)
    // Short flag: -d
    #[arg(short, long, default_value = "benchmark/data/tpcds_files")]
    data_path: PathBuf,

    /// TPC-DS scale factor (1 = 1GB, 10 = 10GB, etc.)
    // f64 so fractional scale factors (e.g. 0.1) are accepted; short flag: -s
    #[arg(short, long, default_value = "1")]
    scale_factor: f64,
}
22+
23+
/// TPC-DS tables (24 tables)
///
/// The base tables produced by DuckDB's `dsdgen`; `main` copies each one
/// verbatim into the DuckLake catalog. Names are compile-time constants,
/// which is why interpolating them into SQL with `format!` is safe.
const TPCDS_TABLES: &[&str] = &[
    "call_center",
    "catalog_page",
    "catalog_returns",
    "catalog_sales",
    "customer",
    "customer_address",
    "customer_demographics",
    "date_dim",
    "household_demographics",
    "income_band",
    "inventory",
    "item",
    "promotion",
    "reason",
    "ship_mode",
    "store",
    "store_returns",
    "store_sales",
    "time_dim",
    "warehouse",
    "web_page",
    "web_returns",
    "web_sales",
    "web_site",
];
50+
51+
fn main() -> Result<()> {
52+
let args = Args::parse();
53+
54+
println!("TPC-DS DuckLake Data Generator");
55+
println!("==============================");
56+
println!("Catalog: {:?}", args.catalog);
57+
println!("Data path: {:?}", args.data_path);
58+
println!(
59+
"Scale factor: {} (~{}GB)",
60+
args.scale_factor, args.scale_factor
61+
);
62+
println!();
63+
64+
// Ensure directories exist
65+
std::fs::create_dir_all(&args.data_path)?;
66+
if let Some(parent) = args.catalog.parent() {
67+
std::fs::create_dir_all(parent)?;
68+
}
69+
70+
// Remove existing catalog if present
71+
if args.catalog.exists() {
72+
std::fs::remove_file(&args.catalog)?;
73+
}
74+
75+
let conn = Connection::open_in_memory()?;
76+
77+
// Install and load extensions
78+
println!("Installing extensions...");
79+
conn.execute_batch(
80+
r#"
81+
INSTALL tpcds;
82+
LOAD tpcds;
83+
INSTALL ducklake;
84+
LOAD ducklake;
85+
"#,
86+
)?;
87+
88+
// Generate TPC-DS data in memory
89+
println!("Generating TPC-DS data (SF={})...", args.scale_factor);
90+
println!(" This may take a while for large scale factors...");
91+
conn.execute_batch(&format!("CALL dsdgen(sf={})", args.scale_factor))?;
92+
93+
// Create DuckLake catalog
94+
println!("Creating DuckLake catalog...");
95+
let attach_sql = format!(
96+
"ATTACH '{}' AS tpcds_lake (TYPE ducklake, DATA_PATH '{}')",
97+
args.catalog.display(),
98+
args.data_path.display()
99+
);
100+
conn.execute(&attach_sql, [])?;
101+
102+
// Create schema
103+
conn.execute("CREATE SCHEMA IF NOT EXISTS tpcds_lake.main", [])?;
104+
105+
// Copy TPC-DS tables to DuckLake
106+
for table in TPCDS_TABLES {
107+
print!(" Copying {} to DuckLake... ", table);
108+
conn.execute_batch(&format!(
109+
"CREATE TABLE tpcds_lake.main.{} AS SELECT * FROM {}",
110+
table, table
111+
))?;
112+
113+
// Get row count
114+
let count: i64 = conn.query_row(
115+
&format!("SELECT COUNT(*) FROM tpcds_lake.main.{}", table),
116+
[],
117+
|row| row.get(0),
118+
)?;
119+
println!("{} rows", count);
120+
}
121+
122+
println!("\nData generation complete!");
123+
println!("Catalog saved to: {:?}", args.catalog);
124+
125+
// Print data size
126+
let total_size = dir_size(&args.data_path)?;
127+
println!(
128+
"\nTotal data size: {:.2} GB",
129+
total_size as f64 / 1_000_000_000.0
130+
);
131+
132+
Ok(())
133+
}
134+
135+
fn dir_size(path: &PathBuf) -> Result<u64> {
136+
let mut size = 0;
137+
for entry in walkdir::WalkDir::new(path) {
138+
let entry = entry?;
139+
if entry.file_type().is_file() {
140+
size += entry.metadata()?.len();
141+
}
142+
}
143+
Ok(size)
144+
}

benchmark/src/bin/generate_tpch.rs

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
use anyhow::Result;
2+
use clap::Parser;
3+
use duckdb::Connection;
4+
use std::path::PathBuf;
5+
6+
// Command-line arguments for the TPC-H generator, parsed via clap's derive API.
// NOTE: the `///` comments on the fields below double as the `--help` text that
// clap shows at runtime, so they are user-visible behavior — do not edit casually.
#[derive(Parser)]
#[command(name = "generate-tpch")]
#[command(about = "Generate TPC-H data in DuckLake format")]
struct Args {
    /// Path for the DuckLake catalog database
    // Short flag: -c
    #[arg(short, long, default_value = "benchmark/data/tpch.ducklake")]
    catalog: PathBuf,

    /// Path for data files (Parquet storage)
    // Short flag: -d
    #[arg(short, long, default_value = "benchmark/data/tpch_files")]
    data_path: PathBuf,

    /// TPC-H scale factor (1 = 1GB, 10 = 10GB, etc.)
    // f64 so fractional scale factors (e.g. 0.1) are accepted; short flag: -s
    #[arg(short, long, default_value = "1")]
    scale_factor: f64,
}
22+
23+
fn main() -> Result<()> {
24+
let args = Args::parse();
25+
26+
println!("TPC-H DuckLake Data Generator");
27+
println!("=============================");
28+
println!("Catalog: {:?}", args.catalog);
29+
println!("Data path: {:?}", args.data_path);
30+
println!(
31+
"Scale factor: {} (~{}GB)",
32+
args.scale_factor, args.scale_factor
33+
);
34+
println!();
35+
36+
// Ensure directories exist
37+
std::fs::create_dir_all(&args.data_path)?;
38+
if let Some(parent) = args.catalog.parent() {
39+
std::fs::create_dir_all(parent)?;
40+
}
41+
42+
// Remove existing catalog if present
43+
if args.catalog.exists() {
44+
std::fs::remove_file(&args.catalog)?;
45+
}
46+
47+
let conn = Connection::open_in_memory()?;
48+
49+
// Install and load extensions
50+
println!("Installing extensions...");
51+
conn.execute_batch(
52+
r#"
53+
INSTALL tpch;
54+
LOAD tpch;
55+
INSTALL ducklake;
56+
LOAD ducklake;
57+
"#,
58+
)?;
59+
60+
// Generate TPC-H data in memory
61+
println!("Generating TPC-H data (SF={})...", args.scale_factor);
62+
conn.execute_batch(&format!("CALL dbgen(sf={})", args.scale_factor))?;
63+
64+
// Create DuckLake catalog
65+
println!("Creating DuckLake catalog...");
66+
let attach_sql = format!(
67+
"ATTACH '{}' AS tpch_lake (TYPE ducklake, DATA_PATH '{}')",
68+
args.catalog.display(),
69+
args.data_path.display()
70+
);
71+
conn.execute(&attach_sql, [])?;
72+
73+
// Create schema
74+
conn.execute("CREATE SCHEMA IF NOT EXISTS tpch_lake.main", [])?;
75+
76+
// Copy TPC-H tables to DuckLake
77+
let tables =
78+
["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"];
79+
80+
for table in &tables {
81+
println!(" Copying {} to DuckLake...", table);
82+
conn.execute_batch(&format!(
83+
"CREATE TABLE tpch_lake.main.{} AS SELECT * FROM {}",
84+
table, table
85+
))?;
86+
}
87+
88+
println!("\nData generation complete!");
89+
println!("Catalog saved to: {:?}", args.catalog);
90+
91+
// Print table statistics
92+
println!("\nTable Statistics:");
93+
println!("-----------------");
94+
for table in &tables {
95+
let count: i64 = conn.query_row(
96+
&format!("SELECT COUNT(*) FROM tpch_lake.main.{}", table),
97+
[],
98+
|row| row.get(0),
99+
)?;
100+
println!(" {}: {} rows", table, count);
101+
}
102+
103+
// Print data size
104+
let total_size = dir_size(&args.data_path)?;
105+
println!(
106+
"\nTotal data size: {:.2} MB",
107+
total_size as f64 / 1_000_000.0
108+
);
109+
110+
Ok(())
111+
}
112+
113+
fn dir_size(path: &PathBuf) -> Result<u64> {
114+
let mut size = 0;
115+
for entry in walkdir::WalkDir::new(path) {
116+
let entry = entry?;
117+
if entry.file_type().is_file() {
118+
size += entry.metadata()?.len();
119+
}
120+
}
121+
Ok(size)
122+
}

0 commit comments

Comments
 (0)