Skip to content

Commit f704cf5

Browse files
Add TPC-H benchmark comparing DuckDB-DuckLake vs DataFusion-DuckLake
1 parent 6973bcc commit f704cf5

File tree

10 files changed

+993
-0
lines changed

10 files changed

+993
-0
lines changed

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
[workspace]
2+
members = [".", "benchmark"]
3+
14
[package]
25
name = "datafusion-ducklake"
36
version = "0.0.3"

benchmark/Cargo.toml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
[package]
2+
name = "ducklake-benchmark"
3+
version = "0.1.0"
4+
edition = "2024"
5+
description = "Benchmark comparing DuckDB-DuckLake vs DataFusion-DuckLake performance"
6+
publish = false
7+
8+
[[bin]]
9+
name = "ducklake-benchmark"
10+
path = "src/main.rs"
11+
12+
[[bin]]
13+
name = "generate-tpch"
14+
path = "src/bin/generate_tpch.rs"
15+
16+
[dependencies]
17+
# Use the parent crate
18+
datafusion-ducklake = { path = "..", features = ["metadata-duckdb"] }
19+
20+
# Core dependencies
21+
datafusion = "50.1.0"
22+
duckdb = { version = "1.4.1", features = ["bundled"] }
23+
tokio = { version = "1", features = ["full"] }
24+
25+
# CLI and reporting
26+
clap = { version = "4", features = ["derive"] }
27+
serde = { version = "1", features = ["derive"] }
28+
serde_json = "1"
29+
csv = "1.3"
30+
chrono = "0.4"
31+
32+
# Utilities
33+
anyhow = "1"
34+
regex = "1"
35+
walkdir = "2"

benchmark/src/benchmark_parser.rs

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
use anyhow::{anyhow, Result};
2+
use std::collections::HashMap;
3+
use std::fs;
4+
use std::path::Path;
5+
6+
/// A single benchmark definition parsed from a DuckDB-style `.benchmark`
/// file (after `# argument:` expansion, one instance per value combination).
#[derive(Debug, Clone)]
pub struct Benchmark {
    /// Benchmark name, from the `# name:` header; falls back to the file path.
    pub name: String,
    /// Optional `# group:` header value.
    pub group: Option<String>,
    /// Optional `# subgroup:` header value.
    pub subgroup: Option<String>,
    /// Optional `# description:` header value.
    pub description: Option<String>,
    /// SQL from the `load` section (setup statements), if present.
    pub load: Option<String>,
    /// SQL from the `run` section — the statement to execute. Required.
    pub run: String,
    /// Expected-result text from the `result` section, if present.
    pub result: Option<String>,
    /// Raw `# argument:` lists: key -> candidate values; expanded into the
    /// cross product of benchmarks by `expand_arguments`.
    pub arguments: HashMap<String, Vec<String>>,
}
17+
18+
pub fn parse_benchmark_file(path: &Path) -> Result<Vec<Benchmark>> {
19+
let content = fs::read_to_string(path)?;
20+
parse_benchmark(&content, path.to_string_lossy().to_string())
21+
}
22+
23+
fn parse_benchmark(content: &str, default_name: String) -> Result<Vec<Benchmark>> {
24+
let mut name = default_name;
25+
let mut group = None;
26+
let mut subgroup = None;
27+
let mut description = None;
28+
let mut load = None;
29+
let mut run = None;
30+
let mut result = None;
31+
let mut arguments: HashMap<String, Vec<String>> = HashMap::new();
32+
33+
let mut current_section: Option<&str> = None;
34+
let mut section_content = String::new();
35+
36+
for line in content.lines() {
37+
let trimmed = line.trim();
38+
39+
// Parse header comments
40+
if trimmed.starts_with("# name:") {
41+
name = trimmed.strip_prefix("# name:").unwrap().trim().to_string();
42+
continue;
43+
}
44+
if trimmed.starts_with("# group:") {
45+
group = Some(trimmed.strip_prefix("# group:").unwrap().trim().to_string());
46+
continue;
47+
}
48+
if trimmed.starts_with("# subgroup:") {
49+
subgroup = Some(trimmed.strip_prefix("# subgroup:").unwrap().trim().to_string());
50+
continue;
51+
}
52+
if trimmed.starts_with("# description:") {
53+
description = Some(trimmed.strip_prefix("# description:").unwrap().trim().to_string());
54+
continue;
55+
}
56+
if trimmed.starts_with("# argument:") {
57+
let arg = trimmed.strip_prefix("# argument:").unwrap().trim();
58+
if let Some((key, values)) = arg.split_once('=') {
59+
let vals: Vec<String> = values.split(',').map(|s| s.trim().to_string()).collect();
60+
arguments.insert(key.trim().to_string(), vals);
61+
}
62+
continue;
63+
}
64+
65+
// Skip other comments
66+
if trimmed.starts_with('#') {
67+
continue;
68+
}
69+
70+
// Section markers
71+
if trimmed == "load" {
72+
save_section(&current_section, &section_content, &mut load, &mut run, &mut result);
73+
current_section = Some("load");
74+
section_content.clear();
75+
continue;
76+
}
77+
if trimmed == "run" {
78+
save_section(&current_section, &section_content, &mut load, &mut run, &mut result);
79+
current_section = Some("run");
80+
section_content.clear();
81+
continue;
82+
}
83+
if trimmed == "result" {
84+
save_section(&current_section, &section_content, &mut load, &mut run, &mut result);
85+
current_section = Some("result");
86+
section_content.clear();
87+
continue;
88+
}
89+
90+
// Accumulate section content
91+
if current_section.is_some() {
92+
if !section_content.is_empty() {
93+
section_content.push('\n');
94+
}
95+
section_content.push_str(line);
96+
}
97+
}
98+
99+
// Save final section
100+
save_section(&current_section, &section_content, &mut load, &mut run, &mut result);
101+
102+
let run_sql = run.ok_or_else(|| anyhow!("No 'run' section found in benchmark"))?;
103+
104+
// Expand arguments into multiple benchmarks
105+
let benchmarks = expand_arguments(Benchmark {
106+
name,
107+
group,
108+
subgroup,
109+
description,
110+
load,
111+
run: run_sql,
112+
result,
113+
arguments,
114+
});
115+
116+
Ok(benchmarks)
117+
}
118+
119+
/// Store accumulated section text into the output slot named by `current`.
///
/// Whitespace-only content is discarded, and an unknown (or absent) section
/// name is silently ignored; otherwise the trimmed text overwrites the slot.
fn save_section(
    current: &Option<&str>,
    content: &str,
    load: &mut Option<String>,
    run: &mut Option<String>,
    result: &mut Option<String>,
) {
    let text = content.trim();
    if text.is_empty() {
        return;
    }

    // Pick the destination slot for this section name, if any.
    let slot = match current {
        Some("load") => load,
        Some("run") => run,
        Some("result") => result,
        _ => return,
    };
    *slot = Some(text.to_string());
}
138+
139+
fn expand_arguments(base: Benchmark) -> Vec<Benchmark> {
140+
if base.arguments.is_empty() {
141+
return vec![base];
142+
}
143+
144+
// Get all argument combinations
145+
let mut combinations: Vec<HashMap<String, String>> = vec![HashMap::new()];
146+
147+
for (key, values) in &base.arguments {
148+
let mut new_combinations = Vec::new();
149+
for combo in &combinations {
150+
for value in values {
151+
let mut new_combo = combo.clone();
152+
new_combo.insert(key.clone(), value.clone());
153+
new_combinations.push(new_combo);
154+
}
155+
}
156+
combinations = new_combinations;
157+
}
158+
159+
// Generate benchmarks for each combination
160+
combinations
161+
.into_iter()
162+
.map(|combo| {
163+
let mut benchmark = base.clone();
164+
165+
// Update name with argument values
166+
let suffix: String = combo.values().cloned().collect::<Vec<_>>().join("_");
167+
if !suffix.is_empty() {
168+
benchmark.name = format!("{}_{}", benchmark.name, suffix);
169+
}
170+
171+
// Substitute variables in SQL
172+
benchmark.run = substitute_vars(&benchmark.run, &combo);
173+
if let Some(ref load) = benchmark.load {
174+
benchmark.load = Some(substitute_vars(load, &combo));
175+
}
176+
177+
benchmark
178+
})
179+
.collect()
180+
}
181+
182+
/// Replace every `${key}` placeholder in `sql` with its value from `vars`.
/// Keys without a placeholder in the SQL are simply ignored.
fn substitute_vars(sql: &str, vars: &HashMap<String, String>) -> String {
    vars.iter().fold(sql.to_string(), |acc, (key, value)| {
        acc.replace(&format!("${{{}}}", key), value)
    })
}
189+
190+
#[cfg(test)]
mod tests {
    use super::*;

    // End-to-end check: a minimal benchmark file with header metadata and
    // load/run/result sections parses into exactly one Benchmark with each
    // section captured and trimmed.
    #[test]
    fn test_parse_benchmark() {
        let content = r#"
# name: test/sum.benchmark
# group: micro
# description: Sum benchmark

load
CREATE TABLE t AS SELECT range i FROM range(1000);

run
SELECT SUM(i) FROM t;

result
499500
"#;

        let benchmarks = parse_benchmark(content, "default".to_string()).unwrap();
        assert_eq!(benchmarks.len(), 1);
        // The `# name:` header overrides the supplied default name.
        assert_eq!(benchmarks[0].name, "test/sum.benchmark");
        assert_eq!(benchmarks[0].group, Some("micro".to_string()));
        assert!(benchmarks[0].load.is_some());
        assert_eq!(benchmarks[0].run, "SELECT SUM(i) FROM t;");
    }
}

benchmark/src/bin/generate_tpch.rs

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
use anyhow::Result;
2+
use clap::Parser;
3+
use duckdb::Connection;
4+
use std::path::PathBuf;
5+
6+
// CLI arguments for the TPC-H DuckLake data generator. The `///` field docs
// double as clap's --help text, so they are part of the user-facing output.
#[derive(Parser)]
#[command(name = "generate-tpch")]
#[command(about = "Generate TPC-H data in DuckLake format")]
struct Args {
    /// Path for the DuckLake catalog database
    #[arg(short, long, default_value = "benchmark/data/tpch.ducklake")]
    catalog: PathBuf,

    /// Path for data files (Parquet storage)
    #[arg(short, long, default_value = "benchmark/data/tpch_files")]
    data_path: PathBuf,

    /// TPC-H scale factor (1 = 1GB, 10 = 10GB, etc.)
    #[arg(short, long, default_value = "1")]
    scale_factor: f64,
}
22+
23+
/// Generate TPC-H data with DuckDB's `tpch` extension and copy it into a
/// fresh DuckLake catalog (catalog file + Parquet data files on disk).
///
/// Any existing catalog file is deleted first, so each run starts clean.
fn main() -> Result<()> {
    let args = Args::parse();

    println!("TPC-H DuckLake Data Generator");
    println!("=============================");
    println!("Catalog: {:?}", args.catalog);
    println!("Data path: {:?}", args.data_path);
    println!("Scale factor: {} (~{}GB)", args.scale_factor, args.scale_factor);
    println!();

    // Ensure directories exist
    std::fs::create_dir_all(&args.data_path)?;
    if let Some(parent) = args.catalog.parent() {
        std::fs::create_dir_all(parent)?;
    }

    // Remove existing catalog if present (regeneration is destructive).
    if args.catalog.exists() {
        std::fs::remove_file(&args.catalog)?;
    }

    // TPC-H data is first generated into this in-memory database, then
    // copied out to the DuckLake catalog below.
    let conn = Connection::open_in_memory()?;

    // Install and load extensions.
    // NOTE(review): INSTALL fetches extensions on first use — presumably this
    // needs network access unless they are already cached locally; confirm.
    println!("Installing extensions...");
    conn.execute_batch(
        r#"
        INSTALL tpch;
        LOAD tpch;
        INSTALL ducklake;
        LOAD ducklake;
    "#,
    )?;

    // Generate TPC-H data in memory
    println!("Generating TPC-H data (SF={})...", args.scale_factor);
    conn.execute_batch(&format!("CALL dbgen(sf={})", args.scale_factor))?;

    // Create DuckLake catalog
    println!("Creating DuckLake catalog...");
    let attach_sql = format!(
        "ATTACH '{}' AS tpch_lake (TYPE ducklake, DATA_PATH '{}')",
        args.catalog.display(),
        args.data_path.display()
    );
    conn.execute(&attach_sql, [])?;

    // Create schema
    conn.execute("CREATE SCHEMA IF NOT EXISTS tpch_lake.main", [])?;

    // Copy TPC-H tables to DuckLake. Table names are a fixed constant list,
    // so formatting them into SQL here is safe.
    let tables = [
        "customer", "lineitem", "nation", "orders",
        "part", "partsupp", "region", "supplier"
    ];

    for table in &tables {
        println!("  Copying {} to DuckLake...", table);
        conn.execute_batch(&format!(
            "CREATE TABLE tpch_lake.main.{} AS SELECT * FROM {}",
            table, table
        ))?;
    }

    println!("\nData generation complete!");
    println!("Catalog saved to: {:?}", args.catalog);

    // Print table statistics (row counts read back from the lake, not the
    // in-memory source, so this also sanity-checks the copy).
    println!("\nTable Statistics:");
    println!("-----------------");
    for table in &tables {
        let count: i64 = conn.query_row(
            &format!("SELECT COUNT(*) FROM tpch_lake.main.{}", table),
            [],
            |row| row.get(0),
        )?;
        println!("  {}: {} rows", table, count);
    }

    // Print data size
    let total_size = dir_size(&args.data_path)?;
    println!("\nTotal data size: {:.2} MB", total_size as f64 / 1_000_000.0);

    Ok(())
}
108+
109+
/// Recursively sum the sizes, in bytes, of all regular files under `path`.
///
/// Symlinks are not followed (their targets are not counted), matching the
/// previous walkdir-based traversal. Takes `&Path` rather than `&PathBuf` so
/// any path-like borrow works; the existing `&PathBuf` call site coerces.
///
/// # Errors
/// Returns the first I/O error hit while listing entries or reading metadata
/// (converts into `anyhow::Error` at the caller's `?`).
fn dir_size(path: &std::path::Path) -> std::io::Result<u64> {
    let mut size = 0;
    for entry in std::fs::read_dir(path)? {
        let entry = entry?;
        // DirEntry::metadata does not traverse symlinks, so a symlink is
        // neither a file nor a dir here and contributes nothing.
        let meta = entry.metadata()?;
        if meta.is_dir() {
            size += dir_size(&entry.path())?;
        } else if meta.is_file() {
            size += meta.len();
        }
    }
    Ok(size)
}

0 commit comments

Comments
 (0)