vortex-data
diff --git a/‎Cargo.lock‎
Lines changed: 12 additions & 0 deletions b/‎Cargo.lock‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎benchmarks/bench-orchestrator/Cargo.toml‎
Lines changed: 26 additions & 0 deletions b/‎benchmarks/bench-orchestrator/Cargo.toml‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎benchmarks/bench-orchestrator/src/aggregator.rs‎
Lines changed: 114 additions & 0 deletions b/‎benchmarks/bench-orchestrator/src/aggregator.rs‎
Lines changed: 114 additions & 0 deletions
diff --git a/‎benchmarks/bench-orchestrator/src/binary.rs‎
Lines changed: 55 additions & 0 deletions b/‎benchmarks/bench-orchestrator/src/binary.rs‎
Lines changed: 55 additions & 0 deletions
diff --git a/‎benchmarks/bench-orchestrator/src/main.rs‎
Lines changed: 153 additions & 0 deletions b/‎benchmarks/bench-orchestrator/src/main.rs‎
Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,26 @@
+[package]
+name = "bench-orchestrator"
+description = "Multi-engine benchmark orchestrator"
+authors = { workspace = true }
+categories = { workspace = true }
+edition = { workspace = true }
+homepage = { workspace = true }
+include = { workspace = true }
+keywords = { workspace = true }
+license = { workspace = true }
+readme = { workspace = true }
+repository = { workspace = true }
+rust-version = { workspace = true }
+version = { workspace = true }
+publish = false
+
+[dependencies]
+anyhow = { workspace = true }
+clap = { workspace = true, features = ["derive"] }
+serde = { workspace = true }
+serde_json = { workspace = true }
+tracing.workspace = true
+vortex-bench = { workspace = true }
+
+[lints]
+workspace = true
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::fs::File;
+use std::io::Write;
+use std::io::stdout;
+use std::path::PathBuf;
+use std::time::Duration;
+
+use vortex_bench::Target;
+use vortex_bench::display::DisplayFormat;
+use vortex_bench::display::print_measurements_json;
+use vortex_bench::display::render_table;
+use vortex_bench::measurements::QueryMeasurement;
+use vortex_bench::measurements::QueryMeasurementJson;
+
+/// Aggregates measurements from multiple benchmark runs.
+pub struct MeasurementAggregator {
+    query_measurements: Vec<QueryMeasurement>,
+}
+
+impl MeasurementAggregator {
+    pub fn new() -> Self {
+        Self {
+            query_measurements: Vec::new(),
+        }
+    }
+
+    /// Parse newline-delimited JSON output and add measurements.
+    pub fn add_json_output(&mut self, json_output: &str) -> anyhow::Result<()> {
+        for line in json_output.lines() {
+            let line = line.trim();
+            if line.is_empty() {
+                continue;
+            }
+
+            let json: QueryMeasurementJson = serde_json::from_str(line)?;
+            let measurement = json_to_query_measurement(json);
+            self.query_measurements.push(measurement);
+        }
+
+        Ok(())
+    }
+
+    /// Export results to the specified output.
+    pub fn export(
+        &self,
+        display_format: &DisplayFormat,
+        output_path: Option<&PathBuf>,
+    ) -> anyhow::Result<()> {
+        // Collect unique targets
+        let mut targets: Vec<Target> = self.query_measurements.iter().map(|m| m.target).collect();
+        targets.sort_by_key(|t| (format!("{:?}", t.engine), format!("{}", t.format)));
+        targets.dedup();
+
+        match output_path {
+            Some(path) => {
+                let file = File::create(path)?;
+                self.write_to(display_format, &targets, file)
+            }
+            None => self.write_to(display_format, &targets, stdout().lock()),
+        }
+    }
+
+    fn write_to<W: Write>(
+        &self,
+        display_format: &DisplayFormat,
+        targets: &[Target],
+        mut output: W,
+    ) -> anyhow::Result<()> {
+        match display_format {
+            DisplayFormat::Table => {
+                render_table(&mut output, self.query_measurements.clone(), targets)?;
+            }
+            DisplayFormat::GhJson => {
+                print_measurements_json(&mut output, self.query_measurements.clone())?;
+            }
+        }
+        Ok(())
+    }
+}
+
+impl Default for MeasurementAggregator {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Convert a QueryMeasurementJson back to a QueryMeasurement.
+fn json_to_query_measurement(json: QueryMeasurementJson) -> QueryMeasurement {
+    // Parse query index from name (format: "dataset_qNN/engine:format")
+    let query_idx = json
+        .name
+        .split("_q")
+        .nth(1)
+        .and_then(|s| s.split('/').next())
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(0);
+
+    // Convert nanosecond runtimes back to Duration
+    let runs: Vec<Duration> = json
+        .all_runtimes
+        .iter()
+        .map(|nanos| Duration::from_nanos(*nanos as u64))
+        .collect();
+
+    QueryMeasurement {
+        query_idx,
+        target: json.target,
+        benchmark_dataset: json.dataset,
+        storage: json.storage,
+        runs,
+    }
+}
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::path::PathBuf;
+
+use anyhow::bail;
+use vortex_bench::Engine;
+use vortex_bench::Format;
+use vortex_bench::workspace_root;
+
+/// Returns the binary name for the given engine and format combination.
+/// The format can override the binary (e.g., Lance format always uses lance-bench).
+pub fn binary_name(engine: Engine, format: Option<Format>) -> Option<&'static str> {
+    match (engine, format) {
+        // Lance format always uses lance-bench, regardless of engine
+        (_, Some(Format::Lance)) => Some("lance-bench"),
+        (Engine::DataFusion, _) => Some("df-bench"),
+        (Engine::DuckDB, _) => Some("ddb-bench"),
+        // Arrow is a display label, not a real engine with a binary
+        (Engine::Arrow, _) => None,
+        // Vortex engine uses the DataFusion binary
+        (Engine::Vortex, _) => Some("df-bench"),
+    }
+}
+
+/// Find the benchmark binary for the given engine and format.
+/// Looks in target/release first, then target/debug.
+pub fn find_benchmark_binary(engine: Engine, format: Option<Format>) -> anyhow::Result<PathBuf> {
+    let name = match binary_name(engine, format) {
+        Some(name) => name,
+        None => bail!(
+            "Engine {:?} does not have a dedicated benchmark binary",
+            engine
+        ),
+    };
+
+    let workspace = workspace_root();
+
+    // Check release first, then debug
+    let release_path = workspace.join("target/release").join(name);
+    if release_path.exists() {
+        return Ok(release_path);
+    }
+
+    bail!(
+        "Could not find {} binary.\n\
+         Expected locations:\n\
+         - {}\n\
+         Build the benchmark first:\n\
+         cargo build --release -p {}",
+        name,
+        release_path.display(),
+        name
+    );
+}
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+mod aggregator;
+mod binary;
+mod subprocess;
+mod validation;
+
+use std::path::PathBuf;
+
+use clap::Parser;
+use clap::value_parser;
+use vortex_bench::BenchmarkArg;
+use vortex_bench::Engine;
+use vortex_bench::Format;
+use vortex_bench::Opt;
+use vortex_bench::display::DisplayFormat;
+
+use crate::aggregator::MeasurementAggregator;
+use crate::subprocess::run_benchmark;
+use crate::validation::filter_formats_for_engine;
+
+#[derive(Parser)]
+#[command(name = "bench-orchestrator")]
+#[command(about = "Multi-engine benchmark orchestrator")]
+struct Args {
+    /// Benchmark to run
+    #[arg(value_enum)]
+    benchmark: BenchmarkArg,
+
+    /// Engines to run (comma-separated)
+    #[arg(long, value_delimiter = ',', default_values_t = vec![Engine::DataFusion, Engine::DuckDB])]
+    engines: Vec<Engine>,
+
+    /// Formats to benchmark (comma-separated, auto-filtered per engine)
+    #[arg(long, value_delimiter = ',', default_values_t = vec![Format::Parquet, Format::OnDiskVortex])]
+    formats: Vec<Format>,
+
+    /// Number of iterations per query
+    #[arg(short, long, default_value_t = 5)]
+    iterations: usize,
+
+    /// Specific queries to run (comma-separated)
+    #[arg(short, long, value_delimiter = ',')]
+    queries: Option<Vec<usize>>,
+
+    /// Queries to exclude (comma-separated)
+    #[arg(short, long, value_delimiter = ',')]
+    exclude_queries: Option<Vec<usize>>,
+
+    /// Display format for output
+    #[arg(short, long, default_value_t, value_enum)]
+    display_format: DisplayFormat,
+
+    /// Output file path (stdout if not specified)
+    #[arg(short, long)]
+    output_path: Option<PathBuf>,
+
+    /// Benchmark options (passed through to benchmark binaries)
+    #[arg(long, value_delimiter = ',', value_parser = value_parser!(Opt))]
+    options: Vec<Opt>,
+
+    /// Verbose output
+    #[arg(short, long)]
+    verbose: bool,
+
+    /// Track memory usage
+    #[arg(long, default_value_t = false)]
+    track_memory: bool,
+
+    /// Error on unsupported engine/format combinations (default: warn and skip)
+    #[arg(long, default_value_t = false)]
+    strict: bool,
+}
+
+fn main() -> anyhow::Result<()> {
+    let args = Args::parse();
+
+    let mut aggregator = MeasurementAggregator::new();
+
+    for engine in &args.engines {
+        let supported_formats =
+            match filter_formats_for_engine(*engine, &args.formats, args.strict)? {
+                Some(formats) => formats,
+                None => {
+                    if args.verbose {
+                        eprintln!("Skipping engine {engine}: no benchmark binary available");
+                    }
+                    continue;
+                }
+            };
+
+        if supported_formats.is_empty() {
+            if args.verbose {
+                eprintln!(
+                    "Skipping engine {engine}: no supported formats from {:?}",
+                    args.formats
+                );
+            }
+            continue;
+        }
+
+        // Split Lance from other formats (lance-bench is a separate binary)
+        let has_lance = supported_formats.contains(&Format::Lance);
+        let other_formats: Vec<_> = supported_formats
+            .iter()
+            .filter(|f| **f != Format::Lance)
+            .copied()
+            .collect();
+
+        // Run lance-bench for Lance format
+        if has_lance {
+            if args.verbose {
+                eprintln!("Running lance-bench for {engine}",);
+            }
+
+            let json_output = run_benchmark(
+                *engine,
+                None, // lance-bench doesn't accept --formats
+                args.benchmark,
+                args.iterations,
+                args.queries.as_ref(),
+                args.exclude_queries.as_ref(),
+                &args.options,
+                args.track_memory,
+                args.verbose,
+            )?;
+
+            aggregator.add_json_output(&json_output)?;
+        }
+
+        // Run engine's binary for other formats
+        if !other_formats.is_empty() {
+            let json_output = run_benchmark(
+                *engine,
+                Some(&other_formats),
+                args.benchmark,
+                args.iterations,
+                args.queries.as_ref(),
+                args.exclude_queries.as_ref(),
+                &args.options,
+                args.track_memory,
+                args.verbose,
+            )?;
+
+            aggregator.add_json_output(&json_output)?;
+        }
+    }
+
+    aggregator.export(&args.display_format, args.output_path.as_ref())?;
+
+    Ok(())
+}