Skip to content

Commit 44bb644

Browse files
authored
feat: statistical and population genetics benchmark queries and dataset (#4175)
Benchmark results from my MacBook are below. I fear I've done something wrong with the duckdb setup (cc: @joseph-isaacs, who probably knows more about the benchmark setup than I do). This is at the 100,000-row scale. At smaller scales, we compare less favorably. ``` ┌───────────┬───────────────────┬────────────────────────┬────────────────────┐ │ Benchmark │ parquet │ vortex-file-compressed │ duckdb │ ├───────────┼───────────────────┼────────────────────────┼────────────────────┤ │ 0 │ 119973 μs (1.00) │ 4761 μs (0.04) │ 470 μs (0.00) │ ├───────────┼───────────────────┼────────────────────────┼────────────────────┤ │ 1 │ 168210 μs (1.00) │ 17989 μs (0.11) │ 161105 μs (0.96) │ ├───────────┼───────────────────┼────────────────────────┼────────────────────┤ │ 2 │ 495396 μs (1.00) │ 302489 μs (0.61) │ 4536144 μs (9.16) │ ├───────────┼───────────────────┼────────────────────────┼────────────────────┤ │ 3 │ 899974 μs (1.00) │ 721113 μs (0.80) │ 8923671 μs (9.92) │ ├───────────┼───────────────────┼────────────────────────┼────────────────────┤ │ 4 │ 919913 μs (1.00) │ 705867 μs (0.77) │ 8930304 μs (9.71) │ ├───────────┼───────────────────┼────────────────────────┼────────────────────┤ │ 5 │ 593900 μs (1.00) │ 348673 μs (0.59) │ 5345238 μs (9.00) │ ├───────────┼───────────────────┼────────────────────────┼────────────────────┤ │ 6 │ 1841454 μs (1.00) │ 1666978 μs (0.91) │ 17682322 μs (9.60) │ ├───────────┼───────────────────┼────────────────────────┼────────────────────┤ │ 7 │ 1132130 μs (1.00) │ 910304 μs (0.80) │ 11115571 μs (9.82) │ ├───────────┼───────────────────┼────────────────────────┼────────────────────┤ │ 8 │ 544358 μs (1.00) │ 209562 μs (0.38) │ 5928794 μs (10.89) │ ├───────────┼───────────────────┼────────────────────────┼────────────────────┤ │ 9 │ 552524 μs (1.00) │ 232558 μs (0.42) │ 5927320 μs (10.73) │ ├───────────┼───────────────────┼────────────────────────┼────────────────────┤ │ 10 │ 781099 μs (1.00) │ 559999 μs (0.72) │ 7221919 μs (9.25) │ 
└───────────┴───────────────────┴────────────────────────┴────────────────────┘ ``` --------- Signed-off-by: Daniel King <[email protected]> Signed-off-by: Daniel King <[email protected]>
1 parent 82ed173 commit 44bb644

File tree

16 files changed

+1446
-11
lines changed

16 files changed

+1446
-11
lines changed

.github/workflows/sql-benchmarks.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,14 @@ on:
4545
"targets": "datafusion:parquet,datafusion:vortex,duckdb:parquet,duckdb:vortex,duckdb:duckdb",
4646
"scale_factor": "--scale-factor 1.0"
4747
},
48+
{
49+
"id": "statpopgen",
50+
"subcommand": "statpopgen",
51+
"name": "Statistical and Population Genetics",
52+
"local_dir": "bench-vortex/data/statpopgen",
53+
"targets": "duckdb:parquet,duckdb:vortex",
54+
"scale_factor": "--scale-factor 100"
55+
},
4856
]
4957
5058
jobs:

Cargo.lock

Lines changed: 96 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ enum-iterator = "2.0.0"
103103
erased-serde = "0.4"
104104
fastlanes = "0.2.0"
105105
flatbuffers = "25.2.10"
106+
flate2 = "1.1.2"
106107
flume = "0.11"
107108
fsst-rs = "0.5.2"
108109
futures = { version = "0.3.31", default-features = false }
@@ -126,6 +127,8 @@ memmap2 = "0.9.5"
126127
mimalloc = "0.1.42"
127128
moka = { version = "0.12.10", default-features = false }
128129
multiversion = "0.8.0"
130+
noodles-bgzf = "0.42.0"
131+
noodles-vcf = "0.80.0"
129132
num-traits = "0.2.19"
130133
num_enum = { version = "0.7.3", default-features = false }
131134
object_store = { version = "0.12.3", default-features = false }
@@ -180,6 +183,7 @@ tempfile = "3"
180183
thiserror = "2.0.3"
181184
tokio = { version = "1.46" }
182185
tokio-stream = "0.1.17"
186+
tokio-util = { version = "0.7.16" }
183187
# replace these with releases
184188
tpchgen = { git = "https://github.com/clflushopt/tpchgen-rs.git", rev = "d849ff430cd52250f6891ed4d5e3adad77bb2698" }
185189
tpchgen-arrow = { git = "https://github.com/clflushopt/tpchgen-rs.git", rev = "d849ff430cd52250f6891ed4d5e3adad77bb2698" }

bench-vortex/Cargo.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ clap = { workspace = true, features = ["derive"] }
2828
datafusion = { workspace = true, features = [
2929
"parquet",
3030
"datetime_expressions",
31+
"nested_expressions",
3132
] }
3233
datafusion-common = { workspace = true }
3334
datafusion-physical-plan = { workspace = true }
@@ -36,10 +37,12 @@ erased-serde = { workspace = true }
3637
futures = { workspace = true }
3738
glob = { workspace = true }
3839
humansize = { workspace = true }
39-
indicatif = { workspace = true }
40+
indicatif = { workspace = true, features = ["futures"] }
4041
itertools = { workspace = true }
4142
log = { workspace = true, features = ["max_level_debug"] }
4243
mimalloc = { workspace = true }
44+
noodles-bgzf = { workspace = true, features = ["async"] }
45+
noodles-vcf = { workspace = true, features = ["async"] }
4346
object_store = { workspace = true, features = ["aws", "gcp"] }
4447
opentelemetry = { workspace = true }
4548
opentelemetry-otlp = { workspace = true, features = ["trace"] }
@@ -60,6 +63,7 @@ tar = { workspace = true }
6063
target-lexicon = { workspace = true }
6164
tokio = { workspace = true, features = ["full"] }
6265
tokio-stream = { workspace = true }
66+
tokio-util = { workspace = true }
6367
tpchgen = { workspace = true }
6468
tpchgen-arrow = { workspace = true }
6569
tracing = { workspace = true }

bench-vortex/src/bin/query_bench.rs

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,12 @@ use std::path::PathBuf;
66
use bench_vortex::benchmark_driver::{DriverConfig, run_benchmark};
77
use bench_vortex::clickbench::{ClickBenchBenchmark, Flavor};
88
use bench_vortex::display::DisplayFormat;
9+
use bench_vortex::statpopgen::StatPopGenBenchmark;
910
use bench_vortex::tpcds::TpcDsBenchmark;
1011
use bench_vortex::tpch::tpch_benchmark::TpcHBenchmark;
11-
use bench_vortex::{Target, setup_logging_and_tracing};
12+
use bench_vortex::{IdempotentPath as _, Target, setup_logging_and_tracing};
1213
use clap::{Parser, Subcommand, value_parser};
14+
use url::Url;
1315

1416
#[derive(Parser, Debug)]
1517
#[command(version, about = "Vortex query benchmark runner", long_about = None)]
@@ -31,6 +33,10 @@ enum Commands {
3133
/// Run TPC-DS queries
3234
#[command(name = "tpcds")]
3335
TpcDS(TpcDSArgs),
36+
37+
/// Run Statistical & Population Genetics queries
38+
#[command(name = "statpopgen")]
39+
StatPopGen(StatPopGenArgs),
3440
}
3541

3642
/// Common arguments shared across benchmarks
@@ -153,6 +159,39 @@ struct TpcDSArgs {
153159
scale_factor: String,
154160
}
155161

162+
// Command-line arguments for the `statpopgen` subcommand.
//
// NOTE: these are deliberately plain `//` comments; `///` doc comments on clap-derive items are
// picked up as help text and would change the `--help` output.
#[derive(Parser, Debug)]
163+
struct StatPopGenArgs {
164+
// Shared benchmark options, flattened into this subcommand's own flags.
#[command(flatten)]
165+
common: CommonArgs,
166+
167+
// `--targets`: comma-separated list of targets, each parsed as a `Target`. When the flag is
// omitted, the uncommented entries below are used; the commented-out entries record why the
// other targets are not enabled by default.
#[arg(long, value_delimiter = ',', value_parser = value_parser!(Target),
168+
default_values = vec![
169+
// DataFusion does not support list_aggregate and simulating it with an UNNEST and GROUP
170+
// BY is _very_ slow.
171+
//
172+
// "datafusion:parquet",
173+
// "datafusion:vortex",
174+
"duckdb:parquet",
175+
"duckdb:vortex",
176+
// DuckDB vortex-compact files trigger an assertion in pcodec.
177+
//
178+
// "duckdb:vortex-compact",
179+
//
180+
// DuckDB native has a fixed parallelism row group size of 122,880
181+
// rows. Unfortunately, this kind of list-heavy dataset is almost perfectly
182+
// adversarial to that limitation.
183+
//
184+
// https://duckdb.org/docs/stable/guides/performance/how_to_tune_workloads.html#the-effect-of-row-groups-on-parallelism
185+
//
186+
// "duckdb:duckdb"
187+
]
188+
)]
189+
targets: Vec<Target>,
190+
191+
// `--scale-factor`: unsigned integer with no default declared here. `run_statpopgen` passes
// it unchanged to `StatPopGenBenchmark::new`.
#[arg(long)]
192+
scale_factor: u64,
193+
}
194+
156195
fn validate_scale_factor(val: &str) -> Result<String, String> {
157196
match val.parse::<f32>() {
158197
Ok(n) if [0.01, 0.1, 1., 10., 100., 1000.].contains(&n) => {
@@ -180,6 +219,7 @@ fn main() -> anyhow::Result<()> {
180219
Commands::ClickBench(clickbench_args) => run_clickbench(clickbench_args),
181220
Commands::TpcH(tpch_args) => run_tpch(tpch_args),
182221
Commands::TpcDS(tpcds_args) => run_tpcds(tpcds_args),
222+
Commands::StatPopGen(stat_pop_gen_args) => run_statpopgen(stat_pop_gen_args),
183223
}
184224
}
185225

@@ -278,3 +318,34 @@ fn run_tpcds(args: TpcDSArgs) -> anyhow::Result<()> {
278318

279319
Ok(())
280320
}
321+
322+
/// Runs the Statistical & Population Genetics benchmark.
///
/// Initializes logging and tracing, builds a `StatPopGenBenchmark` from the `statpopgen` data
/// path and the requested scale factor, and hands it to `run_benchmark` together with a
/// `DriverConfig` assembled from the parsed arguments.
///
/// # Errors
///
/// Propagates failures from logging/tracing setup and from `StatPopGenBenchmark::new`, fails
/// with "bad data path?" when the data path cannot be converted into a directory URL, and
/// otherwise returns whatever `run_benchmark` returns.
fn run_statpopgen(args: StatPopGenArgs) -> anyhow::Result<()> {
323+
setup_logging_and_tracing(args.common.verbose, args.common.tracing)?;
324+
325+
// Create benchmark instance
326+
let data_url = Url::from_directory_path("statpopgen".to_data_path())
327+
// The conversion's own error value is discarded and replaced with a fixed message.
.map_err(|_| anyhow::anyhow!("bad data path?"))?;
328+
let benchmark = StatPopGenBenchmark::new(data_url, args.scale_factor)?;
329+
330+
// Configure driver
// `targets` comes from this subcommand's own flag; every other field is copied from the
// shared `common` arguments.
331+
let config = DriverConfig {
332+
targets: args.targets,
333+
iterations: args.common.iterations,
334+
threads: args.common.threads,
335+
display_format: args.common.display_format,
336+
disable_datafusion_cache: args.common.disable_datafusion_cache,
337+
delete_duckdb_database: args.common.delete_duckdb_database,
338+
queries: args.common.queries,
339+
exclude_queries: args.common.exclude_queries,
340+
output_path: args.common.output_path,
341+
emit_plan: args.common.emit_plan,
342+
export_spans: args.common.export_spans,
343+
show_metrics: args.common.show_metrics,
344+
hide_progress_bar: args.common.hide_progress_bar,
345+
track_memory: args.common.track_memory,
346+
skip_generate: args.common.skip_generate,
347+
};
348+
349+
// Run benchmark using the trait system
350+
run_benchmark(benchmark, config)
351+
}

bench-vortex/src/datasets/file.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ pub async fn register_vortex_files(
107107
.await?;
108108
}
109109
BenchmarkDataset::PublicBi { .. } => todo!(),
110+
BenchmarkDataset::StatPopGen { .. } => todo!(),
110111
}
111112

112113
Ok(())
@@ -154,6 +155,7 @@ pub async fn register_vortex_compact_files(
154155
.await?;
155156
}
156157
BenchmarkDataset::PublicBi { .. } => todo!(),
158+
BenchmarkDataset::StatPopGen { .. } => todo!(),
157159
}
158160

159161
Ok(())

bench-vortex/src/datasets/mod.rs

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ use url::Url;
1111
use vortex::ArrayRef;
1212

1313
use crate::clickbench::Flavor;
14-
use crate::{Format, clickbench};
14+
use crate::{Format, clickbench, statpopgen};
1515

1616
pub mod data_downloads;
1717
pub mod file;
@@ -36,6 +36,8 @@ pub enum BenchmarkDataset {
3636
ClickBench { flavor: Flavor },
3737
#[serde(rename = "public-bi")]
3838
PublicBi { name: String },
39+
#[serde(rename = "statpopgen")]
40+
StatPopGen { n_rows: u64 },
3941
}
4042

4143
impl BenchmarkDataset {
@@ -45,6 +47,7 @@ impl BenchmarkDataset {
4547
BenchmarkDataset::TpcDS { .. } => "tpcds",
4648
BenchmarkDataset::ClickBench { .. } => "clickbench",
4749
BenchmarkDataset::PublicBi { .. } => "public-bi",
50+
BenchmarkDataset::StatPopGen { .. } => "statpopgen",
4851
}
4952
}
5053
}
@@ -59,6 +62,7 @@ impl Display for BenchmarkDataset {
5962
Flavor::Single => write!(f, "clickbench-single"),
6063
},
6164
BenchmarkDataset::PublicBi { name } => write!(f, "public-bi({name})"),
65+
BenchmarkDataset::StatPopGen { n_rows } => write!(f, "statpopgen(n_rows={n_rows})"),
6266
}
6367
}
6468
}
@@ -92,13 +96,12 @@ impl BenchmarkDataset {
9296
"time_dim",
9397
"web_returns",
9498
],
95-
9699
BenchmarkDataset::TpcH { .. } => &[
97100
"customer", "lineitem", "nation", "orders", "part", "partsupp", "region",
98101
"supplier",
99102
],
100-
101103
BenchmarkDataset::ClickBench { .. } | BenchmarkDataset::PublicBi { .. } => todo!(),
104+
BenchmarkDataset::StatPopGen { .. } => &["statpopgen"],
102105
}
103106
}
104107

@@ -141,6 +144,15 @@ impl BenchmarkDataset {
141144
(BenchmarkDataset::PublicBi { .. }, _) => {
142145
anyhow::bail!("public bi unsupported for now")
143146
}
147+
(BenchmarkDataset::StatPopGen { .. }, Format::Parquet) => {
148+
statpopgen::register_table(session, base_url, Format::Parquet).await?
149+
}
150+
(BenchmarkDataset::StatPopGen { .. }, Format::OnDiskVortex) => {
151+
statpopgen::register_table(session, base_url, Format::OnDiskVortex).await?
152+
}
153+
(BenchmarkDataset::StatPopGen { .. }, format) => {
154+
anyhow::bail!("StatPopGen in {format} unsupported in DataFusion")
155+
}
144156
}
145157

146158
Ok(())

0 commit comments

Comments
 (0)