Skip to content

Commit b351f77

Browse files
authored
feat(bench): add new benchmarking script, harness, and profiling guide (#3840)
# Description This redoes the merge-based benchmark in crates/benchmark, replacing it with `divan` as a real harness combined with adding a script that can be used for profiling. # Related Issue(s) Closes #3839 # Documentation Documentation is included in the updated README --------- Signed-off-by: Abhi Agarwal <[email protected]>
1 parent 4acc60b commit b351f77

File tree

14 files changed

+422
-718
lines changed

14 files changed

+422
-718
lines changed

.gitignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ __blobstorage__
2222
.githubchangeloggenerator.cache.log
2323
.githubchangeloggenerator.cache/
2424
.githubchangeloggenerator*
25-
data
2625
.zed/
2726

2827
# Add all Cargo.lock files except for those in binary crates

Cargo.toml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ regex = { version = "1" }
6666
thiserror = { version = "2" }
6767
url = { version = "2" }
6868
percent-encoding-rfc3986 = { version = "0.1.3" }
69+
tempfile = { version = "3" }
6970
uuid = { version = "1" }
7071

7172
# runtime / async
@@ -101,3 +102,11 @@ Arro3 = "Arro3"
101102
AKS = "AKS"
102103
# to avoid using 'type' as a field name.
103104
tpe = "tpe"
105+
106+
# for better flamegraphs when benchmarking
107+
[profile.bench]
108+
debug = true
109+
110+
[profile.profiling]
111+
inherits = "release"
112+
debug = true

crates/benchmarks/Cargo.toml

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,23 +7,22 @@ license = "Apache-2.0"
77
keywords = ["deltalake", "delta", "datalake"]
88
description = "Delta-rs Benchmarks"
99
edition = "2021"
10+
publish = false
1011

1112
[dependencies]
12-
clap = { version = "4", features = [ "derive" ] }
13-
chrono = { version = "0.4.31", default-features = false, features = ["clock"] }
14-
tokio = { version = "1", features = ["fs", "macros", "rt", "io-util"] }
15-
16-
# arrow
17-
arrow = { workspace = true }
18-
arrow-array = { workspace = true }
19-
20-
# serde
21-
serde_json = { workspace = true }
22-
23-
# datafusion
24-
datafusion = { workspace = true }
13+
clap = { version = "4", features = ["derive"] }
14+
tokio = { workspace = true, features = ["fs", "macros", "rt", "io-util"] }
15+
url = { workspace = true }
16+
tempfile = { workspace = true }
2517

2618
[dependencies.deltalake-core]
2719
path = "../core"
2820
version = "0"
2921
features = ["datafusion"]
22+
23+
[dev-dependencies]
24+
divan = "0.1"
25+
26+
[[bench]]
27+
name = "merge"
28+
harness = false

crates/benchmarks/README.md

Lines changed: 47 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -4,52 +4,66 @@ The merge benchmarks are similar to the ones used by [Delta Spark](https://githu
44

55
## Dataset
66

7-
Databricks maintains a public S3 bucket of the TPC-DS dataset with various factor where requesters must pay to download this dataset. Below is an example of how to list the 1gb scale factor
7+
To generate the database, `duckdb` can be used. Install `duckdb` by following [these instructions](https://duckdb.org/#quickinstall).
88

9+
Run the following commands:
10+
11+
```bash
12+
❯ duckdb
13+
D CALL dsdgen(sf = 1);
14+
100% ▕██████████████████████████████████████▏ (00:00:05.76 elapsed)
15+
┌─────────┐
16+
│ Success │
17+
│ boolean │
18+
├─────────┤
19+
│ 0 rows │
20+
└─────────┘
21+
D EXPORT DATABASE 'tpcds_parquet' (FORMAT PARQUET);
922
```
10-
aws s3api list-objects --bucket devrel-delta-datasets --request-payer requester --prefix tpcds-2.13/tpcds_sf1_parquet/web_returns/
11-
```
1223

13-
You can generate the TPC-DS dataset yourself by downloading and compiling [the generator](https://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp)
14-
You may need to update the CFLAGS to include `-fcommon` to compile on newer versions of GCC.
24+
This will generate a folder called `tpcds_parquet` containing many parquet files. Place it at `crates/benchmarks/data/tpcds_parquet` (or set `TPCDS_PARQUET_DIR`). Credits to [Xuanwo's Blog](https://xuanwo.io/links/2025/02/duckdb-is-the-best-tpc-data-generator/).
25+
26+
## Running benchmarks
1527

16-
## Commands
17-
These commands can be executed from the root of the benchmark crate. Some commands depend on the existence of the TPC-DS Dataset existing.
28+
Benchmarks use Divan and time only the merge operation. A temporary Delta table is created per iteration from `web_returns.parquet` and removed afterwards.
1829

19-
### Convert
20-
Converts a TPC-DS web_returns csv into a Delta table
21-
Assumes the dataset is pipe delimited and records do not have a trailing delimiter
30+
Environment variables:
31+
- `TPCDS_PARQUET_DIR` (optional): directory containing `web_returns.parquet`. Default: `crates/benchmarks/data/tpcds_parquet`.
2232

33+
From the repo root:
2334
```
24-
cargo run --release --bin merge -- convert data/tpcds/web_returns.dat data/web_returns
35+
cargo bench -p delta-benchmarks --bench merge
2536
```
2637

27-
### Standard
28-
Execute the standard merge bench suite.
29-
Results can be saved to a delta table for further analysis.
30-
This table has the following schema:
31-
32-
group_id: Used to group all tests that executed as a part of this call. Default value is the timestamp of execution
33-
name: The benchmark name that was executed
34-
sample: The iteration number for a given benchmark name
35-
duration_ms: How long the benchmark took in ms
36-
data: Free field to pack any additional data
37-
38+
Filter a specific suite:
3839
```
39-
cargo run --release --bin merge -- standard data/web_returns 1 data/merge_results
40+
cargo bench -p delta-benchmarks --bench merge -- delete_only
41+
cargo bench -p delta-benchmarks --bench merge -- multiple_insert_only
42+
cargo bench -p delta-benchmarks --bench merge -- upsert_file_matched
4043
```
4144

42-
### Compare
43-
Compare the results of two different runs.
44-
The a Delta table paths and the `group_id` of each run and obtain the speedup for each test case
45+
## Profiling script
4546

46-
```
47-
cargo run --release --bin merge -- compare data/benchmarks/ 1698636172801 data/benchmarks/ 1699759539902
47+
A simple CLI is available to run a single merge with configurable parameters (useful for profiling or ad-hoc runs). It creates a fresh temporary Delta table per sample from `web_returns.parquet`, times only the merge, and prints duration and metrics.
48+
49+
Run (from repo root):
50+
```bash
51+
cargo run --profile profiling -p delta-benchmarks -- upsert --matched 0.01 --not-matched 0.10
4852
```
4953

50-
### Show
51-
Show all benchmarks results from a delta table
54+
Options:
55+
- `upsert | delete | insert`: operation to benchmark
56+
- `--matched <fraction>`: fraction of rows that match existing keys (default 0.01)
57+
- `--not-matched <fraction>`: fraction of rows that do not match (default 0.10)
5258

53-
```
54-
cargo run --release --bin merge -- show data/benchmark
55-
```
59+
### Flamegraphs using `samply`
60+
61+
Using `samply`, you can generate flamegraphs from the profiling script.
62+
63+
To start, run:
64+
65+
```bash
66+
cargo install samply --locked
67+
cargo build --profile profiling -p delta-benchmarks
68+
samply record ./target/profiling/delta-benchmarks upsert
69+
```

crates/benchmarks/benches/merge.rs

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
use std::path::PathBuf;
2+
3+
use delta_benchmarks::{
4+
merge_delete, merge_insert, merge_upsert, prepare_source_and_table, MergeOp, MergePerfParams,
5+
};
6+
7+
use divan::{AllocProfiler, Bencher};
8+
9+
fn main() {
10+
divan::main();
11+
}
12+
13+
#[global_allocator]
14+
static ALLOC: AllocProfiler = AllocProfiler::system();
15+
16+
fn bench_merge(bencher: Bencher, op: MergeOp, params: &MergePerfParams) {
17+
let rt = tokio::runtime::Runtime::new().unwrap();
18+
bencher
19+
.with_inputs(|| {
20+
let tmp_dir = tempfile::tempdir().unwrap();
21+
let parquet_dir = PathBuf::from(
22+
std::env::var("TPCDS_PARQUET_DIR")
23+
.unwrap_or_else(|_| "data/tpcds_parquet".to_string()),
24+
);
25+
rt.block_on(async move {
26+
let (source, table) = prepare_source_and_table(params, &tmp_dir, &parquet_dir)
27+
.await
28+
.unwrap();
29+
(source, table, tmp_dir)
30+
})
31+
})
32+
.bench_local_values(|(source, table, tmp_dir)| {
33+
rt.block_on(async move {
34+
let _ = divan::black_box(op(source, table).unwrap().await.unwrap());
35+
});
36+
drop(tmp_dir);
37+
});
38+
}
39+
40+
#[divan::bench(args = [
41+
MergePerfParams {
42+
sample_matched_rows: 0.05,
43+
sample_not_matched_rows: 0.0,
44+
}
45+
])]
46+
fn delete_only(bencher: Bencher, params: &MergePerfParams) {
47+
bench_merge(bencher, merge_delete, params);
48+
}
49+
50+
#[divan::bench(args = [
51+
MergePerfParams {
52+
sample_matched_rows: 0.00,
53+
sample_not_matched_rows: 0.05,
54+
},
55+
MergePerfParams {
56+
sample_matched_rows: 0.00,
57+
sample_not_matched_rows: 0.50,
58+
},
59+
MergePerfParams {
60+
sample_matched_rows: 0.00,
61+
sample_not_matched_rows: 1.0,
62+
},
63+
])]
64+
fn multiple_insert_only(bencher: Bencher, params: &MergePerfParams) {
65+
bench_merge(bencher, merge_insert, params);
66+
}
67+
68+
#[divan::bench(args = [
69+
MergePerfParams {
70+
sample_matched_rows: 0.01,
71+
sample_not_matched_rows: 0.1,
72+
},
73+
MergePerfParams {
74+
sample_matched_rows: 0.1,
75+
sample_not_matched_rows: 0.0,
76+
},
77+
MergePerfParams {
78+
sample_matched_rows: 0.1,
79+
sample_not_matched_rows: 0.01,
80+
},
81+
MergePerfParams {
82+
sample_matched_rows: 0.5,
83+
sample_not_matched_rows: 0.001,
84+
},
85+
MergePerfParams {
86+
sample_matched_rows: 0.99,
87+
sample_not_matched_rows: 0.001,
88+
},
89+
MergePerfParams {
90+
sample_matched_rows: 0.001,
91+
sample_not_matched_rows: 0.001,
92+
},
93+
])]
94+
fn upsert_file_matched(bencher: Bencher, params: &MergePerfParams) {
95+
bench_merge(bencher, merge_upsert, params);
96+
}

crates/benchmarks/data/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
*
2+
!.gitignore

0 commit comments

Comments
 (0)