Skip to content

Commit 7df3e5c

Browse files
authored
Add benchmark for memory-limited aggregation (#13090)
* Adding benchmark for external aggregation * comments
1 parent d2511b2 commit 7df3e5c

File tree

4 files changed

+450
-5
lines changed

4 files changed

+450
-5
lines changed

benchmarks/README.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,34 @@ This benchmark is derived from the [TPC-H][1] version
352352
[2]: https://github.com/databricks/tpch-dbgen.git,
353353
[2.17.1]: https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.1.pdf
354354

355+
## External Aggregation
356+
357+
Run the benchmark for aggregations with limited memory.
358+
359+
When the memory limit is exceeded, the intermediate aggregation results are spilled to disk and finally read back using sort-merge.
360+
361+
The external aggregation benchmark runs several aggregation queries with different memory limits on the TPC-H `lineitem` table. The queries can be found in [`external_aggr.rs`](src/bin/external_aggr.rs).
362+
363+
This benchmark is inspired by [DuckDB's external aggregation paper](https://hannes.muehleisen.org/publications/icde2024-out-of-core-kuiper-boncz-muehleisen.pdf), specifically Section VI.
364+
365+
### External Aggregation Example Runs
366+
1. Run all queries with predefined memory limits:
367+
```bash
368+
# Under 'benchmarks/' directory
369+
cargo run --release --bin external_aggr -- benchmark -n 4 --iterations 3 -p '....../data/tpch_sf1' -o '/tmp/aggr.json'
370+
```
371+
372+
2. Run a query with a specific memory limit:
373+
```bash
374+
cargo run --release --bin external_aggr -- benchmark -n 4 --iterations 3 -p '....../data/tpch_sf1' -o '/tmp/aggr.json' --query 1 --memory-limit 30M
375+
```
376+
377+
3. Run all queries with the `bench.sh` script:
378+
```bash
379+
./bench.sh data external_aggr
380+
./bench.sh run external_aggr
381+
```
382+
355383

356384
# Older Benchmarks
357385

benchmarks/bench.sh

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ sort: Benchmark of sorting speed
7878
clickbench_1: ClickBench queries against a single parquet file
7979
clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
8080
clickbench_extended: ClickBench \"inspired\" queries against a single parquet (DataFusion specific)
81+
external_aggr: External aggregation benchmark
8182
8283
**********
8384
* Supported Configuration (Environment Variables)
@@ -170,6 +171,10 @@ main() {
170171
imdb)
171172
data_imdb
172173
;;
174+
external_aggr)
175+
# same data as for tpch
176+
data_tpch "1"
177+
;;
173178
*)
174179
echo "Error: unknown benchmark '$BENCHMARK' for data generation"
175180
usage
@@ -212,6 +217,7 @@ main() {
212217
run_clickbench_partitioned
213218
run_clickbench_extended
214219
run_imdb
220+
run_external_aggr
215221
;;
216222
tpch)
217223
run_tpch "1"
@@ -243,6 +249,9 @@ main() {
243249
imdb)
244250
run_imdb
245251
;;
252+
external_aggr)
253+
run_external_aggr
254+
;;
246255
*)
247256
echo "Error: unknown benchmark '$BENCHMARK' for run"
248257
usage
@@ -524,7 +533,21 @@ run_imdb() {
524533
$CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations 5 --path "${IMDB_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}"
525534
}
526535

527-
536+
# Runs the external aggregation benchmark
537+
run_external_aggr() {
538+
# Use TPC-H SF1 dataset
539+
TPCH_DIR="${DATA_DIR}/tpch_sf1"
540+
RESULTS_FILE="${RESULTS_DIR}/external_aggr.json"
541+
echo "RESULTS_FILE: ${RESULTS_FILE}"
542+
echo "Running external aggregation benchmark..."
543+
544+
# Only parquet is supported.
545+
# Since per-operator memory limit is calculated as (total-memory-limit /
546+
# number-of-partitions), and by default `--partitions` is set to number of
547+
# CPU cores, we set a constant number of partitions to prevent this
548+
# benchmark from failing on some machines.
549+
$CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
550+
}
528551

529552

530553
compare_benchmarks() {

0 commit comments

Comments
 (0)