Skip to content

Commit 226afc6

Browse files
authored
Add H2O.ai Database-like Ops benchmark to dfbench (groupby support) (#13996)
* Add H2O.ai Database-like Ops benchmark to dfbench * Fix query and fmt * Change venv * Make sure venv version support falsa * Fix default path * Support groupby only now * fix * Address comments * fix * support python version higher * support higer python such as python 3.13 * Addressed new comments * Add specific query example
1 parent 17446ad commit 226afc6

File tree

8 files changed

+379
-154
lines changed

8 files changed

+379
-154
lines changed

benchmarks/README.md

Lines changed: 37 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ DataFusion is included in the benchmark setups for several popular
3232
benchmarks that compare performance with other engines. For example:
3333

3434
* [ClickBench] scripts are in the [ClickBench repo](https://github.com/ClickHouse/ClickBench/tree/main/datafusion)
35-
* [H2o.ai `db-benchmark`] scripts are in [db-benchmark](db-benchmark) directory
35+
* [H2o.ai `db-benchmark`] scripts are in [db-benchmark](https://github.com/apache/datafusion/blob/main/benchmarks/src/h2o.rs)
3636

3737
[ClickBench]: https://github.com/ClickHouse/ClickBench/tree/main
3838
[H2o.ai `db-benchmark`]: https://github.com/h2oai/db-benchmark
@@ -405,31 +405,50 @@ cargo run --release --bin external_aggr -- benchmark -n 4 --iterations 3 -p '...
405405
```
406406

407407

408-
# Older Benchmarks
408+
## h2o benchmarks for groupby
409409

410-
## h2o benchmarks
410+
### Generate data for h2o benchmarks
411+
There are three options for generating data for h2o benchmarks: `small`, `medium`, and `big`. The data is generated in the `data` directory.
411412

413+
1. Generate small data (1e7 rows)
412414
```bash
413-
cargo run --release --bin h2o group-by --query 1 --path /mnt/bigdata/h2oai/N_1e7_K_1e2_single.csv --mem-table --debug
415+
./bench.sh data h2o_small
414416
```
415417

416-
Example run:
417418

419+
2. Generate medium data (1e8 rows)
420+
```bash
421+
./bench.sh data h2o_medium
422+
```
423+
424+
425+
3. Generate large data (1e9 rows)
426+
```bash
427+
./bench.sh data h2o_big
428+
```
429+
430+
### Run h2o benchmarks
431+
There are three options for running h2o benchmarks: `small`, `medium`, and `big`.
432+
1. Run small data benchmark
433+
```bash
434+
./bench.sh run h2o_small
418435
```
419-
Running benchmarks with the following options: GroupBy(GroupBy { query: 1, path: "/mnt/bigdata/h2oai/N_1e7_K_1e2_single.csv", debug: false })
420-
Executing select id1, sum(v1) as v1 from x group by id1
421-
+-------+--------+
422-
| id1 | v1 |
423-
+-------+--------+
424-
| id063 | 199420 |
425-
| id094 | 200127 |
426-
| id044 | 198886 |
427-
...
428-
| id093 | 200132 |
429-
| id003 | 199047 |
430-
+-------+--------+
431436

432-
h2o groupby query 1 took 1669 ms
437+
2. Run medium data benchmark
438+
```bash
439+
./bench.sh run h2o_medium
440+
```
441+
442+
3. Run large data benchmark
443+
```bash
444+
./bench.sh run h2o_big
445+
```
446+
447+
4. Run a specific query with a specific data path
448+
449+
For example, to run query 1 with the small data generated above:
450+
```bash
451+
cargo run --release --bin dfbench -- h2o --path ./benchmarks/data/h2o/G1_1e7_1e7_100_0.csv --query 1
433452
```
434453

435454
[1]: http://www.tpc.org/tpch/

benchmarks/bench.sh

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,9 @@ clickbench_1: ClickBench queries against a single parquet file
8080
clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
8181
clickbench_extended: ClickBench \"inspired\" queries against a single parquet (DataFusion specific)
8282
external_aggr: External aggregation benchmark
83+
h2o_small: h2oai benchmark with small dataset (1e7 rows), default file format is csv
84+
h2o_medium: h2oai benchmark with medium dataset (1e8 rows), default file format is csv
85+
h2o_big: h2oai benchmark with large dataset (1e9 rows), default file format is csv
8386
8487
**********
8588
* Supported Configuration (Environment Variables)
@@ -142,6 +145,9 @@ main() {
142145
all)
143146
data_tpch "1"
144147
data_tpch "10"
148+
data_h2o "SMALL"
149+
data_h2o "MEDIUM"
150+
data_h2o "BIG"
145151
data_clickbench_1
146152
data_clickbench_partitioned
147153
data_imdb
@@ -172,6 +178,15 @@ main() {
172178
imdb)
173179
data_imdb
174180
;;
181+
h2o_small)
182+
data_h2o "SMALL" "CSV"
183+
;;
184+
h2o_medium)
185+
data_h2o "MEDIUM" "CSV"
186+
;;
187+
h2o_big)
188+
data_h2o "BIG" "CSV"
189+
;;
175190
external_aggr)
176191
# same data as for tpch
177192
data_tpch "1"
@@ -221,6 +236,9 @@ main() {
221236
run_clickbench_1
222237
run_clickbench_partitioned
223238
run_clickbench_extended
239+
run_h2o "SMALL" "PARQUET" "groupby"
240+
run_h2o "MEDIUM" "PARQUET" "groupby"
241+
run_h2o "BIG" "PARQUET" "groupby"
224242
run_imdb
225243
run_external_aggr
226244
;;
@@ -254,6 +272,15 @@ main() {
254272
imdb)
255273
run_imdb
256274
;;
275+
h2o_small)
276+
run_h2o "SMALL" "CSV" "groupby"
277+
;;
278+
h2o_medium)
279+
run_h2o "MEDIUM" "CSV" "groupby"
280+
;;
281+
h2o_big)
282+
run_h2o "BIG" "CSV" "groupby"
283+
;;
257284
external_aggr)
258285
run_external_aggr
259286
;;
@@ -541,6 +568,125 @@ run_imdb() {
541568
$CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations 5 --path "${IMDB_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}"
542569
}
543570

571+
# Generate the H2O.ai db-benchmark test data with the `falsa` Python generator.
#   $1 SIZE        - SMALL (1e7 rows), MEDIUM (1e8) or BIG (1e9); defaults to SMALL
#   $2 DATA_FORMAT - output format passed to falsa (e.g. CSV); defaults to CSV
# Requires Python >= 3.10. Creates/reuses a venv under $PWD/venv, installs
# falsa into it, and writes the generated files into ${DATA_DIR}/h2o.
# Returns 1 when no suitable Python interpreter can be found.
data_h2o() {
    # Default values for size and data format
    SIZE=${1:-"SMALL"}
    DATA_FORMAT=${2:-"CSV"}

    # Succeeds when version $1 >= version $2 (version-sort comparison).
    version_ge() {
        [ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" = "$2" ]
    }

    # Lets PyO3-based wheels install on Python versions newer than they declare
    # support for (e.g. Python 3.13).
    export PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1

    # Find the highest available Python version (3.10 or higher)
    REQUIRED_VERSION="3.10"
    PYTHON_CMD=$(command -v python3 || true)

    # First try the default `python3` on PATH.
    if [ -n "$PYTHON_CMD" ]; then
        PYTHON_VERSION=$($PYTHON_CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
        if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
            echo "Found Python version $PYTHON_VERSION, which is suitable."
        else
            echo "Python version $PYTHON_VERSION found, but version $REQUIRED_VERSION or higher is required."
            PYTHON_CMD=""
        fi
    fi

    # Search for suitable Python versions if the default is unsuitable
    if [ -z "$PYTHON_CMD" ]; then
        # Loop through all available Python3 commands on the system.
        # NOTE: `compgen -c` is bash-specific; it enumerates every
        # python3 / python3.X command visible on PATH.
        for CMD in $(compgen -c | grep -E '^python3(\.[0-9]+)?$'); do
            if command -v "$CMD" &> /dev/null; then
                PYTHON_VERSION=$($CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
                if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
                    PYTHON_CMD="$CMD"
                    echo "Found suitable Python version: $PYTHON_VERSION ($CMD)"
                    break
                fi
            fi
        done
    fi

    # If no suitable Python version found, exit with an error
    if [ -z "$PYTHON_CMD" ]; then
        echo "Python 3.10 or higher is required. Please install it."
        return 1
    fi

    echo "Using Python command: $PYTHON_CMD"

    # Install falsa and other dependencies
    echo "Installing falsa..."

    # Set virtual environment directory
    VIRTUAL_ENV="${PWD}/venv"

    # Create a virtual environment using the detected Python command
    # (idempotent: re-running against an existing venv directory is fine).
    $PYTHON_CMD -m venv "$VIRTUAL_ENV"

    # Activate the virtual environment and install dependencies
    source "$VIRTUAL_ENV/bin/activate"

    # Install (or upgrade) falsa inside the venv; --quiet keeps logs short.
    pip install --quiet --upgrade falsa

    # Create directory if it doesn't exist
    H2O_DIR="${DATA_DIR}/h2o"
    mkdir -p "${H2O_DIR}"

    # Generate h2o test data
    echo "Generating h2o test data in ${H2O_DIR} with size=${SIZE} and format=${DATA_FORMAT}"
    falsa groupby --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"

    # Deactivate virtual environment after completion
    deactivate
}
646+
647+
## TODO: currently only the groupby queries are supported; once
## https://github.com/mrpowers-io/falsa/issues/21 is done we can add join support.

# Run the H2O.ai db-benchmark queries through the dfbench `h2o` subcommand.
#   $1 SIZE        - SMALL|MEDIUM|BIG (default SMALL); selects the generated input file
#   $2 DATA_FORMAT - file format/extension of the generated data (default CSV)
#   $3 RUN_TYPE    - query set to run; only "groupby" is supported for now
# Results are written to ${RESULTS_DIR}/h2o.json. Returns 1 on an invalid size.
run_h2o() {
    # Default values for size and data format
    SIZE=${1:-"SMALL"}
    DATA_FORMAT=${2:-"CSV"}
    # falsa writes files with a lowercase extension
    DATA_FORMAT=$(echo "$DATA_FORMAT" | tr '[:upper:]' '[:lower:]')
    RUN_TYPE=${3:-"groupby"}

    # Data directory and results file path
    H2O_DIR="${DATA_DIR}/h2o"
    RESULTS_FILE="${RESULTS_DIR}/h2o.json"

    echo "RESULTS_FILE: ${RESULTS_FILE}"
    echo "Running h2o benchmark..."

    # Set the file name based on the size
    case "$SIZE" in
        "SMALL")
            FILE_NAME="G1_1e7_1e7_100_0.${DATA_FORMAT}" # For small dataset
            ;;
        "MEDIUM")
            FILE_NAME="G1_1e8_1e8_100_0.${DATA_FORMAT}" # For medium dataset
            ;;
        "BIG")
            FILE_NAME="G1_1e9_1e9_100_0.${DATA_FORMAT}" # For big dataset
            ;;
        *)
            echo "Invalid size. Valid options are SMALL, MEDIUM, or BIG."
            return 1
            ;;
    esac

    # Set the query file name based on the RUN_TYPE
    QUERY_FILE="${SCRIPT_DIR}/queries/h2o/${RUN_TYPE}.sql"

    # Run the benchmark using the dynamically constructed file path and query file
    $CARGO_COMMAND --bin dfbench -- h2o \
        --iterations 3 \
        --path "${H2O_DIR}/${FILE_NAME}" \
        --queries-path "${QUERY_FILE}" \
        -o "${RESULTS_FILE}"
}
689+
544690
# Runs the external aggregation benchmark
545691
run_external_aggr() {
546692
# Use TPC-H SF1 dataset

benchmarks/queries/h2o/groupby.sql

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
-- H2O.ai db-benchmark "groupby" queries (q1-q10).
-- The generated dataset is registered as table `x`.
-- NOTE(review): comments assume the runner splits this file on ';' and the SQL
-- parser accepts `--` line comments — confirm against the dfbench h2o runner.

-- q1: sum of v1 by id1
SELECT id1, SUM(v1) AS v1 FROM x GROUP BY id1;
-- q2: sum of v1 by id1, id2
SELECT id1, id2, SUM(v1) AS v1 FROM x GROUP BY id1, id2;
-- q3: sum of v1 and mean of v3 by id3
SELECT id3, SUM(v1) AS v1, AVG(v3) AS v3 FROM x GROUP BY id3;
-- q4: mean of v1, v2, v3 by id4
SELECT id4, AVG(v1) AS v1, AVG(v2) AS v2, AVG(v3) AS v3 FROM x GROUP BY id4;
-- q5: sum of v1, v2, v3 by id6
SELECT id6, SUM(v1) AS v1, SUM(v2) AS v2, SUM(v3) AS v3 FROM x GROUP BY id6;
-- q6: median and standard deviation of v3 by id4, id5
SELECT id4, id5, MEDIAN(v3) AS median_v3, STDDEV(v3) AS sd_v3 FROM x GROUP BY id4, id5;
-- q7: max(v1) - min(v2) by id3
SELECT id3, MAX(v1) - MIN(v2) AS range_v1_v2 FROM x GROUP BY id3;
-- q8: top 2 non-null values of v3 per id6 (ROW_NUMBER window)
SELECT id6, largest2_v3 FROM (SELECT id6, v3 AS largest2_v3, ROW_NUMBER() OVER (PARTITION BY id6 ORDER BY v3 DESC) AS order_v3 FROM x WHERE v3 IS NOT NULL) sub_query WHERE order_v3 <= 2;
-- q9: squared correlation of v1 and v2 by id2, id4
SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM x GROUP BY id2, id4;
-- q10: sum of v3 and row count by all six id columns
SELECT id1, id2, id3, id4, id5, id6, SUM(v3) AS v3, COUNT(*) AS count FROM x GROUP BY id1, id2, id3, id4, id5, id6;

benchmarks/queries/h2o/join.sql

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
-- H2O.ai db-benchmark "join" queries: table `x` joined against the `small`,
-- `medium`, and `large` lookup tables.
-- NOTE(review): these are not yet wired into bench.sh (see the TODO in
-- run_h2o referencing falsa issue #21); comments assume `--` line comments
-- are accepted by the query runner — confirm before relying on this file.

-- q1: inner join x with small on id1
SELECT x.id1, x.id2, x.id3, x.id4 as xid4, small.id4 as smallid4, x.id5, x.id6, x.v1, small.v2 FROM x INNER JOIN small ON x.id1 = small.id1;
-- q2: inner join x with medium on id2
SELECT x.id1 as xid1, medium.id1 as mediumid1, x.id2, x.id3, x.id4 as xid4, medium.id4 as mediumid4, x.id5 as xid5, medium.id5 as mediumid5, x.id6, x.v1, medium.v2 FROM x INNER JOIN medium ON x.id2 = medium.id2;
-- q3: left join x with medium on id2
SELECT x.id1 as xid1, medium.id1 as mediumid1, x.id2, x.id3, x.id4 as xid4, medium.id4 as mediumid4, x.id5 as xid5, medium.id5 as mediumid5, x.id6, x.v1, medium.v2 FROM x LEFT JOIN medium ON x.id2 = medium.id2;
-- q4: inner join x with medium on id5
SELECT x.id1 as xid1, medium.id1 as mediumid1, x.id2, x.id3, x.id4 as xid4, medium.id4 as mediumid4, x.id5 as xid5, medium.id5 as mediumid5, x.id6, x.v1, medium.v2 FROM x JOIN medium ON x.id5 = medium.id5;
-- q5: inner join x with large on id3
SELECT x.id1 as xid1, large.id1 as largeid1, x.id2 as xid2, large.id2 as largeid2, x.id3, x.id4 as xid4, large.id4 as largeid4, x.id5 as xid5, large.id5 as largeid5, x.id6 as xid6, large.id6 as largeid6, x.v1, large.v2 FROM x JOIN large ON x.id3 = large.id3;

benchmarks/src/bin/dfbench.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,9 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;
3333
#[global_allocator]
3434
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
3535

36-
use datafusion_benchmarks::{clickbench, imdb, parquet_filter, sort, sort_tpch, tpch};
36+
use datafusion_benchmarks::{
37+
clickbench, h2o, imdb, parquet_filter, sort, sort_tpch, tpch,
38+
};
3739

3840
#[derive(Debug, StructOpt)]
3941
#[structopt(about = "benchmark command")]
@@ -45,6 +47,7 @@ enum Options {
4547
Sort(sort::RunOpt),
4648
SortTpch(sort_tpch::RunOpt),
4749
Imdb(imdb::RunOpt),
50+
H2o(h2o::RunOpt),
4851
}
4952

5053
// Main benchmark runner entrypoint
@@ -60,5 +63,6 @@ pub async fn main() -> Result<()> {
6063
Options::Sort(opt) => opt.run().await,
6164
Options::SortTpch(opt) => opt.run().await,
6265
Options::Imdb(opt) => opt.run().await,
66+
Options::H2o(opt) => opt.run().await,
6367
}
6468
}

0 commit comments

Comments
 (0)