@@ -80,6 +80,9 @@ clickbench_1: ClickBench queries against a single parquet file
8080clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
clickbench_extended: ClickBench "inspired" queries against a single parquet (DataFusion specific)
8282external_aggr: External aggregation benchmark
83+ h2o_small: h2oai benchmark with small dataset (1e7 rows), default file format is csv
84+ h2o_medium: h2oai benchmark with medium dataset (1e8 rows), default file format is csv
85+ h2o_big: h2oai benchmark with large dataset (1e9 rows), default file format is csv
8386
8487**********
8588* Supported Configuration (Environment Variables)
@@ -142,6 +145,9 @@ main() {
142145 all)
143146 data_tpch " 1"
144147 data_tpch " 10"
148+ data_h2o " SMALL"
149+ data_h2o " MEDIUM"
150+ data_h2o " BIG"
145151 data_clickbench_1
146152 data_clickbench_partitioned
147153 data_imdb
@@ -172,6 +178,15 @@ main() {
172178 imdb)
173179 data_imdb
174180 ;;
181+ h2o_small)
182+ data_h2o " SMALL" " CSV"
183+ ;;
184+ h2o_medium)
185+ data_h2o " MEDIUM" " CSV"
186+ ;;
187+ h2o_big)
188+ data_h2o " BIG" " CSV"
189+ ;;
175190 external_aggr)
176191 # same data as for tpch
177192 data_tpch " 1"
@@ -221,6 +236,9 @@ main() {
221236 run_clickbench_1
222237 run_clickbench_partitioned
223238 run_clickbench_extended
239+ run_h2o " SMALL" " PARQUET" " groupby"
240+ run_h2o " MEDIUM" " PARQUET" " groupby"
241+ run_h2o " BIG" " PARQUET" " groupby"
224242 run_imdb
225243 run_external_aggr
226244 ;;
@@ -254,6 +272,15 @@ main() {
254272 imdb)
255273 run_imdb
256274 ;;
275+ h2o_small)
276+ run_h2o " SMALL" " CSV" " groupby"
277+ ;;
278+ h2o_medium)
279+ run_h2o " MEDIUM" " CSV" " groupby"
280+ ;;
281+ h2o_big)
282+ run_h2o " BIG" " CSV" " groupby"
283+ ;;
257284 external_aggr)
258285 run_external_aggr
259286 ;;
@@ -541,6 +568,125 @@ run_imdb() {
541568 $CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations 5 --path " ${IMDB_DIR} " --prefer_hash_join " ${PREFER_HASH_JOIN} " --format parquet -o " ${RESULTS_FILE} "
542569}
543570
# Generate the h2oai "groupby" benchmark dataset using the `falsa` generator.
#
# Arguments:
#   $1 - dataset size: SMALL (1e7 rows), MEDIUM (1e8 rows) or BIG (1e9 rows).
#        Defaults to SMALL.
#   $2 - output format passed through to falsa (e.g. CSV or PARQUET).
#        Defaults to CSV.
# Globals:
#   DATA_DIR (read) - root directory under which h2o data is generated.
# Returns:
#   1 if no Python >= 3.10 interpreter is available or the venv cannot
#   be created; 0 otherwise.
data_h2o() {
  local SIZE=${1:-"SMALL"}
  local DATA_FORMAT=${2:-"CSV"}

  # True when dotted version string $1 >= $2 (compared with sort -V).
  version_ge() {
    [ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" = "$2" ]
  }

  # falsa depends on PyO3; allow it to run on Python versions newer than
  # the stable ABI it was built against.
  export PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1

  # Prefer the default python3 if it is new enough.
  local REQUIRED_VERSION="3.10"
  local PYTHON_CMD
  PYTHON_CMD=$(command -v python3 || true)

  if [ -n "$PYTHON_CMD" ]; then
    local PYTHON_VERSION
    PYTHON_VERSION=$($PYTHON_CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
    if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
      echo "Found Python version $PYTHON_VERSION, which is suitable."
    else
      echo "Python version $PYTHON_VERSION found, but version $REQUIRED_VERSION or higher is required."
      PYTHON_CMD=""
    fi
  fi

  # Otherwise scan PATH for any python3.X interpreter that satisfies the
  # version requirement (compgen may list duplicates, hence sort -u).
  if [ -z "$PYTHON_CMD" ]; then
    local CMD
    for CMD in $(compgen -c | grep -E '^python3(\.[0-9]+)?$' | sort -u); do
      if command -v "$CMD" &> /dev/null; then
        local CANDIDATE_VERSION
        CANDIDATE_VERSION=$($CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
        if version_ge "$CANDIDATE_VERSION" "$REQUIRED_VERSION"; then
          PYTHON_CMD="$CMD"
          echo "Found suitable Python version: $CANDIDATE_VERSION ($CMD)"
          break
        fi
      fi
    done
  fi

  if [ -z "$PYTHON_CMD" ]; then
    echo "Python 3.10 or higher is required. Please install it."
    return 1
  fi

  echo "Using Python command: $PYTHON_CMD"

  # Install falsa into a local virtualenv so we don't touch the system
  # site-packages; bail out if the venv cannot be created.
  echo "Installing falsa..."
  local VIRTUAL_ENV="${PWD}/venv"
  "$PYTHON_CMD" -m venv "$VIRTUAL_ENV" || return 1
  # shellcheck disable=SC1091
  source "$VIRTUAL_ENV/bin/activate"
  pip install --quiet --upgrade falsa

  local H2O_DIR="${DATA_DIR}/h2o"
  mkdir -p "${H2O_DIR}"

  echo "Generating h2o test data in ${H2O_DIR} with size=${SIZE} and format=${DATA_FORMAT}"
  falsa groupby --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"

  # Leave the caller's shell environment clean.
  deactivate
}
646+
# TODO: only "groupby" is supported for now; add join support once
# https://github.com/mrpowers-io/falsa/issues/21 is done.
#
# Run the h2o benchmark against a previously generated dataset.
#
# Arguments:
#   $1 - dataset size: SMALL, MEDIUM or BIG. Defaults to SMALL.
#   $2 - data format of the generated files (e.g. CSV, PARQUET).
#        Defaults to CSV; lower-cased to build the file extension.
#   $3 - benchmark type; currently only "groupby". Defaults to groupby.
# Globals:
#   DATA_DIR, RESULTS_DIR, SCRIPT_DIR, CARGO_COMMAND (read)
#   RESULTS_FILE (written - script-wide convention shared by run_* helpers)
# Returns:
#   1 on an invalid size; otherwise the benchmark's exit status.
run_h2o() {
  local SIZE=${1:-"SMALL"}
  local DATA_FORMAT=${2:-"CSV"}
  # falsa writes lower-case file extensions.
  DATA_FORMAT=$(echo "$DATA_FORMAT" | tr '[:upper:]' '[:lower:]')
  local RUN_TYPE=${3:-"groupby"}

  local H2O_DIR="${DATA_DIR}/h2o"
  RESULTS_FILE="${RESULTS_DIR}/h2o.json"

  echo "RESULTS_FILE: ${RESULTS_FILE}"
  echo "Running h2o benchmark..."

  # Data files follow falsa's G1_<rows>_<rows>_100_0.<ext> naming scheme.
  local FILE_NAME
  case "$SIZE" in
    "SMALL")
      FILE_NAME="G1_1e7_1e7_100_0.${DATA_FORMAT}"
      ;;
    "MEDIUM")
      FILE_NAME="G1_1e8_1e8_100_0.${DATA_FORMAT}"
      ;;
    "BIG")
      FILE_NAME="G1_1e9_1e9_100_0.${DATA_FORMAT}"
      ;;
    *)
      echo "Invalid size. Valid options are SMALL, MEDIUM, or BIG."
      return 1
      ;;
  esac

  # Queries live alongside the script, one .sql file per benchmark type.
  local QUERY_FILE="${SCRIPT_DIR}/queries/h2o/${RUN_TYPE}.sql"

  $CARGO_COMMAND --bin dfbench -- h2o \
    --iterations 3 \
    --path "${H2O_DIR}/${FILE_NAME}" \
    --queries-path "${QUERY_FILE}" \
    -o "${RESULTS_FILE}"
}
689+
544690# Runs the external aggregation benchmark
545691run_external_aggr () {
546692 # Use TPC-H SF1 dataset
0 commit comments