@@ -81,9 +81,12 @@ clickbench_1: ClickBench queries against a single parquet file
 clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
 clickbench_extended: ClickBench "inspired" queries against a single parquet (DataFusion specific)
 external_aggr: External aggregation benchmark
-h2o_small: h2oai benchmark with small dataset (1e7 rows), default file format is csv
-h2o_medium: h2oai benchmark with medium dataset (1e8 rows), default file format is csv
-h2o_big: h2oai benchmark with large dataset (1e9 rows), default file format is csv
+h2o_small: h2oai benchmark with small dataset (1e7 rows) for groupby, default file format is csv
+h2o_medium: h2oai benchmark with medium dataset (1e8 rows) for groupby, default file format is csv
+h2o_big: h2oai benchmark with large dataset (1e9 rows) for groupby, default file format is csv
+h2o_small_join: h2oai benchmark with small dataset (1e7 rows) for join, default file format is csv
+h2o_medium_join: h2oai benchmark with medium dataset (1e8 rows) for join, default file format is csv
+h2o_big_join: h2oai benchmark with large dataset (1e9 rows) for join, default file format is csv
 imdb: Join Order Benchmark (JOB) using the IMDB dataset converted to parquet

 **********
@@ -150,6 +153,9 @@ main() {
             data_h2o "SMALL"
             data_h2o "MEDIUM"
             data_h2o "BIG"
+            data_h2o_join "SMALL"
+            data_h2o_join "MEDIUM"
+            data_h2o_join "BIG"
             data_clickbench_1
             data_clickbench_partitioned
             data_imdb
@@ -189,6 +195,15 @@ main() {
         h2o_big)
             data_h2o "BIG" "CSV"
             ;;
+        h2o_small_join)
+            data_h2o_join "SMALL" "CSV"
+            ;;
+        h2o_medium_join)
+            data_h2o_join "MEDIUM" "CSV"
+            ;;
+        h2o_big_join)
+            data_h2o_join "BIG" "CSV"
+            ;;
         external_aggr)
             # same data as for tpch
             data_tpch "1"
@@ -242,6 +257,9 @@ main() {
             run_h2o "SMALL" "PARQUET" "groupby"
             run_h2o "MEDIUM" "PARQUET" "groupby"
             run_h2o "BIG" "PARQUET" "groupby"
+            run_h2o_join "SMALL" "PARQUET" "join"
+            run_h2o_join "MEDIUM" "PARQUET" "join"
+            run_h2o_join "BIG" "PARQUET" "join"
             run_imdb
             run_external_aggr
             ;;
@@ -287,6 +305,15 @@ main() {
         h2o_big)
             run_h2o "BIG" "CSV" "groupby"
             ;;
+        h2o_small_join)
+            run_h2o_join "SMALL" "CSV" "join"
+            ;;
+        h2o_medium_join)
+            run_h2o_join "MEDIUM" "CSV" "join"
+            ;;
+        h2o_big_join)
+            run_h2o_join "BIG" "CSV" "join"
+            ;;
         external_aggr)
             run_external_aggr
             ;;
@@ -687,7 +714,82 @@ data_h2o() {
     deactivate
 }

-# # todo now only support groupby, after https://github.com/mrpowers-io/falsa/issues/21 done, we can add support for join
+data_h2o_join() {
+    # Default values for size and data format
+    SIZE=${1:-"SMALL"}
+    DATA_FORMAT=${2:-"CSV"}
+
+    # Function to compare Python versions
+    version_ge() {
+        [ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" = "$2" ]
+    }
+
+    export PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1
+
+    # Find the highest available Python version (3.10 or higher)
+    REQUIRED_VERSION="3.10"
+    PYTHON_CMD=$(command -v python3 || true)
+
+    if [ -n "$PYTHON_CMD" ]; then
+        PYTHON_VERSION=$($PYTHON_CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
+        if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
+            echo "Found Python version $PYTHON_VERSION, which is suitable."
+        else
+            echo "Python version $PYTHON_VERSION found, but version $REQUIRED_VERSION or higher is required."
+            PYTHON_CMD=""
+        fi
+    fi
+
+    # Search for suitable Python versions if the default is unsuitable
+    if [ -z "$PYTHON_CMD" ]; then
+        # Loop through all available Python3 commands on the system
+        for CMD in $(compgen -c | grep -E '^python3(\.[0-9]+)?$'); do
+            if command -v "$CMD" &> /dev/null; then
+                PYTHON_VERSION=$($CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
+                if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
+                    PYTHON_CMD="$CMD"
+                    echo "Found suitable Python version: $PYTHON_VERSION ($CMD)"
+                    break
+                fi
+            fi
+        done
+    fi
+
+    # If no suitable Python version is found, exit with an error
+    if [ -z "$PYTHON_CMD" ]; then
+        echo "Python 3.10 or higher is required. Please install it."
+        return 1
+    fi
+
+    echo "Using Python command: $PYTHON_CMD"
+
+    # Install falsa and other dependencies
+    echo "Installing falsa..."
+
+    # Set virtual environment directory
+    VIRTUAL_ENV="${PWD}/venv"
+
+    # Create a virtual environment using the detected Python command
+    $PYTHON_CMD -m venv "$VIRTUAL_ENV"
+
+    # Activate the virtual environment and install dependencies
+    source "$VIRTUAL_ENV/bin/activate"
+
+    # Ensure 'falsa' is installed and up to date (pip skips the install if it already is)
+    pip install --quiet --upgrade falsa
+
+    # Create directory if it doesn't exist
+    H2O_DIR="${DATA_DIR}/h2o"
+    mkdir -p "${H2O_DIR}"
+
+    # Generate h2o test data
+    echo "Generating h2o test data in ${H2O_DIR} with size=${SIZE} and format=${DATA_FORMAT}"
+    falsa join --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
+
+    # Deactivate virtual environment after completion
+    deactivate
+}
+
 run_h2o() {
     # Default values for size and data format
     SIZE=${1:-"SMALL"}
@@ -700,7 +802,7 @@ run_h2o() {
     RESULTS_FILE="${RESULTS_DIR}/h2o.json"

     echo "RESULTS_FILE: ${RESULTS_FILE}"
-    echo "Running h2o benchmark..."
+    echo "Running h2o groupby benchmark..."

     # Set the file name based on the size
     case "$SIZE" in
@@ -730,6 +832,56 @@ run_h2o() {
         -o "${RESULTS_FILE}"
 }

+run_h2o_join() {
+    # Default values for size and data format
+    SIZE=${1:-"SMALL"}
+    DATA_FORMAT=${2:-"CSV"}
+    DATA_FORMAT=$(echo "$DATA_FORMAT" | tr '[:upper:]' '[:lower:]')
+    RUN_Type=${3:-"join"}
+
+    # Data directory and results file path
+    H2O_DIR="${DATA_DIR}/h2o"
+    RESULTS_FILE="${RESULTS_DIR}/h2o_join.json"
+
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running h2o join benchmark..."
+
+    # Set the file name based on the size
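+    # These J1_* file names are the ones data_h2o_join generates via `falsa join`:
+    # one x table plus small, medium, and large join tables for each size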
+    case "$SIZE" in
+        "SMALL")
+            X_TABLE_FILE_NAME="J1_1e7_NA_0.${DATA_FORMAT}"
+            SMALL_TABLE_FILE_NAME="J1_1e7_1e1_0.${DATA_FORMAT}"
+            MEDIUM_TABLE_FILE_NAME="J1_1e7_1e4_0.${DATA_FORMAT}"
+            LARGE_TABLE_FILE_NAME="J1_1e7_1e7_NA.${DATA_FORMAT}"
+            ;;
+        "MEDIUM")
+            X_TABLE_FILE_NAME="J1_1e8_NA_0.${DATA_FORMAT}"
+            SMALL_TABLE_FILE_NAME="J1_1e8_1e2_0.${DATA_FORMAT}"
+            MEDIUM_TABLE_FILE_NAME="J1_1e8_1e5_0.${DATA_FORMAT}"
+            LARGE_TABLE_FILE_NAME="J1_1e8_1e8_NA.${DATA_FORMAT}"
+            ;;
+        "BIG")
+            X_TABLE_FILE_NAME="J1_1e9_NA_0.${DATA_FORMAT}"
+            SMALL_TABLE_FILE_NAME="J1_1e9_1e3_0.${DATA_FORMAT}"
+            MEDIUM_TABLE_FILE_NAME="J1_1e9_1e6_0.${DATA_FORMAT}"
+            LARGE_TABLE_FILE_NAME="J1_1e9_1e9_NA.${DATA_FORMAT}"
+            ;;
+        *)
+            echo "Invalid size. Valid options are SMALL, MEDIUM, or BIG."
+            return 1
+            ;;
+    esac
+
+    # Set the query file name based on the RUN_Type
+    QUERY_FILE="${SCRIPT_DIR}/queries/h2o/${RUN_Type}.sql"
+
+    $CARGO_COMMAND --bin dfbench -- h2o \
+        --iterations 3 \
+        --join-paths "${H2O_DIR}/${X_TABLE_FILE_NAME},${H2O_DIR}/${SMALL_TABLE_FILE_NAME},${H2O_DIR}/${MEDIUM_TABLE_FILE_NAME},${H2O_DIR}/${LARGE_TABLE_FILE_NAME}" \
+        --queries-path "${QUERY_FILE}" \
+        -o "${RESULTS_FILE}"
+}
+
 # Runs the external aggregation benchmark
 run_external_aggr() {
     # Use TPC-H SF1 dataset