@@ -189,8 +189,8 @@ main() {
189189 echo " ***************************"
190190 case " $BENCHMARK " in
191191 all)
192- data_tpch " 1"
193- data_tpch " 10"
192+ data_tpch " 1" " parquet "
193+ data_tpch " 10" " parquet "
194194 data_h2o " SMALL"
195195 data_h2o " MEDIUM"
196196 data_h2o " BIG"
@@ -203,26 +203,22 @@ main() {
203203 # nlj uses range() function, no data generation needed
204204 ;;
205205 tpch)
206- data_tpch " 1"
206+ data_tpch " 1" " parquet "
207207 ;;
208208 tpch_mem)
209- # same data as for tpch
210- data_tpch " 1"
209+ data_tpch " 1" " parquet"
211210 ;;
212211 tpch_csv)
213- # same data as for tpch
214- data_tpch " 1"
212+ data_tpch " 1" " csv"
215213 ;;
216214 tpch10)
217- data_tpch " 10"
215+ data_tpch " 10" " parquet "
218216 ;;
219217 tpch_mem10)
220- # same data as for tpch10
221- data_tpch " 10"
218+ data_tpch " 10" " parquet"
222219 ;;
223220 tpch_csv10)
224- # same data as for tpch10
225- data_tpch " 10"
221+ data_tpch " 10" " csv"
226222 ;;
227223 clickbench_1)
228224 data_clickbench_1
@@ -297,19 +293,19 @@ main() {
297293 ;;
298294 external_aggr)
299295 # same data as for tpch
300- data_tpch " 1"
296+ data_tpch " 1" " parquet "
301297 ;;
302298 sort_tpch)
303299 # same data as for tpch
304- data_tpch " 1"
300+ data_tpch " 1" " parquet "
305301 ;;
306302 sort_tpch10)
307303 # same data as for tpch10
308- data_tpch " 10"
304+ data_tpch " 10" " parquet "
309305 ;;
310306 topk_tpch)
311307 # same data as for tpch
312- data_tpch " 1"
308+ data_tpch " 1" " parquet "
313309 ;;
314310 nlj)
315311 # nlj uses range() function, no data generation needed
@@ -320,7 +316,7 @@ main() {
320316 echo " HJ benchmark does not require data generation"
321317 ;;
322318 compile_profile)
323- data_tpch " 1"
319+ data_tpch " 1" " parquet "
324320 ;;
325321 * )
326322 echo " Error: unknown benchmark '$BENCHMARK ' for data generation"
@@ -537,7 +533,7 @@ main() {
537533# Creates TPCH data at a certain scale factor, if it doesn't already
538534# exist
539535#
540- # call like: data_tpch($scale_factor)
536+ # call like: data_tpch($scale_factor, $format), where $format is 'parquet' or 'csv'
541537#
542538# Creates data in $DATA_DIR/tpch_sf1 for scale factor 1
543539# Creates data in $DATA_DIR/tpch_sf10 for scale factor 10
@@ -548,20 +544,23 @@ data_tpch() {
548544 echo " Internal error: Scale factor not specified"
549545 exit 1
550546 fi
547+ FORMAT=$2
548+ if [ -z " $FORMAT " ] ; then
549+ echo " Internal error: Format not specified"
550+ exit 1
551+ fi
551552
552553 TPCH_DIR=" ${DATA_DIR} /tpch_sf${SCALE_FACTOR} "
553- echo " Creating tpch dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR} ..."
554+ echo " Creating tpch $FORMAT dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR} ..."
554555
555556 # Ensure the target data directory exists
556557 mkdir -p " ${TPCH_DIR} "
557558
558- # Create 'tbl' (CSV format) data into $DATA_DIR if it does not already exist
559- FILE=" ${TPCH_DIR} /supplier.tbl"
560- if test -f " ${FILE} " ; then
561- echo " tbl files exist ($FILE exists)."
562- else
563- echo " creating tbl files with tpch_dbgen..."
564- docker run -v " ${TPCH_DIR} " :/data -it --rm ghcr.io/scalytics/tpch-docker:main -vf -s " ${SCALE_FACTOR} "
559+ # check if tpchgen-cli is installed
560+ if ! command -v tpchgen-cli & > /dev/null
561+ then
562+ echo " tpchgen-cli could not be found, please install it via 'cargo install tpchgen-cli'"
563+ exit 1
565564 fi
566565
567566 # Copy expected answers into the ./data/answers directory if it does not already exist
@@ -574,27 +573,32 @@ data_tpch() {
574573 docker run -v " ${TPCH_DIR} " :/data -it --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c " cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
575574 fi
576575
577- # Create 'parquet' files from tbl
578- FILE=" ${TPCH_DIR} /supplier"
579- if test -d " ${FILE} " ; then
580- echo " parquet files exist ($FILE exists)."
581- else
582- echo " creating parquet files using benchmark binary ..."
583- pushd " ${SCRIPT_DIR} " > /dev/null
584- $CARGO_COMMAND --bin tpch -- convert --input " ${TPCH_DIR} " --output " ${TPCH_DIR} " --format parquet
585- popd > /dev/null
576+ if [ " $FORMAT " = " parquet" ]; then
577+ # Create 'parquet' files, one directory per table
578+ FILE=" ${TPCH_DIR} /supplier"
579+ if test -d " ${FILE} " ; then
580+ echo " parquet files exist ($FILE exists)."
581+ else
582+ echo " creating parquet files using tpchgen-cli ..."
583+ tpchgen-cli --scale-factor " ${SCALE_FACTOR} " --format parquet --parquet-compression=' ZSTD(1)' --parts=1 --output-dir " ${TPCH_DIR} "
584+ fi
585+ return
586586 fi
587587
588- # Create 'csv' files from tbl
589- FILE=" ${TPCH_DIR} /csv/supplier"
590- if test -d " ${FILE} " ; then
591- echo " csv files exist ($FILE exists)."
592- else
593- echo " creating csv files using benchmark binary ..."
594- pushd " ${SCRIPT_DIR} " > /dev/null
595- $CARGO_COMMAND --bin tpch -- convert --input " ${TPCH_DIR} " --output " ${TPCH_DIR} /csv" --format csv
596- popd > /dev/null
588+ # Create 'csv' files, one directory per table
589+ if [ " $FORMAT " = " csv" ]; then
590+ FILE=" ${TPCH_DIR} /csv/supplier"
591+ if test -d " ${FILE} " ; then
592+ echo " csv files exist ($FILE exists)."
593+ else
594+ echo " creating csv files using tpchgen-cli binary ..."
595+ tpchgen-cli --scale-factor " ${SCALE_FACTOR} " --format csv --parts=1 --output-dir " ${TPCH_DIR} /csv"
596+ fi
597+ return
597598 fi
599+
600+ echo " Error: unknown format '$FORMAT ' for tpch data generation, expected 'parquet' or 'csv'"
601+ exit 1
598602}
599603
600604# Runs the tpch benchmark
@@ -611,10 +615,10 @@ run_tpch() {
611615 echo " Running tpch benchmark..."
612616
613617 FORMAT=$2
614- debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path " ${TPCH_DIR} " --prefer_hash_join " ${PREFER_HASH_JOIN} " --format ${FORMAT} -o " ${RESULTS_FILE} " ${QUERY_ARG}
618+ debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path " ${TPCH_DIR} " --prefer_hash_join " ${PREFER_HASH_JOIN} " --format ${FORMAT} -o " ${RESULTS_FILE} " ${QUERY_ARG}
615619}
616620
617- # Runs the tpch in memory
621+ # Runs the tpch in memory (needs tpch parquet data)
618622run_tpch_mem () {
619623 SCALE_FACTOR=$1
620624 if [ -z " $SCALE_FACTOR " ] ; then
@@ -627,7 +631,7 @@ run_tpch_mem() {
627631 echo " RESULTS_FILE: ${RESULTS_FILE} "
628632 echo " Running tpch_mem benchmark..."
629633 # -m means in memory
630- debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path " ${TPCH_DIR} " --prefer_hash_join " ${PREFER_HASH_JOIN} " -m --format parquet -o " ${RESULTS_FILE} " ${QUERY_ARG}
634+ debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path " ${TPCH_DIR} " --prefer_hash_join " ${PREFER_HASH_JOIN} " -m --format parquet -o " ${RESULTS_FILE} " ${QUERY_ARG}
631635}
632636
633637# Runs the compile profile benchmark helper
0 commit comments