@@ -42,6 +42,13 @@ DATAFUSION_DIR=${DATAFUSION_DIR:-$SCRIPT_DIR/..}
 DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data}
 CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
 PREFER_HASH_JOIN=${PREFER_HASH_JOIN:-true}
+SIMULATE_LATENCY=${SIMULATE_LATENCY:-false}
+
+# Build latency arg based on SIMULATE_LATENCY setting
+LATENCY_ARG=""
+if [ "$SIMULATE_LATENCY" = "true" ]; then
+    LATENCY_ARG="--simulate-latency"
+fi
 
 usage() {
     echo "
@@ -141,6 +148,7 @@ CARGO_COMMAND command that runs the benchmark binary
 DATAFUSION_DIR directory to use (default $DATAFUSION_DIR)
 RESULTS_NAME folder where the benchmark files are stored
 PREFER_HASH_JOIN Prefer hash join algorithm (default true)
+SIMULATE_LATENCY Simulate object store latency to mimic S3 (default false)
 DATAFUSION_* Set the given datafusion configuration
 "
     exit 1
@@ -371,6 +379,7 @@ main() {
     echo "RESULTS_DIR: ${RESULTS_DIR}"
     echo "CARGO_COMMAND: ${CARGO_COMMAND}"
     echo "PREFER_HASH_JOIN: ${PREFER_HASH_JOIN}"
+    echo "SIMULATE_LATENCY: ${SIMULATE_LATENCY}"
     echo "***************************"
 
     # navigate to the appropriate directory
@@ -655,7 +664,7 @@ run_tpch() {
     echo "Running tpch benchmark..."
 
     FORMAT=$2
-    debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Runs the tpch in memory (needs tpch parquet data)
@@ -671,7 +680,7 @@ run_tpch_mem() {
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running tpch_mem benchmark..."
     # -m means in memory
-    debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Runs the tpcds benchmark
@@ -691,7 +700,7 @@ run_tpcds() {
     RESULTS_FILE="${RESULTS_DIR}/tpcds.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running tpcds benchmark..."
-    debug_run $CARGO_COMMAND --bin dfbench -- tpcds --iterations 5 --path "${TPCDS_DIR}" --query_path "../datafusion/core/tests/tpc-ds" --prefer_hash_join "${PREFER_HASH_JOIN}" -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- tpcds --iterations 5 --path "${TPCDS_DIR}" --query_path "../datafusion/core/tests/tpc-ds" --prefer_hash_join "${PREFER_HASH_JOIN}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Runs the compile profile benchmark helper
@@ -713,7 +722,7 @@ run_cancellation() {
     RESULTS_FILE="${RESULTS_DIR}/cancellation.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running cancellation benchmark..."
-    debug_run $CARGO_COMMAND --bin dfbench -- cancellation --iterations 5 --path "${DATA_DIR}/cancellation" -o "${RESULTS_FILE}"
+    debug_run $CARGO_COMMAND --bin dfbench -- cancellation --iterations 5 --path "${DATA_DIR}/cancellation" -o "${RESULTS_FILE}" ${LATENCY_ARG}
 }
 
 
@@ -767,15 +776,15 @@ run_clickbench_1() {
     RESULTS_FILE="${RESULTS_DIR}/clickbench_1.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running clickbench (1 file) benchmark..."
-    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Runs the clickbench benchmark with the partitioned parquet dataset (100 files)
 run_clickbench_partitioned() {
     RESULTS_FILE="${RESULTS_DIR}/clickbench_partitioned.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running clickbench (partitioned, 100 files) benchmark..."
-    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 
@@ -784,7 +793,7 @@ run_clickbench_pushdown() {
     RESULTS_FILE="${RESULTS_DIR}/clickbench_pushdown.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running clickbench (partitioned, 100 files) benchmark with pushdown_filters=true, reorder_filters=true..."
-    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --pushdown --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --pushdown --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 
@@ -793,7 +802,7 @@ run_clickbench_extended() {
     RESULTS_FILE="${RESULTS_DIR}/clickbench_extended.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running clickbench (1 file) extended benchmark..."
-    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended" -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Downloads the csv.gz files IMDB datasets from Peter Boncz's homepage(one of the JOB paper authors)
@@ -908,7 +917,7 @@ run_imdb() {
     RESULTS_FILE="${RESULTS_DIR}/imdb.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running imdb benchmark..."
-    debug_run $CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations 5 --path "${IMDB_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run $CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations 5 --path "${IMDB_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 data_h2o() {
@@ -980,7 +989,7 @@ run_h2o() {
         --path "${H2O_DIR}/${FILE_NAME}" \
         --queries-path "${QUERY_FILE}" \
         -o "${RESULTS_FILE}" \
-        ${QUERY_ARG}
+        ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Utility function to run h2o join/window benchmark
@@ -1032,7 +1041,7 @@ h2o_runner() {
         --join-paths "${H2O_DIR}/${X_TABLE_FILE_NAME},${H2O_DIR}/${SMALL_TABLE_FILE_NAME},${H2O_DIR}/${MEDIUM_TABLE_FILE_NAME},${H2O_DIR}/${LARGE_TABLE_FILE_NAME}" \
         --queries-path "${QUERY_FILE}" \
         -o "${RESULTS_FILE}" \
-        ${QUERY_ARG}
+        ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Runners for h2o join benchmark
@@ -1073,7 +1082,7 @@ run_sort_tpch() {
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running sort tpch benchmark..."
 
-    debug_run $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Runs the sort tpch integration benchmark with limit 100 (topk)
@@ -1083,15 +1092,15 @@ run_topk_tpch() {
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running topk tpch benchmark..."
 
-    $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" --limit 100 ${QUERY_ARG}
+    $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" --limit 100 ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Runs the nlj benchmark
 run_nlj() {
     RESULTS_FILE="${RESULTS_DIR}/nlj.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running nlj benchmark..."
-    debug_run $CARGO_COMMAND --bin dfbench -- nlj --iterations 5 -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- nlj --iterations 5 -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Runs the hj benchmark
@@ -1100,15 +1109,15 @@ run_hj() {
     RESULTS_FILE="${RESULTS_DIR}/hj.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running hj benchmark..."
-    debug_run $CARGO_COMMAND --bin dfbench -- hj --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- hj --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Runs the smj benchmark
 run_smj() {
     RESULTS_FILE="${RESULTS_DIR}/smj.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running smj benchmark..."
-    debug_run $CARGO_COMMAND --bin dfbench -- smj --iterations 5 -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- smj --iterations 5 -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 
@@ -1250,7 +1259,7 @@ run_clickbench_sorted() {
         --sorted-by "EventTime" \
         -c datafusion.optimizer.prefer_existing_sort=true \
         -o "${RESULTS_FILE}" \
-        ${QUERY_ARG}
+        ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 
0 commit comments