@@ -33,25 +33,34 @@ Java and Rust must be installed locally.
3333
3434``` shell
3535cargo install tpchgen-cli
36+ mkdir benchmark_data
37+ cd benchmark_data
3638tpchgen-cli -s 100 --format=parquet
39+ export BENCH_DATA=`pwd`
40+ ```
41+ Create a temporary folder for the Spark events emitted during benchmarking:
42+
43+ ``` shell
44+ mkdir /tmp/spark-events
3745```
3846
3947## Clone the DataFusion Benchmarks Repository
4048
4149``` shell
4250git clone https://github.com/apache/datafusion-benchmarks.git
51+ cd datafusion-benchmarks
52+ export DF_BENCH=` pwd`
4353```
4454
4555## Install Spark
4656
47- Install Spark
57+ Install Apache Spark. This example uses version 3.5.4.
4858
4959``` shell
5060wget https://archive.apache.org/dist/spark/spark-3.5.4/spark-3.5.4-bin-hadoop3.tgz
5161tar xzf spark-3.5.4-bin-hadoop3.tgz
5262sudo mv spark-3.5.4-bin-hadoop3 /opt
5363export SPARK_HOME=/opt/spark-3.5.4-bin-hadoop3/
54- mkdir /tmp/spark-events
5564```
5665
5766
@@ -99,10 +108,10 @@ $SPARK_HOME/bin/spark-submit \
99108 --conf spark.memory.offHeap.enabled=true \
100109 --conf spark.memory.offHeap.size=16g \
101110 --conf spark.eventLog.enabled=true \
102- /path/to/datafusion-benchmarks /runners/datafusion-comet/tpcbench.py \
111+ $DF_BENCH/runners/datafusion-comet/tpcbench.py \
103112 --benchmark tpch \
104- --data /Users/rusty/Data/ tpch/sf100 \
105- --queries /path/to/datafusion-benchmarks /tpch/queries \
113+ --data $BENCH_DATA/tpch-data/ \
114+ --queries $DF_BENCH/tpch/queries \
106115 --output . \
107116 --iterations 1
108117```
@@ -115,7 +124,7 @@ Build Comet from source, with `mimalloc` enabled.
115124make release COMET_FEATURES=mimalloc
116125```
117126
118- Set ` COMET_JAR ` to point to the location of the Comet jar file.
127+ Set `COMET_JAR` to point to the location of the Comet jar file. Example for Comet 0.8:
119128
120129``` shell
121130export COMET_JAR=`pwd`/spark/target/comet-spark-spark3.5_2.12-0.8.0-SNAPSHOT.jar
@@ -145,10 +154,10 @@ $SPARK_HOME/bin/spark-submit \
145154 --conf spark.comet.exec.shuffle.fallbackToColumnar=true \
146155 --conf spark.comet.exec.replaceSortMergeJoin=true \
147156 --conf spark.comet.expression.allowIncompatible=true \
148- /path/to/datafusion-benchmarks /runners/datafusion-comet/tpcbench.py \
157+ $DF_BENCH/runners/datafusion-comet/tpcbench.py \
149158 --benchmark tpch \
150- --data /path/to /tpch-data/ \
151- --queries /path/to/datafusion-benchmarks/ /tpch/queries \
159+ --data $BENCH_DATA/tpch-data/ \
160+ --queries $DF_BENCH/tpch/queries \
152161 --output . \
153162 --iterations 1
154163```
0 commit comments