
Commit 69da7db

Refactor benchmarks crate and add TPC-DS benchmarks (#269)
* Split channel resolver in two
* Simplify WorkerResolverExtension and ChannelResolverExtension
* Add default builder to ArrowFlightEndpoint
* Add some docs
* Listen to clippy
* Split get_flight_client_for_url in two
* Fix conflicts
* Remove unnecessary channel resolver
* Improve WorkerResolver docs
* Use one ChannelResolver per runtime
* Improve error reporting on client connection failure
* Add a from_session_builder method for constructing an InMemoryChannelResolver
* Add ChannelResolver and WorkerResolver default implementations for Arcs
* Make TPC-DS tests use DataFusion test dataset
* Remove non-working in-memory option from benchmarks
* Remove unnecessary utils folder
* Refactor benchmark folder
* Rename to prepare_tpch.rs
* Adapt benchmarks for TPC-DS
* Update benchmarks README.md
* Fix conflicts
* Use default session state builder
* Update benchmarks README.md
* Make gen-tpcds.sh executable
1 parent a02666a · commit 69da7db

22 files changed: +871 −1601 lines

benchmarks/Cargo.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -30,7 +30,7 @@ aws-sdk-ec2 = "1"
 
 [[bin]]
 name = "dfbench"
-path = "src/bin/dfbench.rs"
+path = "src/main.rs"
 
 [[bin]]
 name = "worker"
```

benchmarks/README.md

Lines changed: 16 additions & 61 deletions
````diff
@@ -1,81 +1,36 @@
 # Distributed DataFusion Benchmarks
 
-### Generating TPCH data
+### Generating benchmarking data
 
 Generate TPCH data into the `data/` dir
 
 ```shell
 ./gen-tpch.sh
+./gen-tpcds.sh
 ```
 
-### Running TPCH benchmarks in single-node mode
+### Running benchmarks in single-node mode
 
-After generating the data with the command above, the benchmarks can be run with
+After generating the data with the command above, the benchmarks can be run with:
 
 ```shell
-cargo run -p datafusion-distributed-benchmarks --release -- tpch
+WORKERS=0 ./benchmarks/run.sh --threads 2 --path benchmarks/data/tpch_sf1
 ```
 
-For preloading the TPCH data in-memory, the `-m` flag can be passed
+- `--threads`: The number of physical threads that the Tokio runtime will use for executing the binary.
+  It's recommended to set `--threads` to something small, like `2`, to throttle each individual
+  process running queries and simulate how adding throttled workers can speed up the queries.
+- `--path`: Can point to any folder containing benchmark datasets.
 
-```shell
-cargo run -p datafusion-distributed-benchmarks --release -- tpch -m
-```
-
-For running the benchmarks with just a specific number of physical threads:
-
-```shell
-cargo run -p datafusion-distributed-benchmarks --release -- tpch -m --threads 3
-```
-
-### Running TPCH benchmarks in distributed mode
-
-Running the benchmarks in distributed mode implies:
-
-- running 1 or more workers in separate terminals
-- running the benchmarks in an additional terminal
-
-The workers can be spawned by passing the `--spawn <port>` flag, for example, for spawning 3 workers:
-
-```shell
-cargo run -p datafusion-distributed-benchmarks --release -- tpch --spawn 8000
-```
-
-```shell
-cargo run -p datafusion-distributed-benchmarks --release -- tpch --spawn 8001
-```
-
-```shell
-cargo run -p datafusion-distributed-benchmarks --release -- tpch --spawn 8002
-```
-
-With the three workers running in separate terminals, the TPCH benchmarks can be run in distributed mode with:
-
-```shell
-cargo run -p datafusion-distributed-benchmarks --release -- tpch --workers 8000,8001,8002
-```
+### Running benchmarks in distributed mode
 
-A good way of measuring the impact of distribution is to limit the physical threads each worker can use. For example,
-it's expected that running 8 workers with 2 physical threads each one (8 * 2 = 16 total) is faster than running in
-single-node with just 2 threads (1 * 2 = 2 total).
+The same script is used for running distributed benchmarks:
 
 ```shell
-cargo run -p datafusion-distributed-benchmarks --release -- tpch -m --threads 2 --spawn 8000 &
-cargo run -p datafusion-distributed-benchmarks --release -- tpch -m --threads 2 --spawn 8001 &
-cargo run -p datafusion-distributed-benchmarks --release -- tpch -m --threads 2 --spawn 8002 &
-cargo run -p datafusion-distributed-benchmarks --release -- tpch -m --threads 2 --spawn 8003 &
-cargo run -p datafusion-distributed-benchmarks --release -- tpch -m --threads 2 --spawn 8004 &
-cargo run -p datafusion-distributed-benchmarks --release -- tpch -m --threads 2 --spawn 8005 &
-cargo run -p datafusion-distributed-benchmarks --release -- tpch -m --threads 2 --spawn 8006 &
-cargo run -p datafusion-distributed-benchmarks --release -- tpch -m --threads 2 --spawn 8007 &
+WORKERS=8 ./benchmarks/run.sh --threads 2 --path ./benchmarks/data/tpch_sf1 --files-per-task 2
 ```
 
-```shell
-cargo run -p datafusion-distributed-benchmarks --release -- tpch -m --threads 2 --workers 8000,8001,8002,8003,8004,8005,8006,8007
-```
-
-The `run.sh` script already does this for you in a more ergonomic way:
-
-```shell
-WORKERS=8 run.sh --threads 2 -m
-```
+- `WORKERS`: Env variable that sets the number of localhost workers used in the query.
+- `--threads`: Sets the Tokio runtime threads for each individual worker and for the benchmarking binary.
+- `--path`: Can point to any folder containing benchmark datasets.
+- `--files-per-task`: How many files each distributed task will handle.
````
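The README example above points at the TPC-H dataset, but since this commit also adds a TPC-DS generator, the same script should accept the directory produced by `gen-tpcds.sh`. A hedged sketch; the `tpcds_sf1` directory name assumes the default `SCALE_FACTOR=1`:

```shell
# Distributed run against the TPC-DS dataset generated by gen-tpcds.sh
WORKERS=8 ./benchmarks/run.sh --threads 2 --path ./benchmarks/data/tpcds_sf1 --files-per-task 2
```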

benchmarks/gen-tpcds.sh

Lines changed: 21 additions & 0 deletions
```diff
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+set -e
+
+SCALE_FACTOR=${SCALE_FACTOR:-1}
+PARTITIONS=${PARTITIONS:-16}
+
+echo "Generating TPC-DS dataset with SCALE_FACTOR=${SCALE_FACTOR} and PARTITIONS=${PARTITIONS}"
+
+# https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data}
+CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
+TPCDS_DIR="${DATA_DIR}/tpcds_sf${SCALE_FACTOR}"
+
+echo "Creating tpcds dataset at Scale Factor ${SCALE_FACTOR} in ${TPCDS_DIR}..."
+
+# Ensure the target data directory exists
+mkdir -p "${TPCDS_DIR}"
+
+$CARGO_COMMAND -- prepare-tpcds --output "${TPCDS_DIR}" --partitions "$PARTITIONS"
```
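The new script is driven entirely by environment variables with defaults (`SCALE_FACTOR=1`, `PARTITIONS=16`, `DATA_DIR=<script dir>/data`, `CARGO_COMMAND="cargo run --release"`). A minimal usage sketch, assuming it is invoked from the `benchmarks/` directory as in the README; the override values are illustrative:

```shell
# Defaults: scale factor 1, 16 partitions, output under benchmarks/data/tpcds_sf1
./gen-tpcds.sh

# Override the scale factor and partition count
SCALE_FACTOR=10 PARTITIONS=32 ./gen-tpcds.sh
```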

benchmarks/gen-tpch.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -33,6 +33,6 @@ if test -d "${FILE}"; then
 else
 echo " creating parquet files using benchmark binary ..."
 pushd "${SCRIPT_DIR}" > /dev/null
-$CARGO_COMMAND -- tpch-convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet --partitions "$PARTITIONS"
+$CARGO_COMMAND -- prepare-tpch --input "${TPCH_DIR}" --output "${TPCH_DIR}" --partitions "$PARTITIONS"
 popd > /dev/null
 fi
```

benchmarks/run.sh

Lines changed: 3 additions & 3 deletions
```diff
@@ -8,7 +8,7 @@ WORKERS=${WORKERS:-8}
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 
 if [ "$WORKERS" == "0" ]; then
-cargo run -p datafusion-distributed-benchmarks --release -- tpch "$@"
+cargo run -p datafusion-distributed-benchmarks --release -- run "$@"
 exit
 fi
 
@@ -38,12 +38,12 @@ cargo build -p datafusion-distributed-benchmarks --release
 
 trap cleanup EXIT INT TERM
 for i in $(seq 0 $((WORKERS-1))); do
-"$SCRIPT_DIR"/../target/release/dfbench tpch --spawn $((8000+i)) "$@" &
+"$SCRIPT_DIR"/../target/release/dfbench run --spawn $((8000+i)) "$@" &
 done
 
 echo "Waiting for worker ports to be ready..."
 for i in $(seq 0 $((WORKERS-1))); do
 wait_for_port $((8000+i))
 done
 
-"$SCRIPT_DIR"/../target/release/dfbench tpch --workers $(seq -s, 8000 $((8000+WORKERS-1))) "$@"
+"$SCRIPT_DIR"/../target/release/dfbench run --workers $(seq -s, 8000 $((8000+WORKERS-1))) "$@"
```

benchmarks/src/bin/dfbench.rs

Lines changed: 0 additions & 43 deletions
This file was deleted.

benchmarks/src/lib.rs

Lines changed: 0 additions & 2 deletions
This file was deleted.

benchmarks/src/main.rs

Lines changed: 32 additions & 0 deletions
```diff
@@ -0,0 +1,32 @@
+//! DataFusion Distributed benchmark runner
+mod prepare_tpcds;
+mod prepare_tpch;
+mod run;
+
+use datafusion::error::Result;
+use structopt::StructOpt;
+
+#[derive(Debug, StructOpt)]
+#[structopt(about = "benchmark command")]
+enum Options {
+    Run(run::RunOpt),
+    PrepareTpch(prepare_tpch::PrepareTpchOpt),
+    PrepareTpcds(prepare_tpcds::PrepareTpcdsOpt),
+}
+
+// Main benchmark runner entrypoint
+pub fn main() -> Result<()> {
+    env_logger::init();
+
+    match Options::from_args() {
+        Options::Run(opt) => opt.run(),
+        Options::PrepareTpch(opt) => {
+            let rt = tokio::runtime::Runtime::new()?;
+            rt.block_on(async { opt.run().await })
+        }
+        Options::PrepareTpcds(opt) => {
+            let rt = tokio::runtime::Runtime::new()?;
+            rt.block_on(async { opt.run().await })
+        }
+    }
+}
```
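StructOpt derives kebab-case subcommand names from the enum variants above, so the `dfbench` binary exposes `run`, `prepare-tpch`, and `prepare-tpcds`, matching how the shell scripts in this commit invoke it. A rough sketch of the three entry points; the data paths are illustrative:

```shell
# Convert previously generated TPC-H data to parquet (as in gen-tpch.sh)
cargo run -p datafusion-distributed-benchmarks --release -- prepare-tpch --input data/tpch_sf1 --output data/tpch_sf1 --partitions 16

# Generate TPC-DS parquet data (as in gen-tpcds.sh)
cargo run -p datafusion-distributed-benchmarks --release -- prepare-tpcds --output data/tpcds_sf1 --partitions 16

# Run the benchmark queries (as in run.sh and the README)
cargo run -p datafusion-distributed-benchmarks --release -- run --threads 2 --path data/tpch_sf1
```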

benchmarks/src/prepare_tpcds.rs

Lines changed: 28 additions & 0 deletions
```diff
@@ -0,0 +1,28 @@
+use datafusion::error::DataFusionError;
+use datafusion_distributed::test_utils::tpcds;
+use std::path::{Path, PathBuf};
+use structopt::StructOpt;
+
+/// Prepare TPC-DS parquet files for benchmarks
+#[derive(Debug, StructOpt)]
+pub struct PrepareTpcdsOpt {
+    /// Output path
+    #[structopt(parse(from_os_str), required = true, short = "o", long = "output")]
+    output_path: PathBuf,
+
+    /// Number of partitions to produce. By default, uses only 1 partition.
+    #[structopt(short = "n", long = "partitions", default_value = "1")]
+    partitions: usize,
+
+    /// Scale factor of the TPC-DS data
+    #[structopt(long, default_value = "1")]
+    sf: f64,
+}
+
+impl PrepareTpcdsOpt {
+    pub async fn run(self) -> datafusion::common::Result<()> {
+        tpcds::generate_tpcds_data(Path::new(&self.output_path), self.sf, self.partitions)
+            .await
+            .map_err(|e| DataFusionError::Internal(format!("{e:?}")))
+    }
+}
```
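Per the StructOpt attributes above, the subcommand accepts `-o/--output` (required), `-n/--partitions` (default 1), and `--sf` (default 1). Invoked directly, a sketch would look roughly like this; the output directory is illustrative:

```shell
cargo run -p datafusion-distributed-benchmarks --release -- prepare-tpcds \
  --output benchmarks/data/tpcds_sf1 \
  --partitions 16 \
  --sf 1
```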
