kosiew
diff --git a/‎.github/actions/setup-builder/action.yaml‎
Lines changed: 14 additions & 0 deletions b/‎.github/actions/setup-builder/action.yaml‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎.github/workflows/audit.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/audit.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/rust.yml‎
Lines changed: 2 additions & 27 deletions b/‎.github/workflows/rust.yml‎
Lines changed: 2 additions & 27 deletions
diff --git a/‎benchmarks/README.md‎
Lines changed: 1 addition & 18 deletions b/‎benchmarks/README.md‎
Lines changed: 1 addition & 18 deletions
diff --git a/‎benchmarks/bench.sh‎
Lines changed: 51 additions & 47 deletions b/‎benchmarks/bench.sh‎
Lines changed: 51 additions & 47 deletions
diff --git a/‎benchmarks/src/bin/dfbench.rs‎
Lines changed: 0 additions & 2 deletions b/‎benchmarks/src/bin/dfbench.rs‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎benchmarks/src/bin/external_aggr.rs‎
Lines changed: 1 addition & 1 deletion b/‎benchmarks/src/bin/external_aggr.rs‎
Lines changed: 1 addition & 1 deletion
@@ -46,3 +46,17 @@ runs:
       # https://github.com/actions/checkout/issues/766
       shell: bash
       run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
+    - name: Remove unnecessary preinstalled software
+      shell: bash
+      run: |
+        echo "Disk space before cleanup:"
+        df -h 
+        apt-get clean
+        # remove tool cache: about 8.5GB (github has host /opt/hostedtoolcache mounted as /__t)
+        rm -rf /__t/* || true
+        # remove Haskell runtime: about 6.3GB (host /usr/local/.ghcup)
+        rm -rf /host/usr/local/.ghcup || true
+        # remove Android library: about 7.8GB (host /usr/local/lib/android)
+        rm -rf /host/usr/local/lib/android || true
+        echo "Disk space after cleanup:"
+        df -h
@@ -42,7 +42,7 @@ jobs:
     steps:
       - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8  # v6.0.1
       - name: Install cargo-audit
-        uses: taiki-e/install-action@92e6dd1c202153a204d471a3c509bf1e03dcfa44  # v2.62.61
+        uses: taiki-e/install-action@493d7f216ecab2af0602481ce809ab2c72836fa1  # v2.62.62
         with:
           tool: cargo-audit
       - name: Run audit check
 
@@ -272,18 +272,6 @@ jobs:
       volumes:
         - /usr/local:/host/usr/local
     steps:
-      - name: Remove unnecessary preinstalled software
-        run: |
-          echo "Disk space before cleanup:"
-          df -h
-          # remove tool cache: about 8.5GB (github has host /opt/hostedtoolcache mounted as /__t)
-          rm -rf /__t/* || true
-          # remove Haskell runtime: about 6.3GB (host /usr/local/.ghcup)
-          rm -rf /host/usr/local/.ghcup || true
-          # remove Android library: about 7.8GB (host /usr/local/lib/android)
-          rm -rf /host/usr/local/lib/android || true
-          echo "Disk space after cleanup:"
-          df -h
       - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8  # v6.0.1
         with:
           submodules: true
@@ -374,19 +362,6 @@ jobs:
         with:
           save-if: ${{ github.ref_name == 'main' }}
           shared-key: "amd-ci-linux-test-example"
-      - name: Remove unnecessary preinstalled software
-        run: |
-          echo "Disk space before cleanup:"
-          df -h
-          apt-get clean
-          rm -rf /__t/CodeQL
-          rm -rf /__t/PyPy
-          rm -rf /__t/Java_Temurin-Hotspot_jdk
-          rm -rf /__t/Python
-          rm -rf /__t/go
-          rm -rf /__t/Ruby
-          echo "Disk space after cleanup:"
-          df -h
       - name: Run examples
         run: |
           # test datafusion-sql examples
@@ -446,7 +421,7 @@ jobs:
           sudo apt-get update -qq
           sudo apt-get install -y -qq clang
       - name: Setup wasm-pack
-        uses: taiki-e/install-action@92e6dd1c202153a204d471a3c509bf1e03dcfa44  # v2.62.61
+        uses: taiki-e/install-action@493d7f216ecab2af0602481ce809ab2c72836fa1  # v2.62.62
         with:
           tool: wasm-pack
       - name: Run tests with headless mode
@@ -749,7 +724,7 @@ jobs:
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-builder
       - name: Install cargo-msrv
-        uses: taiki-e/install-action@92e6dd1c202153a204d471a3c509bf1e03dcfa44  # v2.62.61
+        uses: taiki-e/install-action@493d7f216ecab2af0602481ce809ab2c72836fa1  # v2.62.62
         with:
           tool: cargo-msrv
 
 
@@ -243,28 +243,11 @@ See the help for more details.
 You can switch to the `mimalloc` or `snmalloc` allocator by enabling the corresponding feature with `--features`. For example:
 
 ```shell
-cargo run --release --features "mimalloc" --bin tpch -- benchmark datafusion --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096
-```
-
-The benchmark program also supports CSV and Parquet input file formats and a utility is provided to convert from `tbl`
-(generated by the `dbgen` utility) to CSV and Parquet.
-
-```bash
-cargo run --release --bin tpch -- convert --input ./data --output /mnt/tpch-parquet --format parquet
+cargo run --release --features "mimalloc" --bin dfbench tpch --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096
 ```
 
 Or if you want to verify and run all the queries in the benchmark, you can just run `cargo test`.
 
-#### Sorted Conversion
-
-The TPCH tables generated by the dbgen utility are sorted by their first column (their primary key for most tables, the `l_orderkey` column for the `lineitem` table.)
-
-To preserve this sorted order information during conversion (useful for benchmarking execution on pre-sorted data) include the `--sort` flag:
-
-```bash
-cargo run --release --bin tpch -- convert --input ./data --output /mnt/tpch-sorted-parquet --format parquet --sort
-```
-
 ### Comparing results between runs
 
 Any `dfbench` execution with `-o <dir>` argument will produce a
 
@@ -189,8 +189,8 @@ main() {
             echo "***************************"
             case "$BENCHMARK" in
                 all)
-                    data_tpch "1"
-                    data_tpch "10"
+                    data_tpch "1" "parquet"
+                    data_tpch "10" "parquet"
                     data_h2o "SMALL"
                     data_h2o "MEDIUM"
                     data_h2o "BIG"
@@ -203,26 +203,22 @@ main() {
                     # nlj uses range() function, no data generation needed
                     ;;
                 tpch)
-                    data_tpch "1"
+                    data_tpch "1" "parquet"
                     ;;
                 tpch_mem)
-                    # same data as for tpch
-                    data_tpch "1"
+                    data_tpch "1" "parquet"
                     ;;
                 tpch_csv)
-                    # same data as for tpch
-                    data_tpch "1"
+                    data_tpch "1" "csv"
                     ;;
                 tpch10)
-                    data_tpch "10"
+                    data_tpch "10" "parquet"
                     ;;
                 tpch_mem10)
-                    # same data as for tpch10
-                    data_tpch "10"
+                    data_tpch "10" "parquet"
                     ;;
                 tpch_csv10)
-                    # same data as for tpch10
-                    data_tpch "10"
+                    data_tpch "10" "csv"
                     ;;
                 clickbench_1)
                     data_clickbench_1
@@ -297,19 +293,19 @@ main() {
                     ;;
                 external_aggr)
                     # same data as for tpch
-                    data_tpch "1"
+                    data_tpch "1" "parquet"
                     ;;
                 sort_tpch)
                     # same data as for tpch
-                    data_tpch "1"
+                    data_tpch "1" "parquet"
                     ;;
                 sort_tpch10)
                     # same data as for tpch10
-                    data_tpch "10"
+                    data_tpch "10" "parquet"
                     ;;
                 topk_tpch)
                     # same data as for tpch
-                    data_tpch "1"
+                    data_tpch "1" "parquet"
                     ;;
                 nlj)
                     # nlj uses range() function, no data generation needed
@@ -320,7 +316,7 @@ main() {
                     echo "HJ benchmark does not require data generation"
                     ;;
                 compile_profile)
-                    data_tpch "1"
+                    data_tpch "1" "parquet"
                     ;;
                 *)
                     echo "Error: unknown benchmark '$BENCHMARK' for data generation"
@@ -537,7 +533,7 @@ main() {
 # Creates TPCH data at a certain scale factor, if it doesn't already
 # exist
 #
-# call like: data_tpch($scale_factor)
+# call like: data_tpch($scale_factor, $format), where $format is 'parquet' or 'csv'
 #
 # Creates data in $DATA_DIR/tpch_sf1 for scale factor 1
 # Creates data in $DATA_DIR/tpch_sf10 for scale factor 10
@@ -548,20 +544,23 @@ data_tpch() {
         echo "Internal error: Scale factor not specified"
         exit 1
     fi
+    FORMAT=$2
+    if [ -z "$FORMAT" ] ; then
+        echo "Internal error: Format not specified"
+        exit 1
+    fi
 
     TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}"
-    echo "Creating tpch dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR}..."
+    echo "Creating tpch $FORMAT dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR}..."
 
     # Ensure the target data directory exists
     mkdir -p "${TPCH_DIR}"
 
-    # Create 'tbl' (CSV format) data into $DATA_DIR if it does not already exist
-    FILE="${TPCH_DIR}/supplier.tbl"
-    if test -f "${FILE}"; then
-        echo " tbl files exist ($FILE exists)."
-    else
-        echo " creating tbl files with tpch_dbgen..."
-        docker run -v "${TPCH_DIR}":/data -it --rm ghcr.io/scalytics/tpch-docker:main -vf -s "${SCALE_FACTOR}"
+    # check if tpchgen-cli is installed
+    if ! command -v tpchgen-cli &> /dev/null
+    then
+        echo "tpchgen-cli could not be found, please install it via 'cargo install tpchgen-cli'"
+        exit 1
     fi
 
     # Copy expected answers into the ./data/answers directory if it does not already exist
@@ -574,27 +573,32 @@ data_tpch() {
         docker run -v "${TPCH_DIR}":/data -it --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main  -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
     fi
 
-    # Create 'parquet' files from tbl
-    FILE="${TPCH_DIR}/supplier"
-    if test -d "${FILE}"; then
-        echo " parquet files exist ($FILE exists)."
-    else
-        echo " creating parquet files using benchmark binary ..."
-        pushd "${SCRIPT_DIR}" > /dev/null
-        $CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet
-        popd > /dev/null
+    if [ "$FORMAT" = "parquet" ]; then
+      # Create 'parquet' files, one directory per file
+      FILE="${TPCH_DIR}/supplier"
+      if test -d "${FILE}"; then
+          echo " parquet files exist ($FILE exists)."
+      else
+          echo " creating parquet files using tpchgen-cli ..."
+          tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format parquet --parquet-compression='ZSTD(1)' --parts=1 --output-dir "${TPCH_DIR}"
+      fi
+      return
     fi
 
-    # Create 'csv' files from tbl
-    FILE="${TPCH_DIR}/csv/supplier"
-    if test -d "${FILE}"; then
-        echo " csv files exist ($FILE exists)."
-    else
-        echo " creating csv files using benchmark binary ..."
-        pushd "${SCRIPT_DIR}" > /dev/null
-        $CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}/csv" --format csv
-        popd > /dev/null
+    # Create 'csv' files, one directory per file
+    if [ "$FORMAT" = "csv" ]; then
+      FILE="${TPCH_DIR}/csv/supplier"
+      if test -d "${FILE}"; then
+          echo " csv files exist ($FILE exists)."
+      else
+          echo " creating csv files using tpchgen-cli binary ..."
+          tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format csv --parts=1 --output-dir "${TPCH_DIR}/csv"
+      fi
+      return
     fi
+
+    echo "Error: unknown format '$FORMAT' for tpch data generation, expected 'parquet' or 'csv'"
+    exit 1
 }
 
 # Runs the tpch benchmark
@@ -611,10 +615,10 @@ run_tpch() {
     echo "Running tpch benchmark..."
 
     FORMAT=$2
-    debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" ${QUERY_ARG}
 }
 
-# Runs the tpch in memory
+# Runs the tpch benchmark in memory (requires tpch parquet data)
 run_tpch_mem() {
     SCALE_FACTOR=$1
     if [ -z "$SCALE_FACTOR" ] ; then
@@ -627,7 +631,7 @@ run_tpch_mem() {
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running tpch_mem benchmark..."
     # -m means in memory
-    debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG}
 }
 
 # Runs the compile profile benchmark helper
 
@@ -48,7 +48,6 @@ enum Options {
     Nlj(nlj::RunOpt),
     SortTpch(sort_tpch::RunOpt),
     Tpch(tpch::RunOpt),
-    TpchConvert(tpch::ConvertOpt),
 }
 
 // Main benchmark runner entrypoint
@@ -65,6 +64,5 @@ pub async fn main() -> Result<()> {
         Options::Nlj(opt) => opt.run().await,
         Options::SortTpch(opt) => opt.run().await,
         Options::Tpch(opt) => Box::pin(opt.run()).await,
-        Options::TpchConvert(opt) => opt.run().await,
     }
 }
@@ -34,7 +34,6 @@ use datafusion::datasource::listing::{
 use datafusion::datasource::{MemTable, TableProvider};
 use datafusion::error::Result;
 use datafusion::execution::memory_pool::FairSpillPool;
-use datafusion::execution::memory_pool::{human_readable_size, units};
 use datafusion::execution::runtime_env::RuntimeEnvBuilder;
 use datafusion::execution::SessionStateBuilder;
 use datafusion::physical_plan::display::DisplayableExecutionPlan;
@@ -44,6 +43,7 @@ use datafusion_benchmarks::util::{BenchmarkRun, CommonOpt, QueryResult};
 use datafusion_common::instant::Instant;
 use datafusion_common::utils::get_available_parallelism;
 use datafusion_common::{exec_err, DEFAULT_PARQUET_EXTENSION};
+use datafusion_common::{human_readable_size, units};
 
 #[derive(Debug, StructOpt)]
 #[structopt(
Original file line number	Diff line number	Diff line change
`@@ -48,7 +48,6 @@ enum Options {`
`48`	`48`	`Nlj(nlj::RunOpt),`
`49`	`49`	`SortTpch(sort_tpch::RunOpt),`
`50`	`50`	`Tpch(tpch::RunOpt),`
`51`		`- TpchConvert(tpch::ConvertOpt),`
`52`	`51`	`}`
`53`	`52`
`54`	`53`	`// Main benchmark runner entrypoint`
`@@ -65,6 +64,5 @@ pub async fn main() -> Result<()> {`
`65`	`64`	`Options::Nlj(opt) => opt.run().await,`
`66`	`65`	`Options::SortTpch(opt) => opt.run().await,`
`67`	`66`	`Options::Tpch(opt) => Box::pin(opt.run()).await,`
`68`		`- Options::TpchConvert(opt) => opt.run().await,`
`69`	`67`	`}`
`70`	`68`	`}`