datafusion-contrib
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 11 additions & 0 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 0 deletions b/‎.gitignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎Cargo.toml‎
Lines changed: 1 addition & 0 deletions b/‎Cargo.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎benchmarks/gen-clickbench.sh‎
Lines changed: 22 additions & 0 deletions b/‎benchmarks/gen-clickbench.sh‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎benchmarks/src/main.rs‎
Lines changed: 6 additions & 0 deletions b/‎benchmarks/src/main.rs‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎benchmarks/src/prepare_clickbench.rs‎
Lines changed: 33 additions & 0 deletions b/‎benchmarks/src/prepare_clickbench.rs‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎benchmarks/src/run.rs‎
Lines changed: 19 additions & 15 deletions b/‎benchmarks/src/run.rs‎
Lines changed: 19 additions & 15 deletions
diff --git a/‎src/test_utils/clickbench.rs‎
Lines changed: 106 additions & 0 deletions b/‎src/test_utils/clickbench.rs‎
Lines changed: 106 additions & 0 deletions
diff --git a/‎src/test_utils/mod.rs‎
Lines changed: 1 addition & 0 deletions b/‎src/test_utils/mod.rs‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎testdata/clickbench/queries/q0.sql‎
Lines changed: 4 additions & 0 deletions b/‎testdata/clickbench/queries/q0.sql‎
Lines changed: 4 additions & 0 deletions
@@ -59,6 +59,17 @@ jobs:
           key: "main.zip"
       - run: cargo test --features tpcds --test 'tpcds_*'
 
+  clickbench-test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/setup
+      - uses: actions/cache@v4
+        with:
+          path: testdata/clickbench/
+          key: "data"
+      - run: cargo test --features clickbench --test 'clickbench_*'
+
   format-check:
     runs-on: ubuntu-latest
     steps:
 
@@ -6,3 +6,5 @@ testdata/tpch/*
 testdata/tpcds/*
 !testdata/tpcds/queries
 !testdata/tpcds/README.md
+testdata/clickbench/*
+!testdata/clickbench/queries
@@ -67,6 +67,7 @@ integration = [
 
 tpch = ["integration"]
 tpcds = ["integration"]
+clickbench = ["integration"]
 
 [dev-dependencies]
 structopt = "0.3"
 
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+set -e
+
+PARTITION_START=${PARTITION_START:-0}
+PARTITION_END=${PARTITION_END:-100}
+
+echo "Generating ClickBench dataset"
+
+
+# https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data}
+CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
+CLICKBENCH_DIR="${DATA_DIR}/clickbench_${PARTITION_START}-${PARTITION_END}"
+
+echo "Creating clickbench dataset from partition ${PARTITION_START} to ${PARTITION_END}"
+
+# Ensure the target data directory exists
+mkdir -p "${CLICKBENCH_DIR}"
+
+$CARGO_COMMAND -- prepare-clickbench --output "${CLICKBENCH_DIR}" --partition-start "$PARTITION_START" --partition-end "$PARTITION_END"
@@ -1,4 +1,5 @@
 //! DataFusion Distributed benchmark runner
+mod prepare_clickbench;
 mod prepare_tpcds;
 mod prepare_tpch;
 mod run;
@@ -12,6 +13,7 @@ enum Options {
     Run(run::RunOpt),
     PrepareTpch(prepare_tpch::PrepareTpchOpt),
     PrepareTpcds(prepare_tpcds::PrepareTpcdsOpt),
+    PrepareClickbench(prepare_clickbench::PrepareClickBenchOpt),
 }
 
 // Main benchmark runner entrypoint
@@ -28,5 +30,9 @@ pub fn main() -> Result<()> {
             let rt = tokio::runtime::Runtime::new()?;
             rt.block_on(async { opt.run().await })
         }
+        Options::PrepareClickbench(opt) => {
+            let rt = tokio::runtime::Runtime::new()?;
+            rt.block_on(async { opt.run().await })
+        }
     }
 }
@@ -0,0 +1,33 @@
+use datafusion::error::DataFusionError;
+use datafusion_distributed::test_utils::clickbench;
+use std::path::{Path, PathBuf};
+use structopt::StructOpt;
+
+/// Prepare ClickBench parquet files for benchmarks
+// NOTE: structopt turns the `///` comments in this struct into CLI help text, so they are
+// user-facing. Implementation notes belong in plain `//` comments like this one.
+#[derive(Debug, StructOpt)]
+pub struct PrepareClickBenchOpt {
+    /// Output directory where the downloaded parquet files are stored
+    #[structopt(parse(from_os_str), required = true, short = "o", long = "output")]
+    output_path: PathBuf,
+
+    /// Index of the first partition file to download (inclusive). The ClickBench dataset is
+    /// partitioned in 100 files, and you may not want to use all of them for the benchmark.
+    #[structopt(long, default_value = "0")]
+    partition_start: usize,
+
+    /// Index at which to stop downloading partition files (exclusive): the partitions fetched
+    /// are `partition_start..partition_end`. The ClickBench dataset is partitioned in 100 files,
+    /// and you may not want to use all of them for the benchmark.
+    #[structopt(long, default_value = "100")]
+    partition_end: usize,
+}
+
+impl PrepareClickBenchOpt {
+    /// Downloads the partitions `partition_start..partition_end` into `output_path`.
+    ///
+    /// Any failure reported by the download helper is wrapped, using its `Debug` rendering,
+    /// in a [`DataFusionError::Internal`].
+    pub async fn run(self) -> datafusion::common::Result<()> {
+        let output_dir = Path::new(&self.output_path);
+        let partitions = self.partition_start..self.partition_end;
+
+        match clickbench::generate_clickbench_data(output_dir, partitions).await {
+            Ok(()) => Ok(()),
+            Err(e) => Err(DataFusionError::Internal(format!("{e:?}"))),
+        }
+    }
+}
@@ -29,7 +29,7 @@ use datafusion::physical_plan::display::DisplayableExecutionPlan;
 use datafusion::physical_plan::{collect, displayable};
 use datafusion::prelude::*;
 use datafusion_distributed::test_utils::localhost::LocalHostWorkerResolver;
-use datafusion_distributed::test_utils::{tpcds, tpch};
+use datafusion_distributed::test_utils::{clickbench, tpcds, tpch};
 use datafusion_distributed::{
     ArrowFlightEndpoint, DistributedExt, DistributedPhysicalOptimizerRule, NetworkBoundaryExt,
 };
@@ -115,26 +115,27 @@ pub struct RunOpt {
 enum Dataset {
     Tpch,
     Tpcds,
+    Clickbench,
 }
 
 impl Dataset {
     fn infer_from_data_path(path: PathBuf) -> Result<Self, DataFusionError> {
-        if path
-            .iter()
-            .any(|v| v.to_str().is_some_and(|v| v.contains("tpch")))
-        {
-            return Ok(Self::Tpch);
+        fn path_contains(path: &Path, substr: &str) -> bool {
+            path.iter()
+                .any(|v| v.to_str().is_some_and(|v| v.contains(substr)))
         }
-        if path
-            .iter()
-            .any(|v| v.to_str().is_some_and(|v| v.contains("tpcds")))
-        {
-            return Ok(Self::Tpcds);
+        if path_contains(&path, "tpch") {
+            Ok(Self::Tpch)
+        } else if path_contains(&path, "tpcds") {
+            Ok(Self::Tpcds)
+        } else if path_contains(&path, "clickbench") {
+            Ok(Self::Clickbench)
+        } else {
+            not_impl_err!(
+                "Cannot infer benchmark dataset from path {}",
+                path.display()
+            )
         }
-        not_impl_err!(
-            "Cannot infer benchmark dataset from path {}",
-            path.display()
-        )
     }
 
     fn queries(&self) -> Result<Vec<(usize, String)>, DataFusionError> {
@@ -145,6 +146,9 @@ impl Dataset {
             Dataset::Tpcds => (1..99 + 1)
                 .map(|i| Ok((i, tpcds::get_test_tpcds_query(i)?)))
                 .collect(),
+            Dataset::Clickbench => (0..42 + 1)
+                .map(|i| Ok((i, clickbench::get_test_clickbench_query(i)?)))
+                .collect(),
         }
     }
 }
 
@@ -0,0 +1,106 @@
+use datafusion::common::{DataFusionError, internal_datafusion_err, internal_err};
+use datafusion::prelude::{ParquetReadOptions, SessionContext};
+use std::fs;
+use std::io::Write;
+use std::ops::Range;
+use std::path::{Path, PathBuf};
+use tokio::task::JoinSet;
+
+const URL: &str =
+    "https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet";
+
+/// Loads the SQL text of a single ClickBench query by ID (0-42).
+///
+/// The query is read from `testdata/clickbench/queries/q{id}.sql`, resolved relative to the
+/// crate's `CARGO_MANIFEST_DIR`, and returned with surrounding whitespace trimmed.
+///
+/// # Errors
+///
+/// Returns an internal error if the queries directory or the query file does not exist, or if
+/// the file cannot be read.
+pub fn get_test_clickbench_query(id: usize) -> Result<String, DataFusionError> {
+    let queries_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("testdata/clickbench/queries");
+
+    if !queries_dir.exists() {
+        // This message used to say "TPC-DS", a leftover from the loader this was copied from.
+        return internal_err!(
+            "ClickBench queries directory not found: {}",
+            queries_dir.display()
+        );
+    }
+
+    let query_file = queries_dir.join(format!("q{id}.sql"));
+
+    if !query_file.exists() {
+        return internal_err!("Query file not found: {}", query_file.display());
+    }
+
+    let query_sql = fs::read_to_string(&query_file).map_err(|e| {
+        internal_datafusion_err!("Failed to read query file {}: {e}", query_file.display())
+    })?;
+
+    Ok(query_sql.trim().to_string())
+}
+
+/// Downloads partition `i` of the ClickBench `hits` dataset into `dest_path`.
+///
+/// The partition is fetched from [`URL`] with `{}` replaced by `i`. Nothing is downloaded when
+/// `dest_path` already exists; missing parent directories are created.
+///
+/// # Errors
+///
+/// Fails if the parent directory cannot be created, if the request fails or the server answers
+/// with an HTTP error status, or if the file cannot be written.
+async fn download_benchmark(
+    dest_path: PathBuf,
+    i: usize,
+) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+    if dest_path.exists() {
+        return Ok(());
+    }
+
+    // Create directory if it doesn't exist
+    if let Some(parent) = dest_path.parent() {
+        fs::create_dir_all(parent)?;
+    }
+
+    // Download the file. HTTP error statuses (404, 5xx, ...) must be treated as failures,
+    // otherwise the error page would be saved as if it were a parquet file.
+    let response = reqwest::get(URL.replace("{}", &i.to_string()))
+        .await?
+        .error_for_status()?;
+    let bytes = response.bytes().await?;
+
+    // Write to a temporary sibling and rename it once complete. A partially written file at
+    // `dest_path` would be mistaken for a finished download by the `exists()` check above.
+    let mut tmp_name = dest_path
+        .file_name()
+        .map(|name| name.to_os_string())
+        .unwrap_or_default();
+    tmp_name.push(".tmp");
+    let tmp_path = dest_path.with_file_name(tmp_name);
+
+    let written = fs::File::create(&tmp_path)
+        .and_then(|mut file| file.write_all(&bytes))
+        .and_then(|()| fs::rename(&tmp_path, &dest_path));
+    if let Err(e) = written {
+        // Best-effort cleanup; the original I/O error is the one worth reporting.
+        let _ = fs::remove_file(&tmp_path);
+        return Err(e.into());
+    }
+
+    println!("Downloaded to {}", dest_path.display());
+
+    Ok(())
+}
+
+/// Downloads every partition in `range` concurrently into `<dest_path>/hits/<i>.parquet`.
+///
+/// One task per partition is spawned on a [`JoinSet`], and all tasks are awaited before the
+/// outcome is inspected.
+///
+/// # Errors
+///
+/// Returns the first error reported by any download task. Previously the task results were
+/// discarded, so failed downloads were reported as success.
+async fn download_partitioned(
+    dest_path: PathBuf,
+    range: Range<usize>,
+) -> Result<(), Box<dyn std::error::Error>> {
+    let hits_dir = dest_path.join("hits");
+    let mut join_set = JoinSet::new();
+    for i in range {
+        let file_path = hits_dir.join(format!("{i}.parquet"));
+        join_set.spawn(async move { download_benchmark(file_path, i).await });
+    }
+
+    for result in join_set.join_all().await {
+        if let Err(e) = result {
+            // `?` cannot convert `Box<dyn Error + Send + Sync>` into `Box<dyn Error>`, so the
+            // auto-trait bounds are dropped with an explicit coercion.
+            let e: Box<dyn std::error::Error> = e;
+            return Err(e);
+        }
+    }
+    Ok(())
+}
+
+/// Fetches the ClickBench partitions selected by `range` and stores them beneath `dest_path`.
+///
+/// This is a thin public entry point that hands an owned copy of `dest_path` to
+/// `download_partitioned` and returns its result.
+pub async fn generate_clickbench_data(
+    dest_path: &Path,
+    range: Range<usize>,
+) -> Result<(), Box<dyn std::error::Error>> {
+    let target_dir = PathBuf::from(dest_path);
+    download_partitioned(target_dir, range).await
+}
+
+/// Registers every sub-directory of `data_path` as a parquet table in `ctx`.
+///
+/// The table name is the directory's name and the table location is the directory itself, read
+/// with default [`ParquetReadOptions`]. Entries that are not directories are ignored.
+///
+/// # Errors
+///
+/// Returns an error if `data_path` cannot be listed, if a directory name or path is not valid
+/// UTF-8, or if registering a table fails.
+pub async fn register_tables(
+    ctx: &SessionContext,
+    data_path: &Path,
+) -> Result<(), DataFusionError> {
+    for entry in fs::read_dir(data_path)? {
+        let path = entry?.path();
+        if !path.is_dir() {
+            continue;
+        }
+
+        // Report non-UTF-8 names as errors instead of panicking on `unwrap()`.
+        let table_name = path
+            .file_name()
+            .and_then(|name| name.to_str())
+            .ok_or_else(|| {
+                internal_datafusion_err!(
+                    "Table directory name is not valid UTF-8: {}",
+                    path.display()
+                )
+            })?;
+        let table_path = path.to_str().ok_or_else(|| {
+            internal_datafusion_err!("Table path is not valid UTF-8: {}", path.display())
+        })?;
+
+        ctx.register_parquet(table_name, table_path, ParquetReadOptions::default())
+            .await?;
+    }
+    Ok(())
+}
@@ -1,3 +1,4 @@
+pub mod clickbench;
 pub mod in_memory_channel_resolver;
 pub mod insta;
 pub mod localhost;
 
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+
+-- set datafusion.execution.parquet.binary_as_string = true
+SELECT COUNT(*) FROM hits;
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`//! DataFusion Distributed benchmark runner`
	`2`	`+mod prepare_clickbench;`
`2`	`3`	`mod prepare_tpcds;`
`3`	`4`	`mod prepare_tpch;`
`4`	`5`	`mod run;`
`@@ -12,6 +13,7 @@ enum Options {`
`12`	`13`	`Run(run::RunOpt),`
`13`	`14`	`PrepareTpch(prepare_tpch::PrepareTpchOpt),`
`14`	`15`	`PrepareTpcds(prepare_tpcds::PrepareTpcdsOpt),`
	`16`	`+ PrepareClickbench(prepare_clickbench::PrepareClickBenchOpt),`
`15`	`17`	`}`
`16`	`18`
`17`	`19`	`// Main benchmark runner entrypoint`
`@@ -28,5 +30,9 @@ pub fn main() -> Result<()> {`
`28`	`30`	`let rt = tokio::runtime::Runtime::new()?;`
`29`	`31`	`rt.block_on(async { opt.run().await })`
`30`	`32`	`}`
	`33`	`+ Options::PrepareClickbench(opt) => {`
	`34`	`+ let rt = tokio::runtime::Runtime::new()?;`
	`35`	`+ rt.block_on(async { opt.run().await })`
	`36`	`+ }`
`31`	`37`	`}`
`32`	`38`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+pub mod clickbench;`
`1`	`2`	`pub mod in_memory_channel_resolver;`
`2`	`3`	`pub mod insta;`
`3`	`4`	`pub mod localhost;`