Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ members = ["benchmarks"]

[workspace.dependencies]
datafusion = { version = "49.0.0" }
datafusion-proto = { version = "49.0.0" }

[package]
name = "datafusion-distributed"
Expand All @@ -11,7 +12,7 @@ edition = "2021"

[dependencies]
datafusion = { workspace = true }
datafusion-proto = { version = "49.0.0" }
datafusion-proto = { workspace = true }
arrow-flight = "55.2.0"
async-trait = "0.1.88"
tokio = { version = "1.46.1", features = ["full"] }
Expand Down
10 changes: 4 additions & 6 deletions benchmarks/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ default-run = "dfbench"

[dependencies]
datafusion = { workspace = true }
datafusion-proto = { workspace = true }
datafusion-distributed = { path = "..", features = ["integration"] }
tokio = { version = "1.46.1", features = ["full"] }
parquet = { version = "55.2.0" }
Expand All @@ -15,14 +16,11 @@ serde = "1.0.219"
serde_json = "1.0.141"
env_logger = "0.11.8"
async-trait = "0.1.88"
datafusion-proto = { version = "49.0.0", optional = true }
chrono = "0.4.41"
futures = "0.3.31"
dashmap = "6.1.0"
prost = "0.13.5"

[[bin]]
name = "dfbench"
path = "src/bin/dfbench.rs"

[features]
ci = [
"datafusion-proto"
]
51 changes: 26 additions & 25 deletions benchmarks/src/tpch/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,21 @@ use datafusion::datasource::file_format::FileFormat;
use datafusion::datasource::listing::{
ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
};
use datafusion::datasource::{MemTable, TableProvider};
use datafusion::datasource::TableProvider;
use datafusion::error::{DataFusionError, Result};
use datafusion::execution::{SessionState, SessionStateBuilder};
use datafusion::physical_plan::display::DisplayableExecutionPlan;
use datafusion::physical_plan::{collect, displayable};
use datafusion::prelude::*;

use crate::util::{print_memory_stats, BenchmarkRun, CommonOpt, QueryResult};
use crate::util::{
BenchmarkRun, CommonOpt, InMemoryCacheExecCodec, InMemoryDataSourceRule, QueryResult,
WarmingUpMarker,
};
use datafusion_distributed::test_utils::localhost::start_localhost_context;
use datafusion_distributed::{
DistributedPhysicalOptimizerRule, DistributedSessionBuilder, DistributedSessionBuilderContext,
DistributedExt, DistributedPhysicalOptimizerRule, DistributedSessionBuilder,
DistributedSessionBuilderContext,
};
use log::info;
use structopt::StructOpt;
Expand Down Expand Up @@ -115,18 +119,23 @@ pub struct RunOpt {
impl DistributedSessionBuilder for RunOpt {
async fn build_session_state(
&self,
_ctx: DistributedSessionBuilderContext,
ctx: DistributedSessionBuilderContext,
) -> Result<SessionState, DataFusionError> {
let mut builder = SessionStateBuilder::new().with_default_features();

let config = self
.common
.config()?
.with_collect_statistics(!self.disable_statistics)
.with_distributed_user_codec(InMemoryCacheExecCodec)
.with_distributed_option_extension_from_headers::<WarmingUpMarker>(&ctx.headers)?
.with_target_partitions(self.partitions());

let rt_builder = self.common.runtime_env_builder()?;

if self.mem_table {
builder = builder.with_physical_optimizer_rule(Arc::new(InMemoryDataSourceRule));
}
if self.distributed {
let mut rule = DistributedPhysicalOptimizerRule::new();
if let Some(partitions_per_task) = self.partitions_per_task {
Expand Down Expand Up @@ -191,8 +200,18 @@ impl RunOpt {

let sql = &get_query_sql(query_id)?;

let single_node_ctx = SessionContext::new();
self.register_tables(&single_node_ctx).await?;
// Warmup the cache for the in-memory mode.
if self.mem_table {
// put the WarmingUpMarker in the context, otherwise, queries will fail as the
// InMemoryCacheExec node will think they should already be warmed up.
let ctx = ctx
.clone()
.with_distributed_option_extension(WarmingUpMarker::warming_up())?;
for query in sql.iter() {
self.execute_query(&ctx, query).await?;
}
println!("Query {query_id} data loaded in memory");
}

for i in 0..self.iterations() {
let start = Instant::now();
Expand Down Expand Up @@ -225,30 +244,12 @@ impl RunOpt {
let avg = millis.iter().sum::<f64>() / millis.len() as f64;
println!("Query {query_id} avg time: {avg:.2} ms");

// Print memory stats using mimalloc (only when compiled with --features mimalloc_extended)
print_memory_stats();

Ok(query_results)
}

async fn register_tables(&self, ctx: &SessionContext) -> Result<()> {
for table in TPCH_TABLES {
let table_provider = { self.get_table(ctx, table).await? };

if self.mem_table {
println!("Loading table '{table}' into memory");
let start = Instant::now();
let memtable =
MemTable::load(table_provider, Some(self.partitions()), &ctx.state()).await?;
println!(
"Loaded table '{}' into memory in {} ms",
table,
start.elapsed().as_millis()
);
ctx.register_table(*table, Arc::new(memtable))?;
} else {
ctx.register_table(*table, table_provider)?;
}
ctx.register_table(*table, self.get_table(ctx, table).await?)?;
}
Ok(())
}
Expand Down
Loading
Loading