Skip to content

Commit 3793971

Browse files
committed
chore(cubestore): Upgrade DF: fix join requirement extraction and PlanProperties for ClusterSend
1 parent 6316cfd commit 3793971

File tree

6 files changed

+84
-71
lines changed

6 files changed

+84
-71
lines changed

rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ pub fn push_aggregate_to_workers(
3434
// Router plan, replace partial aggregate with cluster send.
3535
Ok(Arc::new(
3636
cs.with_changed_schema(
37-
agg.schema().clone(),
3837
p.clone()
3938
.with_new_children(vec![cs.input_for_optimizations.clone()])?,
4039
),
@@ -43,7 +42,6 @@ pub fn push_aggregate_to_workers(
4342
// Worker plan, execute partial aggregate inside the worker.
4443
Ok(Arc::new(WorkerExec {
4544
input: p.clone().with_new_children(vec![w.input.clone()])?,
46-
schema: agg.schema().clone(),
4745
max_batch_rows: w.max_batch_rows,
4846
limit_and_reverse: w.limit_and_reverse.clone(),
4947
}))

rust/cubestore/cubestore/src/queryplanner/optimizations/rewrite_plan.rs

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,23 @@ pub fn rewrite_plan_impl<'a, R: PlanRewriter>(
2525
let updated_ctx = f.enter_node(&p, ctx);
2626
let ctx = updated_ctx.as_ref().unwrap_or(ctx);
2727

28-
p.map_children(|c| rewrite_plan_impl(c, ctx, f))?
29-
.transform_parent(|n| f.rewrite(n, ctx).map(|new| Transformed::yes(new)))
28+
let join_context = match &p {
29+
LogicalPlan::Join(Join { left, right, .. }) => vec![
30+
(left.clone(), f.enter_join_left(&p, ctx)),
31+
(right.clone(), f.enter_join_right(&p, ctx)),
32+
],
33+
_ => Vec::new(),
34+
};
35+
36+
p.map_children(|c| {
37+
let next_ctx = join_context
38+
.iter()
39+
.find(|(n, _)| n.as_ref() == &c)
40+
.and_then(|(_, join_ctx)| join_ctx.as_ref())
41+
.unwrap_or(ctx);
42+
rewrite_plan_impl(c, next_ctx, f)
43+
})?
44+
.transform_parent(|n| f.rewrite(n, ctx).map(|new| Transformed::yes(new)))
3045

3146
// // First, update children.
3247
// let updated = match p {

rust/cubestore/cubestore/src/queryplanner/panic.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,6 @@ impl ExecutionPlan for PanicWorkerExec {
143143
pub fn plan_panic_worker() -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
144144
Ok(Arc::new(WorkerExec {
145145
input: Arc::new(PanicWorkerExec::new()),
146-
schema: Arc::new(Schema::empty()),
147146
max_batch_rows: 1,
148147
limit_and_reverse: None,
149148
}))

rust/cubestore/cubestore/src/queryplanner/planning.rs

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -613,7 +613,7 @@ impl PlanRewriter for CollectConstraints {
613613
}
614614
join_on
615615
.iter()
616-
.map(|(l, _)| match l {
616+
.map(|(_, r)| match r {
617617
Expr::Column(c) => Some(c.name.to_string()),
618618
_ => None,
619619
})
@@ -1593,7 +1593,6 @@ impl ExtensionPlanner for CubeExtensionPlanner {
15931593
Ok(Some(self.plan_cluster_send(
15941594
input.clone(),
15951595
&cs.snapshots,
1596-
input.schema(),
15971596
false,
15981597
usize::MAX,
15991598
cs.limit_and_reverse.clone(),
@@ -1617,18 +1616,16 @@ impl CubeExtensionPlanner {
16171616
&self,
16181617
mut input: Arc<dyn ExecutionPlan>,
16191618
snapshots: &Vec<Snapshots>,
1620-
schema: SchemaRef,
16211619
use_streaming: bool,
16221620
max_batch_rows: usize,
16231621
limit_and_reverse: Option<(usize, bool)>,
16241622
) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
16251623
if snapshots.is_empty() {
1626-
return Ok(Arc::new(EmptyExec::new(schema)));
1624+
return Ok(Arc::new(EmptyExec::new(input.schema())));
16271625
}
16281626
// Note that MergeExecs are added automatically when needed.
16291627
if let Some(c) = self.cluster.as_ref() {
16301628
Ok(Arc::new(ClusterSendExec::new(
1631-
schema,
16321629
c.clone(),
16331630
self.serialized_plan.clone(),
16341631
snapshots,
@@ -1638,7 +1635,6 @@ impl CubeExtensionPlanner {
16381635
} else {
16391636
Ok(Arc::new(WorkerExec {
16401637
input,
1641-
schema,
16421638
max_batch_rows,
16431639
limit_and_reverse,
16441640
}))
@@ -1651,9 +1647,6 @@ impl CubeExtensionPlanner {
16511647
#[derive(Debug)]
16521648
pub struct WorkerExec {
16531649
pub input: Arc<dyn ExecutionPlan>,
1654-
// TODO: remove and use `self.input.schema()`
1655-
// This is a hacky workaround for wrong schema of joins after projection pushdown.
1656-
pub schema: SchemaRef,
16571650
pub max_batch_rows: usize,
16581651
pub limit_and_reverse: Option<(usize, bool)>,
16591652
}
@@ -1670,10 +1663,6 @@ impl ExecutionPlan for WorkerExec {
16701663
self
16711664
}
16721665

1673-
fn schema(&self) -> SchemaRef {
1674-
self.schema.clone()
1675-
}
1676-
16771666
fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
16781667
vec![&self.input]
16791668
}
@@ -1683,9 +1672,9 @@ impl ExecutionPlan for WorkerExec {
16831672
children: Vec<Arc<dyn ExecutionPlan>>,
16841673
) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
16851674
assert_eq!(children.len(), 1);
1675+
let input = children.into_iter().next().unwrap();
16861676
Ok(Arc::new(WorkerExec {
1687-
input: children.into_iter().next().unwrap(),
1688-
schema: self.schema.clone(),
1677+
input,
16891678
max_batch_rows: self.max_batch_rows,
16901679
limit_and_reverse: self.limit_and_reverse.clone(),
16911680
}))

rust/cubestore/cubestore/src/queryplanner/pretty_printers.rs

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,10 @@ use std::sync::Arc;
1818

1919
use crate::queryplanner::check_memory::CheckMemoryExec;
2020
use crate::queryplanner::filter_by_key_range::FilterByKeyRangeExec;
21+
use crate::queryplanner::merge_sort::LastRowByUniqueKeyExec;
2122
use crate::queryplanner::panic::{PanicWorkerExec, PanicWorkerNode};
2223
use crate::queryplanner::planning::{ClusterSendNode, Snapshot, WorkerExec};
24+
use crate::queryplanner::providers::InfoSchemaQueryCacheTableProvider;
2325
use crate::queryplanner::query_executor::{
2426
ClusterSendExec, CubeTable, CubeTableExec, InlineTableProvider,
2527
};
@@ -31,13 +33,13 @@ use crate::queryplanner::trace_data_loaded::TraceDataLoadedExec;
3133
use crate::queryplanner::{CubeTableLogical, InfoSchemaTableProvider};
3234
use datafusion::physical_plan::empty::EmptyExec;
3335
use datafusion::physical_plan::expressions::Column;
34-
use datafusion::physical_plan::joins::HashJoinExec;
36+
use datafusion::physical_plan::joins::{HashJoinExec, SortMergeJoinExec};
3537
use datafusion::physical_plan::memory::MemoryExec;
3638
use datafusion::physical_plan::projection::ProjectionExec;
3739
use datafusion::physical_plan::repartition::RepartitionExec;
3840
use datafusion::physical_plan::sorts::sort::SortExec;
41+
use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
3942
use datafusion::physical_plan::union::UnionExec;
40-
use crate::queryplanner::providers::InfoSchemaQueryCacheTableProvider;
4143

4244
#[derive(Default, Clone, Copy)]
4345
pub struct PPOptions {
@@ -306,7 +308,10 @@ fn pp_source(t: Arc<dyn TableProvider>) -> String {
306308
format!("InlineTableProvider(data: {} rows)", t.get_data().len())
307309
} else if let Some(t) = t.as_any().downcast_ref::<InfoSchemaTableProvider>() {
308310
format!("InfoSchemaTableProvider(table: {:?})", t.table)
309-
} else if let Some(_) = t.as_any().downcast_ref::<InfoSchemaQueryCacheTableProvider>() {
311+
} else if let Some(_) = t
312+
.as_any()
313+
.downcast_ref::<InfoSchemaQueryCacheTableProvider>()
314+
{
310315
"InfoSchemaQueryCacheTableProvider".to_string()
311316
} else {
312317
panic!("unknown table provider");
@@ -400,7 +405,7 @@ fn pp_phys_plan_indented(p: &dyn ExecutionPlan, indent: usize, o: &PPOptions, ou
400405
AggregateMode::Single => "Single",
401406
AggregateMode::SinglePartitioned => "SinglePartitioned",
402407
};
403-
*out += &format!("{}{}Aggregate", mode, strat);
408+
*out += &format!("{}{}Aggregate", strat, mode);
404409
if o.show_aggregations {
405410
*out += &format!(", aggs: {:?}", agg.aggr_expr())
406411
}
@@ -484,18 +489,17 @@ fn pp_phys_plan_indented(p: &dyn ExecutionPlan, indent: usize, o: &PPOptions, ou
484489
// TODO upgrade DF
485490
// } else if let Some(_) = a.downcast_ref::<MergeExec>() {
486491
// *out += "Merge";
487-
// } else if let Some(_) = a.downcast_ref::<MergeSortExec>() {
488-
// *out += "MergeSort";
492+
} else if let Some(_) = a.downcast_ref::<SortPreservingMergeExec>() {
493+
*out += "MergeSort";
489494
// } else if let Some(_) = a.downcast_ref::<MergeReSortExec>() {
490495
// *out += "MergeResort";
491-
// } else if let Some(j) = a.downcast_ref::<MergeJoinExec>() {
492-
// *out += &format!(
493-
// "MergeJoin, on: [{}]",
494-
// j.join_on()
495-
// .iter()
496-
// .map(|(l, r)| format!("{} = {}", l, r))
497-
// .join(", ")
498-
// );
496+
} else if let Some(j) = a.downcast_ref::<SortMergeJoinExec>() {
497+
*out += &format!(
498+
"MergeJoin, on: [{}]",
499+
j.on.iter()
500+
.map(|(l, r)| format!("{} = {}", l, r))
501+
.join(", ")
502+
);
499503
// } else if let Some(j) = a.downcast_ref::<CrossJoinExec>() {
500504
// *out += &format!("CrossJoin, on: {}", j.on)
501505
// } else if let Some(j) = a.downcast_ref::<CrossJoinAggExec>() {
@@ -522,8 +526,8 @@ fn pp_phys_plan_indented(p: &dyn ExecutionPlan, indent: usize, o: &PPOptions, ou
522526
// *out += "SkipRows";
523527
// } else if let Some(_) = a.downcast_ref::<RollingWindowAggExec>() {
524528
// *out += "RollingWindowAgg";
525-
// } else if let Some(_) = a.downcast_ref::<LastRowByUniqueKeyExec>() {
526-
// *out += "LastRowByUniqueKey";
529+
} else if let Some(_) = a.downcast_ref::<LastRowByUniqueKeyExec>() {
530+
*out += "LastRowByUniqueKey";
527531
} else if let Some(_) = a.downcast_ref::<MemoryExec>() {
528532
*out += "MemoryScan";
529533
} else if let Some(r) = a.downcast_ref::<RepartitionExec>() {
@@ -533,6 +537,9 @@ fn pp_phys_plan_indented(p: &dyn ExecutionPlan, indent: usize, o: &PPOptions, ou
533537
*out += &to_string.split(" ").next().unwrap_or(&to_string);
534538
}
535539

540+
// TODO upgrade DF - remove
541+
// *out += &format!(", schema: {}", p.schema());
542+
536543
// TODO upgrade DF
537544
// if o.show_output_hints {
538545
// let hints = p.output_hints();

rust/cubestore/cubestore/src/queryplanner/query_executor.rs

Lines changed: 40 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -360,13 +360,9 @@ impl QueryExecutorImpl {
360360
0,
361361
Arc::new(PreOptimizeRule::new(self.memory_handler.clone(), None)),
362362
);
363+
let config = Self::session_config();
363364
let session_state = SessionStateBuilder::new()
364-
.with_config(
365-
SessionConfig::new()
366-
.with_batch_size(4096)
367-
// TODO upgrade DF fails if bigger than 1
368-
.with_target_partitions(1),
369-
)
365+
.with_config(config)
370366
.with_runtime_env(runtime)
371367
.with_default_features()
372368
.with_query_planner(Arc::new(CubeQueryPlanner::new_on_router(
@@ -394,13 +390,9 @@ impl QueryExecutorImpl {
394390
data_loaded_size.clone(),
395391
)),
396392
);
393+
let config = Self::session_config();
397394
let session_state = SessionStateBuilder::new()
398-
.with_config(
399-
SessionConfig::new()
400-
.with_batch_size(4096)
401-
// TODO upgrade DF fails if bigger than 1
402-
.with_target_partitions(1),
403-
)
395+
.with_config(config)
404396
.with_runtime_env(runtime)
405397
.with_default_features()
406398
.with_query_planner(Arc::new(CubeQueryPlanner::new_on_worker(
@@ -413,6 +405,16 @@ impl QueryExecutorImpl {
413405
let ctx = SessionContext::new_with_state(session_state);
414406
Ok(Arc::new(ctx))
415407
}
408+
409+
fn session_config() -> SessionConfig {
410+
let mut config = SessionConfig::new()
411+
.with_batch_size(4096)
412+
// TODO upgrade DF: if target_partitions is less than 2, there will be no MergeJoin. Decide on repartitioning.
413+
.with_target_partitions(2)
414+
.with_prefer_existing_sort(true);
415+
config.options_mut().optimizer.prefer_hash_join = false;
416+
config
417+
}
416418
}
417419

418420
#[derive(Clone, Serialize, Deserialize)]
@@ -1144,7 +1146,6 @@ impl Debug for InlineTableProvider {
11441146
}
11451147

11461148
pub struct ClusterSendExec {
1147-
schema: SchemaRef,
11481149
properties: PlanProperties,
11491150
pub partitions: Vec<(
11501151
/*node*/ String,
@@ -1171,7 +1172,6 @@ pub enum InlineCompoundPartition {
11711172

11721173
impl ClusterSendExec {
11731174
pub fn new(
1174-
schema: SchemaRef,
11751175
cluster: Arc<dyn Cluster>,
11761176
serialized_plan: Arc<SerializedPlan>,
11771177
union_snapshots: &[Snapshots],
@@ -1183,13 +1183,10 @@ impl ClusterSendExec {
11831183
union_snapshots,
11841184
&serialized_plan.planning_meta().multi_part_subtree,
11851185
)?;
1186-
let eq_properties = EquivalenceProperties::new(schema.clone());
11871186
Ok(Self {
1188-
schema,
1189-
properties: PlanProperties::new(
1190-
eq_properties,
1191-
Partitioning::UnknownPartitioning(partitions.len()),
1192-
ExecutionMode::Bounded,
1187+
properties: Self::compute_properties(
1188+
input_for_optimizations.properties(),
1189+
partitions.len(),
11931190
),
11941191
partitions,
11951192
cluster,
@@ -1199,6 +1196,17 @@ impl ClusterSendExec {
11991196
})
12001197
}
12011198

1199+
fn compute_properties(
1200+
input_properties: &PlanProperties,
1201+
partitions_num: usize,
1202+
) -> PlanProperties {
1203+
PlanProperties::new(
1204+
input_properties.eq_properties.clone(),
1205+
Partitioning::UnknownPartitioning(partitions_num),
1206+
input_properties.execution_mode.clone(),
1207+
)
1208+
}
1209+
12021210
pub(crate) fn distribute_to_workers(
12031211
config: &dyn ConfigObj,
12041212
snapshots: &[Snapshots],
@@ -1406,14 +1414,12 @@ impl ClusterSendExec {
14061414
r
14071415
}
14081416

1409-
pub fn with_changed_schema(
1410-
&self,
1411-
schema: SchemaRef,
1412-
input_for_optimizations: Arc<dyn ExecutionPlan>,
1413-
) -> Self {
1417+
pub fn with_changed_schema(&self, input_for_optimizations: Arc<dyn ExecutionPlan>) -> Self {
14141418
ClusterSendExec {
1415-
schema,
1416-
properties: self.properties.clone(),
1419+
properties: Self::compute_properties(
1420+
input_for_optimizations.properties(),
1421+
self.partitions.len(),
1422+
),
14171423
partitions: self.partitions.clone(),
14181424
cluster: self.cluster.clone(),
14191425
serialized_plan: self.serialized_plan.clone(),
@@ -1462,10 +1468,6 @@ impl ExecutionPlan for ClusterSendExec {
14621468
self
14631469
}
14641470

1465-
fn schema(&self) -> SchemaRef {
1466-
self.schema.clone()
1467-
}
1468-
14691471
fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
14701472
vec![&self.input_for_optimizations]
14711473
}
@@ -1479,8 +1481,10 @@ impl ExecutionPlan for ClusterSendExec {
14791481
}
14801482
let input_for_optimizations = children.into_iter().next().unwrap();
14811483
Ok(Arc::new(ClusterSendExec {
1482-
schema: self.schema.clone(),
1483-
properties: self.properties.clone(),
1484+
properties: Self::compute_properties(
1485+
input_for_optimizations.properties(),
1486+
self.partitions.len(),
1487+
),
14841488
partitions: self.partitions.clone(),
14851489
cluster: self.cluster.clone(),
14861490
serialized_plan: self.serialized_plan.clone(),
@@ -1500,7 +1504,7 @@ impl ExecutionPlan for ClusterSendExec {
15001504
let plan = self.serialized_plan_for_partitions(partitions);
15011505

15021506
let cluster = self.cluster.clone();
1503-
let schema = self.schema.clone();
1507+
let schema = self.properties.eq_properties.schema().clone();
15041508
let node_name = node_name.to_string();
15051509
if self.use_streaming {
15061510
// A future that yields a stream
@@ -1554,7 +1558,8 @@ impl fmt::Debug for ClusterSendExec {
15541558
fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), fmt::Error> {
15551559
f.write_fmt(format_args!(
15561560
"ClusterSendExec: {:?}: {:?}",
1557-
self.schema, self.partitions
1561+
self.properties.eq_properties.schema(),
1562+
self.partitions
15581563
))
15591564
}
15601565
}

0 commit comments

Comments
 (0)