From 32c4c24a7616fb4d52d8ab4c07b24b2db68999d7 Mon Sep 17 00:00:00 2001 From: Yuanxin Cao Date: Tue, 19 Nov 2024 19:08:01 -0500 Subject: [PATCH 1/6] remove join --- optd-cost-model/src/cost/join.rs | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 optd-cost-model/src/cost/join.rs diff --git a/optd-cost-model/src/cost/join.rs b/optd-cost-model/src/cost/join.rs deleted file mode 100644 index e69de29..0000000 From a4768d9a958e9cf6c45f71cb135f991f9d1d2b1b Mon Sep 17 00:00:00 2001 From: Yuanxin Cao Date: Tue, 19 Nov 2024 19:08:59 -0500 Subject: [PATCH 2/6] add get_table_row_count in storage manager --- optd-cost-model/src/lib.rs | 9 ++++++++- optd-cost-model/src/storage/mock.rs | 5 +++++ optd-cost-model/src/storage/mod.rs | 2 ++ optd-cost-model/src/storage/persistent.rs | 9 +++++++++ 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/optd-cost-model/src/lib.rs b/optd-cost-model/src/lib.rs index 13774b2..336a8e9 100644 --- a/optd-cost-model/src/lib.rs +++ b/optd-cost-model/src/lib.rs @@ -52,6 +52,7 @@ pub enum SemanticError { pub enum CostModelError { ORMError(BackendError), SemanticError(SemanticError), + SerdeError(serde_json::Error), } impl From for CostModelError { @@ -66,6 +67,12 @@ impl From for CostModelError { } } +impl From for CostModelError { + fn from(err: serde_json::Error) -> Self { + CostModelError::SerdeError(err) + } +} + pub trait CostModel: 'static + Send + Sync { /// TODO: documentation fn compute_operation_cost( @@ -85,7 +92,7 @@ pub trait CostModel: 'static + Send + Sync { &self, node: PhysicalNodeType, predicates: &[ArcPredicateNode], - children_statistics: &[Option<&EstimatedStatistic>], + children_stats: &[Option<&EstimatedStatistic>], context: ComputeCostContext, ) -> CostModelResult; diff --git a/optd-cost-model/src/storage/mock.rs b/optd-cost-model/src/storage/mock.rs index d878bcb..e2c9b1e 100644 --- a/optd-cost-model/src/storage/mock.rs +++ b/optd-cost-model/src/storage/mock.rs @@ -58,4 +58,9 @@ impl CostModelStorageManager for CostModelStorageMockManagerImpl { }, } } + + async fn get_table_row_count(&self, table_id: TableId) -> CostModelResult> { + let table_stats = self.per_table_stats_map.get(&table_id); + Ok(table_stats.map(|stats| stats.row_cnt)) + } } diff --git a/optd-cost-model/src/storage/mod.rs b/optd-cost-model/src/storage/mod.rs index d3d26cd..311da44 100644 --- a/optd-cost-model/src/storage/mod.rs +++ b/optd-cost-model/src/storage/mod.rs @@ -10,4 +10,6 @@ pub trait CostModelStorageManager { table_id: TableId, attr_base_indices: &[u64], ) -> CostModelResult>; + + async fn get_table_row_count(&self, table_id: TableId) -> CostModelResult>; } diff --git a/optd-cost-model/src/storage/persistent.rs b/optd-cost-model/src/storage/persistent.rs index e029270..e9459f5 100644 --- a/optd-cost-model/src/storage/persistent.rs +++ b/optd-cost-model/src/storage/persistent.rs @@ -116,5 +116,14 @@ impl CostModelStorageManager ))) } + async fn get_table_row_count(&self, table_id: TableId) -> CostModelResult> { + Ok(self + .backend_manager + .get_stats_for_table(table_id.into(), StatType::TableRowCount, None) + .await? + .map(|json| serde_json::from_value(json)) + .transpose()?) + } + // TODO: Support querying for a specific type of statistics. } From c864fbc1d3580d6779138acb0ab0489ee7c8055a Mon Sep 17 00:00:00 2001 From: Yuanxin Cao Date: Tue, 19 Nov 2024 19:22:37 -0500 Subject: [PATCH 3/6] make join statistic computation accept EstimatedStatistic as parameter --- optd-cost-model/src/cost/join/hash_join.rs | 10 +++++----- optd-cost-model/src/cost/join/nested_loop_join.rs | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/optd-cost-model/src/cost/join/hash_join.rs b/optd-cost-model/src/cost/join/hash_join.rs index 47c9ebd..dff6b31 100644 --- a/optd-cost-model/src/cost/join/hash_join.rs +++ b/optd-cost-model/src/cost/join/hash_join.rs @@ -20,8 +20,8 @@ impl CostModelImpl { &self, join_typ: JoinType, group_id: GroupId, - left_row_cnt: f64, - right_row_cnt: f64, + left_row_cnt: EstimatedStatistic, + right_row_cnt: EstimatedStatistic, left_group_id: GroupId, right_group_id: GroupId, left_keys: ListPred, @@ -42,14 +42,14 @@ impl CostModelImpl { right_keys, output_attr_refs.attr_refs(), input_correlation, - left_row_cnt, - right_row_cnt, + left_row_cnt.0, + right_row_cnt.0, left_attr_cnt, ) .await? }; Ok(EstimatedStatistic( - (left_row_cnt * right_row_cnt * selectivity).max(1.0), + (left_row_cnt.0 * right_row_cnt.0 * selectivity).max(1.0), )) } } diff --git a/optd-cost-model/src/cost/join/nested_loop_join.rs b/optd-cost-model/src/cost/join/nested_loop_join.rs index ebb70c9..9a56ca4 100644 --- a/optd-cost-model/src/cost/join/nested_loop_join.rs +++ b/optd-cost-model/src/cost/join/nested_loop_join.rs @@ -18,8 +18,8 @@ impl CostModelImpl { &self, join_typ: JoinType, group_id: GroupId, - left_row_cnt: f64, - right_row_cnt: f64, + left_row_cnt: EstimatedStatistic, + right_row_cnt: EstimatedStatistic, left_group_id: GroupId, right_group_id: GroupId, join_cond: ArcPredicateNode, @@ -36,13 +36,13 @@ impl CostModelImpl { join_cond, output_attr_refs.attr_refs(), input_correlation, - left_row_cnt, - right_row_cnt, + left_row_cnt.0, + right_row_cnt.0, ) .await? }; Ok(EstimatedStatistic( - (left_row_cnt * right_row_cnt * selectivity).max(1.0), + (left_row_cnt.0 * right_row_cnt.0 * selectivity).max(1.0), )) } } From 49baa5b9a6dba25b880c1dc467c5f477652bcb84 Mon Sep 17 00:00:00 2001 From: Yuanxin Cao Date: Tue, 19 Nov 2024 19:27:45 -0500 Subject: [PATCH 4/6] remove allow unused --- optd-cost-model/src/cost/agg.rs | 17 ++++++----------- optd-cost-model/src/cost/filter/comp_op.rs | 2 +- optd-cost-model/src/cost/filter/core.rs | 9 ++------- optd-cost-model/src/cost/filter/in_list.rs | 11 ++--------- optd-cost-model/src/cost/filter/like.rs | 5 +---- optd-cost-model/src/cost/filter/log_op.rs | 2 +- optd-cost-model/src/cost/join/core.rs | 15 +++++++-------- optd-cost-model/src/cost/join/hash_join.rs | 9 +-------- .../src/cost/join/nested_loop_join.rs | 4 +--- optd-cost-model/src/cost/mod.rs | 2 -- 10 files changed, 22 insertions(+), 54 deletions(-) diff --git a/optd-cost-model/src/cost/agg.rs b/optd-cost-model/src/cost/agg.rs index f5edc7a..169a3b0 100644 --- a/optd-cost-model/src/cost/agg.rs +++ b/optd-cost-model/src/cost/agg.rs @@ -8,7 +8,7 @@ use crate::{ cost_model::CostModelImpl, stats::DEFAULT_NUM_DISTINCT, storage::CostModelStorageManager, - CostModelError, CostModelResult, EstimatedStatistic, SemanticError, + CostModelResult, EstimatedStatistic, SemanticError, }; impl CostModelImpl { @@ -67,19 +67,14 @@ impl CostModelImpl { #[cfg(test)] mod tests { - use std::{collections::HashMap, ops::Deref}; + use std::collections::HashMap; use crate::{ - common::{ - predicates::constant_pred::ConstantType, - properties::Attribute, - types::{GroupId, TableId}, - values::Value, - }, + common::predicates::constant_pred::ConstantType, cost_model::tests::{ - attr_index, cnst, create_mock_cost_model, create_mock_cost_model_with_attr_types, - empty_list, empty_per_attr_stats, list, TestPerAttributeStats, TEST_ATTR1_BASE_INDEX, - TEST_ATTR2_BASE_INDEX, TEST_ATTR3_BASE_INDEX, TEST_GROUP1_ID, TEST_TABLE1_ID, + attr_index, create_mock_cost_model_with_attr_types, empty_list, list, + TestPerAttributeStats, TEST_ATTR1_BASE_INDEX, TEST_ATTR2_BASE_INDEX, + TEST_ATTR3_BASE_INDEX, TEST_GROUP1_ID, TEST_TABLE1_ID, }, stats::{utilities::simple_map::SimpleMap, MostCommonValues, DEFAULT_NUM_DISTINCT}, EstimatedStatistic, diff --git a/optd-cost-model/src/cost/filter/comp_op.rs b/optd-cost-model/src/cost/filter/comp_op.rs index 5270819..526c901 100644 --- a/optd-cost-model/src/cost/filter/comp_op.rs +++ b/optd-cost-model/src/cost/filter/comp_op.rs @@ -14,7 +14,7 @@ use crate::{ cost_model::CostModelImpl, stats::{DEFAULT_EQ_SEL, DEFAULT_INEQ_SEL, UNIMPLEMENTED_SEL}, storage::CostModelStorageManager, - CostModelResult, SemanticError, + CostModelResult, }; impl CostModelImpl { diff --git a/optd-cost-model/src/cost/filter/core.rs b/optd-cost-model/src/cost/filter/core.rs index 05363e4..0ee90bb 100644 --- a/optd-cost-model/src/cost/filter/core.rs +++ b/optd-cost-model/src/cost/filter/core.rs @@ -98,16 +98,11 @@ mod tests { bin_op_pred::BinOpType, constant_pred::ConstantType, log_op_pred::LogOpType, un_op_pred::UnOpType, }, - properties::Attribute, types::TableId, values::Value, }, cost_model::tests::*, - memo_ext::tests::MemoGroupInfo, - stats::{ - utilities::{counter::Counter, simple_map::SimpleMap}, - Distribution, MostCommonValues, DEFAULT_EQ_SEL, - }, + stats::{utilities::simple_map::SimpleMap, Distribution, MostCommonValues, DEFAULT_EQ_SEL}, }; use arrow_schema::DataType; @@ -834,7 +829,7 @@ mod tests { 0, 0.0, ); - let table_id = TableId(0); + let cost_model = create_mock_cost_model_with_attr_types( vec![TEST_TABLE1_ID], vec![HashMap::from([( diff --git a/optd-cost-model/src/cost/filter/in_list.rs b/optd-cost-model/src/cost/filter/in_list.rs index f056fb1..1c8ad46 100644 --- a/optd-cost-model/src/cost/filter/in_list.rs +++ b/optd-cost-model/src/cost/filter/in_list.rs @@ -83,16 +83,9 @@ mod tests { use std::collections::HashMap; use crate::{ - common::{ - types::{GroupId, TableId}, - values::Value, - }, + common::values::Value, cost_model::tests::*, - memo_ext::tests::MemoGroupInfo, - stats::{ - utilities::{counter::Counter, simple_map::SimpleMap}, - MostCommonValues, - }, + stats::{utilities::simple_map::SimpleMap, MostCommonValues}, }; #[tokio::test] diff --git a/optd-cost-model/src/cost/filter/like.rs b/optd-cost-model/src/cost/filter/like.rs index 32800e4..db55caa 100644 --- a/optd-cost-model/src/cost/filter/like.rs +++ b/optd-cost-model/src/cost/filter/like.rs @@ -113,10 +113,7 @@ mod tests { use std::collections::HashMap; use crate::{ - common::{ - types::{GroupId, TableId}, - values::Value, - }, + common::values::Value, cost_model::tests::*, stats::{ utilities::{counter::Counter, simple_map::SimpleMap}, diff --git a/optd-cost-model/src/cost/filter/log_op.rs b/optd-cost-model/src/cost/filter/log_op.rs index 61862a2..381584d 100644 --- a/optd-cost-model/src/cost/filter/log_op.rs +++ b/optd-cost-model/src/cost/filter/log_op.rs @@ -25,7 +25,7 @@ impl CostModelImpl { let mut or_sel_neg = 1.0; for child in children { let selectivity = self.get_filter_selectivity(group_id, child.clone()).await?; - or_sel_neg *= (1.0 - selectivity); + or_sel_neg *= 1.0 - selectivity; } Ok(1.0 - or_sel_neg) } diff --git a/optd-cost-model/src/cost/join/core.rs b/optd-cost-model/src/cost/join/core.rs index c68c1db..5fcaa37 100644 --- a/optd-cost-model/src/cost/join/core.rs +++ b/optd-cost-model/src/cost/join/core.rs @@ -7,13 +7,11 @@ use crate::{ nodes::{ArcPredicateNode, JoinType, PredicateType, ReprPredicateNode}, predicates::{ attr_index_pred::AttrIndexPred, - bin_op_pred::BinOpType, list_pred::ListPred, log_op_pred::{LogOpPred, LogOpType}, }, properties::attr_ref::{ - self, AttrRef, AttrRefs, BaseTableAttrRef, EqPredicate, GroupAttrRefs, - SemanticCorrelation, + AttrRef, AttrRefs, BaseTableAttrRef, EqPredicate, SemanticCorrelation, }, types::GroupId, }, @@ -409,11 +407,12 @@ impl CostModelImpl { mod tests { use std::collections::HashMap; + use attr_ref::GroupAttrRefs; + use crate::{ common::{ - predicates::{attr_index_pred, constant_pred::ConstantType}, - properties::Attribute, - types::TableId, + predicates::bin_op_pred::BinOpType, + properties::{attr_ref, Attribute}, values::Value, }, cost_model::tests::{ @@ -421,8 +420,8 @@ mod tests { create_three_table_mock_cost_model, create_two_table_mock_cost_model, create_two_table_mock_cost_model_custom_row_cnts, empty_per_attr_stats, log_op, per_attr_stats_with_dist_and_ndistinct, per_attr_stats_with_ndistinct, - TestOptCostModelMock, TestPerAttributeStats, TEST_ATTR1_NAME, TEST_ATTR2_NAME, - TEST_TABLE1_ID, TEST_TABLE2_ID, TEST_TABLE3_ID, TEST_TABLE4_ID, + TestOptCostModelMock, TEST_ATTR1_NAME, TEST_ATTR2_NAME, TEST_TABLE1_ID, TEST_TABLE2_ID, + TEST_TABLE3_ID, TEST_TABLE4_ID, }, memo_ext::tests::MemoGroupInfo, stats::DEFAULT_EQ_SEL, diff --git a/optd-cost-model/src/cost/join/hash_join.rs b/optd-cost-model/src/cost/join/hash_join.rs index dff6b31..f92f247 100644 --- a/optd-cost-model/src/cost/join/hash_join.rs +++ b/optd-cost-model/src/cost/join/hash_join.rs @@ -1,12 +1,5 @@ -use itertools::Itertools; - use crate::{ - common::{ - nodes::{JoinType, ReprPredicateNode}, - predicates::{attr_index_pred::AttrIndexPred, list_pred::ListPred}, - properties::attr_ref::{AttrRefs, SemanticCorrelation}, - types::GroupId, - }, + common::{nodes::JoinType, predicates::list_pred::ListPred, types::GroupId}, cost_model::CostModelImpl, storage::CostModelStorageManager, CostModelResult, EstimatedStatistic, diff --git a/optd-cost-model/src/cost/join/nested_loop_join.rs b/optd-cost-model/src/cost/join/nested_loop_join.rs index 9a56ca4..ff6a2b1 100644 --- a/optd-cost-model/src/cost/join/nested_loop_join.rs +++ b/optd-cost-model/src/cost/join/nested_loop_join.rs @@ -1,8 +1,6 @@ use crate::{ common::{ - nodes::{ArcPredicateNode, JoinType, PredicateType, ReprPredicateNode}, - predicates::log_op_pred::{LogOpPred, LogOpType}, - properties::attr_ref::{AttrRefs, SemanticCorrelation}, + nodes::{ArcPredicateNode, JoinType}, types::GroupId, }, cost_model::CostModelImpl, diff --git a/optd-cost-model/src/cost/mod.rs b/optd-cost-model/src/cost/mod.rs index c98d7d7..b55d449 100644 --- a/optd-cost-model/src/cost/mod.rs +++ b/optd-cost-model/src/cost/mod.rs @@ -1,5 +1,3 @@ -#![allow(unused)] - pub mod agg; pub mod filter; pub mod join; From 7f947d37d479cc465cfad83be4cc63d6468c8b9e Mon Sep 17 00:00:00 2001 From: Yuanxin Cao Date: Tue, 19 Nov 2024 19:42:42 -0500 Subject: [PATCH 5/6] implement cost model derive statistics --- Cargo.lock | 1 + optd-cost-model/Cargo.toml | 1 + optd-cost-model/src/cost/join/core.rs | 3 +- optd-cost-model/src/cost_model.rs | 76 +++++++++++++++++++++++---- optd-cost-model/src/lib.rs | 19 +++---- 5 files changed, 80 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b54d901..e81b740 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2246,6 +2246,7 @@ version = "0.1.0" dependencies = [ "arrow-schema 53.2.0", "assert_approx_eq", + "async-trait", "chrono", "crossbeam", "datafusion", diff --git a/optd-cost-model/Cargo.toml b/optd-cost-model/Cargo.toml index e8b22aa..1812f8d 100644 --- a/optd-cost-model/Cargo.toml +++ b/optd-cost-model/Cargo.toml @@ -18,6 +18,7 @@ itertools = "0.13" assert_approx_eq = "1.1.0" trait-variant = "0.1.2" tokio = { version = "1.0.1", features = ["macros", "rt-multi-thread"] } +async-trait = "0.1" [dev-dependencies] crossbeam = "0.8" diff --git a/optd-cost-model/src/cost/join/core.rs b/optd-cost-model/src/cost/join/core.rs index 5fcaa37..fc4bc29 100644 --- a/optd-cost-model/src/cost/join/core.rs +++ b/optd-cost-model/src/cost/join/core.rs @@ -904,7 +904,8 @@ mod tests { expected_inner_sel ); // check the outer sels - assert_outer_selectivities(&cost_model, expr_tree, expr_tree_rev, &attr_refs, 0.25, 0.2); + assert_outer_selectivities(&cost_model, expr_tree, expr_tree_rev, &attr_refs, 0.25, 0.2) + .await; } /// Non-unique oncond means the column is not unique in either table diff --git a/optd-cost-model/src/cost_model.rs b/optd-cost-model/src/cost_model.rs index 4583484..125aa09 100644 --- a/optd-cost-model/src/cost_model.rs +++ b/optd-cost-model/src/cost_model.rs @@ -9,7 +9,8 @@ use optd_persistent::{ use crate::{ common::{ - nodes::{ArcPredicateNode, PhysicalNodeType}, + nodes::{ArcPredicateNode, PhysicalNodeType, ReprPredicateNode}, + predicates::list_pred::ListPred, types::{AttrId, EpochId, ExprId, TableId}, }, memo_ext::MemoExt, @@ -40,28 +41,83 @@ impl CostModelImpl { } } +#[async_trait::async_trait] impl CostModel for CostModelImpl { - fn compute_operation_cost( + async fn compute_operation_cost( &self, node: &PhysicalNodeType, predicates: &[ArcPredicateNode], - children_stats: &[Option<&EstimatedStatistic>], + children_stats: &[EstimatedStatistic], context: ComputeCostContext, ) -> CostModelResult { todo!() } - fn derive_statistics( + async fn derive_statistics( &self, node: PhysicalNodeType, predicates: &[ArcPredicateNode], - children_statistics: &[Option<&EstimatedStatistic>], + children_statistics: &[EstimatedStatistic], context: ComputeCostContext, ) -> CostModelResult { - todo!() + match node { + PhysicalNodeType::PhysicalScan => { + let table_id = TableId(predicates[0].data.as_ref().unwrap().as_u64()); + let row_cnt = self + .storage_manager + .get_table_row_count(table_id) + .await? + .unwrap_or(1) as f64; + Ok(EstimatedStatistic(row_cnt)) + } + PhysicalNodeType::PhysicalEmptyRelation => Ok(EstimatedStatistic(0.01)), + PhysicalNodeType::PhysicalLimit => { + self.get_limit_row_cnt(children_statistics[0].clone(), predicates[1].clone()) + } + PhysicalNodeType::PhysicalFilter => { + self.get_filter_row_cnt( + children_statistics[0].clone(), + context.group_id, + predicates[0].clone(), + ) + .await + } + PhysicalNodeType::PhysicalNestedLoopJoin(join_typ) => { + self.get_nlj_row_cnt( + join_typ, + context.group_id, + children_statistics[0].clone(), + children_statistics[1].clone(), + context.children_group_ids[0], + context.children_group_ids[1], + predicates[0].clone(), + ) + .await + } + PhysicalNodeType::PhysicalHashJoin(join_typ) => { + self.get_hash_join_row_cnt( + join_typ, + context.group_id, + children_statistics[0].clone(), + children_statistics[1].clone(), + context.children_group_ids[0], + context.children_group_ids[1], + ListPred::from_pred_node(predicates[0].clone()).unwrap(), + ListPred::from_pred_node(predicates[1].clone()).unwrap(), + ) + .await + } + PhysicalNodeType::PhysicalAgg => { + self.get_agg_row_cnt(context.group_id, predicates[1].clone()) + .await + } + PhysicalNodeType::PhysicalSort | PhysicalNodeType::PhysicalProjection => { + Ok(children_statistics[0].clone()) + } + } } - fn update_statistics( + async fn update_statistics( &self, stats: Vec, source: String, @@ -70,7 +126,7 @@ impl CostModel for CostModel todo!() } - fn get_table_statistic_for_analysis( + async fn get_table_statistic_for_analysis( &self, table_id: TableId, stat_type: StatType, @@ -79,7 +135,7 @@ impl CostModel for CostModel todo!() } - fn get_attribute_statistic_for_analysis( + async fn get_attribute_statistic_for_analysis( &self, attr_ids: Vec, stat_type: StatType, @@ -88,7 +144,7 @@ impl CostModel for CostModel todo!() } - fn get_cost_for_analysis( + async fn get_cost_for_analysis( &self, expr_id: ExprId, epoch_id: Option, diff --git a/optd-cost-model/src/lib.rs b/optd-cost-model/src/lib.rs index 336a8e9..0b19737 100644 --- a/optd-cost-model/src/lib.rs +++ b/optd-cost-model/src/lib.rs @@ -33,7 +33,7 @@ pub struct Cost(pub Vec); /// Estimated statistic calculated by the cost model. /// It is the estimated output row count of the targeted expression. -#[derive(PartialEq, PartialOrd, Debug)] +#[derive(PartialEq, PartialOrd, Clone, Debug)] pub struct EstimatedStatistic(pub f64); pub type CostModelResult = Result; @@ -73,13 +73,14 @@ impl From for CostModelError { } } +#[async_trait::async_trait] pub trait CostModel: 'static + Send + Sync { /// TODO: documentation - fn compute_operation_cost( + async fn compute_operation_cost( &self, node: &PhysicalNodeType, predicates: &[ArcPredicateNode], - children_stats: &[Option<&EstimatedStatistic>], + children_stats: &[EstimatedStatistic], context: ComputeCostContext, ) -> CostModelResult; @@ -88,18 +89,18 @@ pub trait CostModel: 'static + Send + Sync { /// statistic calculated by the cost model. /// TODO: Consider make it a helper function, so we can store Cost in the /// ORM more easily. - fn derive_statistics( + async fn derive_statistics( &self, node: PhysicalNodeType, predicates: &[ArcPredicateNode], - children_stats: &[Option<&EstimatedStatistic>], + children_stats: &[EstimatedStatistic], context: ComputeCostContext, ) -> CostModelResult; /// TODO: documentation /// It is for **REAL** statistic updates, not for estimated statistics. /// TODO: Change data from String to other types. - fn update_statistics( + async fn update_statistics( &self, stats: Vec, source: String, @@ -107,7 +108,7 @@ pub trait CostModel: 'static + Send + Sync { ) -> CostModelResult<()>; /// TODO: documentation - fn get_table_statistic_for_analysis( + async fn get_table_statistic_for_analysis( &self, table_id: TableId, stat_type: StatType, @@ -115,7 +116,7 @@ pub trait CostModel: 'static + Send + Sync { ) -> CostModelResult>; /// TODO: documentation - fn get_attribute_statistic_for_analysis( + async fn get_attribute_statistic_for_analysis( &self, attr_ids: Vec, stat_type: StatType, @@ -123,7 +124,7 @@ pub trait CostModel: 'static + Send + Sync { ) -> CostModelResult>; /// TODO: documentation - fn get_cost_for_analysis( + async fn get_cost_for_analysis( &self, expr_id: ExprId, epoch_id: Option, From 864b753614724da6a14e995b7d7d17843f6d4e46 Mon Sep 17 00:00:00 2001 From: Yuanxin Cao Date: Tue, 19 Nov 2024 19:52:31 -0500 Subject: [PATCH 6/6] move test-related stuffs to test_utils --- optd-cost-model/src/cost/agg.rs | 4 +- optd-cost-model/src/cost/filter/core.rs | 2 +- optd-cost-model/src/cost/filter/in_list.rs | 2 +- optd-cost-model/src/cost/filter/like.rs | 2 +- optd-cost-model/src/cost/join/core.rs | 6 +- optd-cost-model/src/cost_model.rs | 516 ------------------- optd-cost-model/src/lib.rs | 1 + optd-cost-model/src/memo_ext.rs | 66 --- optd-cost-model/src/storage/persistent.rs | 2 +- optd-cost-model/src/test_utils.rs | 565 +++++++++++++++++++++ 10 files changed, 575 insertions(+), 591 deletions(-) create mode 100644 optd-cost-model/src/test_utils.rs diff --git a/optd-cost-model/src/cost/agg.rs b/optd-cost-model/src/cost/agg.rs index 169a3b0..62c88d3 100644 --- a/optd-cost-model/src/cost/agg.rs +++ b/optd-cost-model/src/cost/agg.rs @@ -71,12 +71,12 @@ mod tests { use crate::{ common::predicates::constant_pred::ConstantType, - cost_model::tests::{ + stats::{utilities::simple_map::SimpleMap, MostCommonValues, DEFAULT_NUM_DISTINCT}, + test_utils::tests::{ attr_index, create_mock_cost_model_with_attr_types, empty_list, list, TestPerAttributeStats, TEST_ATTR1_BASE_INDEX, TEST_ATTR2_BASE_INDEX, TEST_ATTR3_BASE_INDEX, TEST_GROUP1_ID, TEST_TABLE1_ID, }, - stats::{utilities::simple_map::SimpleMap, MostCommonValues, DEFAULT_NUM_DISTINCT}, EstimatedStatistic, }; diff --git a/optd-cost-model/src/cost/filter/core.rs b/optd-cost-model/src/cost/filter/core.rs index 0ee90bb..44651ed 100644 --- a/optd-cost-model/src/cost/filter/core.rs +++ b/optd-cost-model/src/cost/filter/core.rs @@ -101,8 +101,8 @@ mod tests { types::TableId, values::Value, }, - cost_model::tests::*, stats::{utilities::simple_map::SimpleMap, Distribution, MostCommonValues, DEFAULT_EQ_SEL}, + test_utils::tests::*, }; use arrow_schema::DataType; diff --git a/optd-cost-model/src/cost/filter/in_list.rs b/optd-cost-model/src/cost/filter/in_list.rs index 1c8ad46..d8838a8 100644 --- a/optd-cost-model/src/cost/filter/in_list.rs +++ b/optd-cost-model/src/cost/filter/in_list.rs @@ -84,8 +84,8 @@ mod tests { use crate::{ common::values::Value, - cost_model::tests::*, stats::{utilities::simple_map::SimpleMap, MostCommonValues}, + test_utils::tests::*, }; #[tokio::test] diff --git a/optd-cost-model/src/cost/filter/like.rs b/optd-cost-model/src/cost/filter/like.rs index db55caa..ff56f44 100644 --- a/optd-cost-model/src/cost/filter/like.rs +++ b/optd-cost-model/src/cost/filter/like.rs @@ -114,11 +114,11 @@ mod tests { use crate::{ common::values::Value, - cost_model::tests::*, stats::{ utilities::{counter::Counter, simple_map::SimpleMap}, MostCommonValues, FIXED_CHAR_SEL_FACTOR, FULL_WILDCARD_SEL_FACTOR, }, + test_utils::tests::*, }; #[tokio::test] diff --git a/optd-cost-model/src/cost/join/core.rs b/optd-cost-model/src/cost/join/core.rs index fc4bc29..6d10fb9 100644 --- a/optd-cost-model/src/cost/join/core.rs +++ b/optd-cost-model/src/cost/join/core.rs @@ -415,7 +415,9 @@ mod tests { properties::{attr_ref, Attribute}, values::Value, }, - cost_model::tests::{ + stats::DEFAULT_EQ_SEL, + test_utils::tests::MemoGroupInfo, + test_utils::tests::{ attr_index, bin_op, cnst, create_four_table_mock_cost_model, create_mock_cost_model, create_three_table_mock_cost_model, create_two_table_mock_cost_model, create_two_table_mock_cost_model_custom_row_cnts, empty_per_attr_stats, log_op, @@ -423,8 +425,6 @@ mod tests { TestOptCostModelMock, TEST_ATTR1_NAME, TEST_ATTR2_NAME, TEST_TABLE1_ID, TEST_TABLE2_ID, TEST_TABLE3_ID, TEST_TABLE4_ID, }, - memo_ext::tests::MemoGroupInfo, - stats::DEFAULT_EQ_SEL, }; use super::*; diff --git a/optd-cost-model/src/cost_model.rs b/optd-cost-model/src/cost_model.rs index 125aa09..9ae84bb 100644 --- a/optd-cost-model/src/cost_model.rs +++ b/optd-cost-model/src/cost_model.rs @@ -167,519 +167,3 @@ impl CostModelImpl { .await } } - -/// I thought about using the system's own parser and planner to generate these expression trees, -/// but this is not currently feasible because it would create a cyclic dependency between -/// optd-datafusion-bridge and optd-datafusion-repr -#[cfg(test)] -pub mod tests { - use std::{collections::HashMap, hash::Hash}; - - use arrow_schema::DataType; - use itertools::Itertools; - use optd_persistent::cost_model::interface::CatalogSource; - use serde::{Deserialize, Serialize}; - - use crate::{ - common::{ - nodes::ReprPredicateNode, - predicates::{ - attr_index_pred::AttrIndexPred, - bin_op_pred::{BinOpPred, BinOpType}, - cast_pred::CastPred, - constant_pred::{ConstantPred, ConstantType}, - in_list_pred::InListPred, - like_pred::LikePred, - list_pred::ListPred, - log_op_pred::{LogOpPred, LogOpType}, - un_op_pred::{UnOpPred, UnOpType}, - }, - properties::{ - attr_ref::{AttrRef, GroupAttrRefs}, - schema::Schema, - Attribute, - }, - types::GroupId, - values::Value, - }, - memo_ext::tests::{MemoGroupInfo, MockMemoExtImpl}, - stats::{ - utilities::{counter::Counter, simple_map::SimpleMap}, - AttributeCombValueStats, Distribution, MostCommonValues, - }, - storage::mock::{CostModelStorageMockManagerImpl, TableStats}, - }; - - use super::*; - - pub const TEST_TABLE1_ID: TableId = TableId(0); - pub const TEST_TABLE2_ID: TableId = TableId(1); - pub const TEST_TABLE3_ID: TableId = TableId(2); - pub const TEST_TABLE4_ID: TableId = TableId(3); - - pub const TEST_GROUP1_ID: GroupId = GroupId(0); - pub const TEST_GROUP2_ID: GroupId = GroupId(1); - pub const TEST_GROUP3_ID: GroupId = GroupId(2); - pub const TEST_GROUP4_ID: GroupId = GroupId(3); - - // This is base index rather than ref index. - pub const TEST_ATTR1_BASE_INDEX: u64 = 0; - pub const TEST_ATTR2_BASE_INDEX: u64 = 1; - pub const TEST_ATTR3_BASE_INDEX: u64 = 2; - - pub const TEST_ATTR1_NAME: &str = "attr1"; - pub const TEST_ATTR2_NAME: &str = "attr2"; - pub const TEST_ATTR3_NAME: &str = "attr3"; - pub const TEST_ATTR4_NAME: &str = "attr4"; - - pub type TestPerAttributeStats = AttributeCombValueStats; - // TODO: add tests for non-mock storage manager - pub type TestOptCostModelMock = CostModelImpl; - - // Use this method, we only create one group `TEST_GROUP1_ID` in the memo. - // We put the first attribute in the first table as the ref index 0 in the group. - // And put the second attribute in the first table as the ref index 1 in the group. - // etc. - // The orders of attributes and tables are defined by the order of their ids (smaller first). - pub fn create_mock_cost_model( - table_id: Vec, - // u64 should be base attribute index. - per_attribute_stats: Vec>, - row_counts: Vec>, - ) -> TestOptCostModelMock { - let attr_ids: Vec<(TableId, u64, Option)> = per_attribute_stats - .iter() - .enumerate() - .map(|(idx, m)| (table_id[idx], m)) - .flat_map(|(table_id, m)| { - m.iter() - .map(|(attr_idx, _)| (table_id, *attr_idx, None)) - .collect_vec() - }) - .sorted_by_key(|(table_id, attr_idx, _)| (*table_id, *attr_idx)) - .collect(); - create_mock_cost_model_with_memo( - table_id.clone(), - per_attribute_stats, - row_counts, - create_one_group_all_base_attributes_mock_memo(attr_ids), - ) - } - - pub fn create_mock_cost_model_with_attr_types( - table_id: Vec, - // u64 should be base attribute index. - per_attribute_stats: Vec>, - attributes: Vec>, - row_counts: Vec>, - ) -> TestOptCostModelMock { - let attr_ids: Vec<(TableId, u64, Option)> = attributes - .iter() - .enumerate() - .map(|(idx, m)| (table_id[idx], m)) - .flat_map(|(table_id, m)| { - m.iter() - .map(|(attr_idx, typ)| (table_id, *attr_idx, Some(*typ))) - .collect_vec() - }) - .sorted_by_key(|(table_id, attr_idx, _)| (*table_id, *attr_idx)) - .collect(); - create_mock_cost_model_with_memo( - table_id.clone(), - per_attribute_stats, - row_counts, - create_one_group_all_base_attributes_mock_memo(attr_ids), - ) - } - - pub fn create_mock_cost_model_with_memo( - table_id: Vec, - per_attribute_stats: Vec>, - row_counts: Vec>, - memo: MockMemoExtImpl, - ) -> TestOptCostModelMock { - let storage_manager = CostModelStorageMockManagerImpl::new( - table_id - .into_iter() - .zip(per_attribute_stats) - .zip(row_counts) - .map(|((table_id, per_attr_stats), row_count)| { - ( - table_id, - TableStats::new( - row_count.unwrap_or(100), - per_attr_stats - .into_iter() - .map(|(attr_idx, stats)| (vec![attr_idx], stats)) - .collect(), - ), - ) - }) - .collect(), - ); - CostModelImpl::new(storage_manager, CatalogSource::Mock, Arc::new(memo)) - } - - // attributes: Vec<(TableId, AttrBaseIndex)> - pub fn create_one_group_all_base_attributes_mock_memo( - attr_ids: Vec<(TableId, u64, Option)>, - ) -> MockMemoExtImpl { - let group_info = MemoGroupInfo::new( - Schema::new( - attr_ids - .clone() - .into_iter() - .map(|(_, _, typ)| Attribute { - name: "attr".to_string(), - typ: typ.unwrap_or(ConstantType::Int64), - nullable: false, - }) - .collect(), - ), - GroupAttrRefs::new( - attr_ids - .into_iter() - .map(|(table_id, attr_base_index, _)| { - AttrRef::new_base_table_attr_ref(table_id, attr_base_index) - }) - .collect(), - None, - ), - ); - MockMemoExtImpl::from(HashMap::from([(TEST_GROUP1_ID, group_info)])) - } - - /// Create a cost model two tables, each with one attribute. Each attribute has 100 values. - pub fn create_two_table_mock_cost_model( - tbl1_per_attr_stats: TestPerAttributeStats, - tbl2_per_attr_stats: TestPerAttributeStats, - additional_memo: Option>, - ) -> TestOptCostModelMock { - create_two_table_mock_cost_model_custom_row_cnts( - tbl1_per_attr_stats, - tbl2_per_attr_stats, - 100, - 100, - additional_memo, - ) - } - - /// Create a cost model three tables, each with one attribute. Each attribute has 100 values. - pub fn create_three_table_mock_cost_model( - tbl1_per_column_stats: TestPerAttributeStats, - tbl2_per_column_stats: TestPerAttributeStats, - tbl3_per_column_stats: TestPerAttributeStats, - ) -> TestOptCostModelMock { - let storage_manager = CostModelStorageMockManagerImpl::new( - vec![ - ( - TEST_TABLE1_ID, - TableStats::new( - 100, - vec![(vec![0], tbl1_per_column_stats)].into_iter().collect(), - ), - ), - ( - TEST_TABLE2_ID, - TableStats::new( - 100, - vec![(vec![0], tbl2_per_column_stats)].into_iter().collect(), - ), - ), - ( - TEST_TABLE3_ID, - TableStats::new( - 100, - vec![(vec![0], tbl3_per_column_stats)].into_iter().collect(), - ), - ), - ] - .into_iter() - .collect(), - ); - let memo = HashMap::from([ - ( - TEST_GROUP1_ID, - MemoGroupInfo::new( - vec![Attribute::new_non_null_int64(TEST_ATTR1_NAME.to_string())].into(), - GroupAttrRefs::new( - vec![AttrRef::new_base_table_attr_ref(TEST_TABLE1_ID, 0)], - None, - ), - ), - ), - ( - TEST_GROUP2_ID, - MemoGroupInfo::new( - vec![Attribute::new_non_null_int64(TEST_ATTR2_NAME.to_string())].into(), - GroupAttrRefs::new( - vec![AttrRef::new_base_table_attr_ref(TEST_TABLE2_ID, 0)], - None, - ), - ), - ), - ( - TEST_GROUP3_ID, - MemoGroupInfo::new( - vec![Attribute::new_non_null_int64(TEST_ATTR3_NAME.to_string())].into(), - GroupAttrRefs::new( - vec![AttrRef::new_base_table_attr_ref(TEST_TABLE3_ID, 0)], - None, - ), - ), - ), - ]); - CostModelImpl::new( - storage_manager, - CatalogSource::Mock, - Arc::new(MockMemoExtImpl::from(memo)), - ) - } - - /// Create a cost model four tables, each with one attribute. Each attribute has 100 values. - pub fn create_four_table_mock_cost_model( - tbl1_per_column_stats: TestPerAttributeStats, - tbl2_per_column_stats: TestPerAttributeStats, - tbl3_per_column_stats: TestPerAttributeStats, - tbl4_per_column_stats: TestPerAttributeStats, - ) -> TestOptCostModelMock { - let storage_manager = CostModelStorageMockManagerImpl::new( - vec![ - ( - TEST_TABLE1_ID, - TableStats::new( - 100, - vec![(vec![0], tbl1_per_column_stats)].into_iter().collect(), - ), - ), - ( - TEST_TABLE2_ID, - TableStats::new( - 100, - vec![(vec![0], tbl2_per_column_stats)].into_iter().collect(), - ), - ), - ( - TEST_TABLE3_ID, - TableStats::new( - 100, - vec![(vec![0], tbl3_per_column_stats)].into_iter().collect(), - ), - ), - ( - TEST_TABLE4_ID, - TableStats::new( - 100, - vec![(vec![0], tbl4_per_column_stats)].into_iter().collect(), - ), - ), - ] - .into_iter() - .collect(), - ); - let memo = HashMap::from([ - ( - TEST_GROUP1_ID, - MemoGroupInfo::new( - vec![Attribute::new_non_null_int64(TEST_ATTR1_NAME.to_string())].into(), - GroupAttrRefs::new( - vec![AttrRef::new_base_table_attr_ref(TEST_TABLE1_ID, 0)], - None, - ), - ), - ), - ( - TEST_GROUP2_ID, - MemoGroupInfo::new( - vec![Attribute::new_non_null_int64(TEST_ATTR2_NAME.to_string())].into(), - GroupAttrRefs::new( - vec![AttrRef::new_base_table_attr_ref(TEST_TABLE2_ID, 0)], - None, - ), - ), - ), - ( - TEST_GROUP3_ID, - MemoGroupInfo::new( - vec![Attribute::new_non_null_int64(TEST_ATTR3_NAME.to_string())].into(), - GroupAttrRefs::new( - vec![AttrRef::new_base_table_attr_ref(TEST_TABLE3_ID, 0)], - None, - ), - ), - ), - ( - TEST_GROUP4_ID, - MemoGroupInfo::new( - vec![Attribute::new_non_null_int64(TEST_ATTR4_NAME.to_string())].into(), - GroupAttrRefs::new( - vec![AttrRef::new_base_table_attr_ref(TEST_TABLE4_ID, 0)], - None, - ), - ), - ), - ]); - CostModelImpl::new( - storage_manager, - CatalogSource::Mock, - Arc::new(MockMemoExtImpl::from(memo)), - ) - } - - /// We need custom row counts because some join algorithms rely on the row cnt - pub fn create_two_table_mock_cost_model_custom_row_cnts( - tbl1_per_column_stats: TestPerAttributeStats, - tbl2_per_column_stats: TestPerAttributeStats, - tbl1_row_cnt: u64, - tbl2_row_cnt: u64, - additional_memo: Option>, - ) -> TestOptCostModelMock { - let storage_manager = CostModelStorageMockManagerImpl::new( - vec![ - ( - TEST_TABLE1_ID, - TableStats::new( - tbl1_row_cnt, - vec![(vec![0], tbl1_per_column_stats)].into_iter().collect(), - ), - ), - ( - TEST_TABLE2_ID, - TableStats::new( - tbl2_row_cnt, - vec![(vec![0], tbl2_per_column_stats)].into_iter().collect(), - ), - ), - ] - .into_iter() - .collect(), - ); - let mut memo = HashMap::from([ - ( - TEST_GROUP1_ID, - MemoGroupInfo::new( - vec![Attribute::new_non_null_int64(TEST_ATTR1_NAME.to_string())].into(), - GroupAttrRefs::new( - vec![AttrRef::new_base_table_attr_ref(TEST_TABLE1_ID, 0)], - None, - ), - ), - ), - ( - TEST_GROUP2_ID, - MemoGroupInfo::new( - vec![Attribute::new_non_null_int64(TEST_ATTR2_NAME.to_string())].into(), - GroupAttrRefs::new( - vec![AttrRef::new_base_table_attr_ref(TEST_TABLE2_ID, 0)], - None, - ), - ), - ), - ]); - if let Some(additional_memo) = additional_memo { - memo.extend(additional_memo); - } - CostModelImpl::new( - storage_manager, - CatalogSource::Mock, - Arc::new(MockMemoExtImpl::from(memo)), - ) - } - - impl TestOptCostModelMock { - pub fn get_row_count(&self, table_id: TableId) -> u64 { - self.storage_manager - .per_table_stats_map - .get(&table_id) - .map(|stats| stats.row_cnt) - .unwrap_or(0) - } - - pub fn get_attr_refs(&self, group_id: GroupId) -> GroupAttrRefs { - self.memo.get_attribute_refs(group_id) - } - } - - pub fn attr_index(attr_index: u64) -> ArcPredicateNode { - AttrIndexPred::new(attr_index).into_pred_node() - } - - pub fn cnst(value: Value) -> ArcPredicateNode { - ConstantPred::new(value).into_pred_node() - } - - pub fn cast(child: ArcPredicateNode, cast_type: DataType) -> ArcPredicateNode { - CastPred::new(child, cast_type).into_pred_node() - } - - pub fn bin_op( - op_type: BinOpType, - left: ArcPredicateNode, - right: ArcPredicateNode, - ) -> ArcPredicateNode { - BinOpPred::new(left, right, op_type).into_pred_node() - } - - pub fn log_op(op_type: LogOpType, children: Vec) -> ArcPredicateNode { - LogOpPred::new(op_type, children).into_pred_node() - } - - pub fn un_op(op_type: UnOpType, child: ArcPredicateNode) -> ArcPredicateNode { - UnOpPred::new(child, op_type).into_pred_node() - } - - pub fn empty_list() -> ArcPredicateNode { - ListPred::new(vec![]).into_pred_node() - } - - pub fn list(children: Vec) -> ArcPredicateNode { - ListPred::new(children).into_pred_node() - } - - pub fn in_list(attr_idx: u64, list: Vec, negated: bool) -> InListPred { - InListPred::new( - attr_index(attr_idx), - ListPred::new(list.into_iter().map(cnst).collect_vec()), - negated, - ) - } - - pub fn like(attr_idx: u64, pattern: &str, negated: bool) -> LikePred { - LikePred::new( - negated, - false, - attr_index(attr_idx), - cnst(Value::String(pattern.into())), - ) - } - - pub(crate) fn empty_per_attr_stats() -> TestPerAttributeStats { - TestPerAttributeStats::new( - MostCommonValues::empty(), - Some(Distribution::empty()), - 0, - 0.0, - ) - } - - pub(crate) fn per_attr_stats_with_ndistinct(ndistinct: u64) -> TestPerAttributeStats { - TestPerAttributeStats::new( - MostCommonValues::empty(), - Some(Distribution::empty()), - ndistinct, - 0.0, - ) - } - - pub(crate) fn per_attr_stats_with_dist_and_ndistinct( - dist: Vec<(Value, f64)>, - ndistinct: u64, - ) -> TestPerAttributeStats { - TestPerAttributeStats::new( - MostCommonValues::empty(), - Some(Distribution::SimpleDistribution(SimpleMap::new(dist))), - ndistinct, - 0.0, - ) - } -} diff --git a/optd-cost-model/src/lib.rs b/optd-cost-model/src/lib.rs index 0b19737..68b56ac 100644 --- a/optd-cost-model/src/lib.rs +++ b/optd-cost-model/src/lib.rs @@ -13,6 +13,7 @@ pub mod cost_model; pub mod memo_ext; pub mod stats; pub mod storage; +pub mod test_utils; pub mod utils; pub enum StatValue { diff --git a/optd-cost-model/src/memo_ext.rs b/optd-cost-model/src/memo_ext.rs index c7827c5..78d4225 100644 --- a/optd-cost-model/src/memo_ext.rs +++ b/optd-cost-model/src/memo_ext.rs @@ -26,69 +26,3 @@ pub trait MemoExt: Send + Sync + 'static { // TODO: Figure out what other information is needed to compute the cost... } - -#[cfg(test)] -pub mod tests { - use std::collections::HashMap; - - use crate::common::{ - properties::{ - attr_ref::{AttrRef, GroupAttrRefs}, - schema::Schema, - Attribute, - }, - types::GroupId, - }; - - pub struct MemoGroupInfo { - pub schema: Schema, - pub attr_refs: GroupAttrRefs, - } - - impl MemoGroupInfo { - pub fn new(schema: Schema, attr_refs: GroupAttrRefs) -> Self { - Self { schema, attr_refs } - } - } - - #[derive(Default)] - pub struct MockMemoExtImpl { - memo: HashMap, - } - - impl MockMemoExtImpl { - pub fn add_group_info( - &mut self, - group_id: GroupId, - schema: Schema, - attr_ref: GroupAttrRefs, - ) { - self.memo - .insert(group_id, MemoGroupInfo::new(schema, attr_ref)); - } - } - - impl super::MemoExt for MockMemoExtImpl { - fn get_schema(&self, group_id: GroupId) -> Schema { - self.memo.get(&group_id).unwrap().schema.clone() - } - - fn get_attribute_info(&self, group_id: GroupId, attr_ref_idx: u64) -> Attribute { - self.memo.get(&group_id).unwrap().schema.attributes[attr_ref_idx as usize].clone() - } - - fn get_attribute_refs(&self, group_id: GroupId) -> GroupAttrRefs { - self.memo.get(&group_id).unwrap().attr_refs.clone() - } - - fn get_attribute_ref(&self, group_id: GroupId, attr_ref_idx: u64) -> AttrRef { - self.memo.get(&group_id).unwrap().attr_refs.attr_refs()[attr_ref_idx as usize].clone() - } - } - - impl From> for MockMemoExtImpl { - fn from(memo: HashMap) -> Self { - Self { memo } - } - } -} diff --git a/optd-cost-model/src/storage/persistent.rs b/optd-cost-model/src/storage/persistent.rs index e9459f5..2238507 100644 --- a/optd-cost-model/src/storage/persistent.rs +++ b/optd-cost-model/src/storage/persistent.rs @@ -121,7 +121,7 @@ impl CostModelStorageManager .backend_manager .get_stats_for_table(table_id.into(), StatType::TableRowCount, None) .await? - .map(|json| serde_json::from_value(json)) + .map(serde_json::from_value) .transpose()?) } diff --git a/optd-cost-model/src/test_utils.rs b/optd-cost-model/src/test_utils.rs new file mode 100644 index 0000000..60db90f --- /dev/null +++ b/optd-cost-model/src/test_utils.rs @@ -0,0 +1,565 @@ +/// I thought about using the system's own parser and planner to generate these expression trees, +/// but this is not currently feasible because it would create a cyclic dependency between +/// optd-datafusion-bridge and optd-datafusion-repr +#[cfg(test)] +pub mod tests { + use itertools::Itertools; + use std::{collections::HashMap, sync::Arc}; + + use arrow_schema::DataType; + use optd_persistent::cost_model::interface::CatalogSource; + + use crate::{ + common::{ + nodes::{ArcPredicateNode, ReprPredicateNode}, + predicates::{ + attr_index_pred::AttrIndexPred, + bin_op_pred::{BinOpPred, BinOpType}, + cast_pred::CastPred, + constant_pred::{ConstantPred, ConstantType}, + in_list_pred::InListPred, + like_pred::LikePred, + list_pred::ListPred, + log_op_pred::{LogOpPred, LogOpType}, + un_op_pred::{UnOpPred, UnOpType}, + }, + properties::{ + attr_ref::{AttrRef, GroupAttrRefs}, + schema::Schema, + Attribute, + }, + types::{GroupId, TableId}, + values::Value, + }, + cost_model::CostModelImpl, + memo_ext::MemoExt, + stats::{ + utilities::simple_map::SimpleMap, AttributeCombValueStats, Distribution, + MostCommonValues, + }, + storage::mock::{CostModelStorageMockManagerImpl, TableStats}, + }; + + pub struct MemoGroupInfo { + pub schema: Schema, + pub attr_refs: GroupAttrRefs, + } + + impl MemoGroupInfo { + pub fn new(schema: Schema, attr_refs: GroupAttrRefs) -> Self { + Self { schema, attr_refs } + } + } + + #[derive(Default)] + pub struct MockMemoExtImpl { + memo: HashMap, + } + + impl MockMemoExtImpl { + pub fn add_group_info( + &mut self, + group_id: GroupId, + schema: Schema, + attr_ref: GroupAttrRefs, + ) { + self.memo + .insert(group_id, MemoGroupInfo::new(schema, attr_ref)); + } + } + + impl MemoExt for MockMemoExtImpl { + fn get_schema(&self, group_id: GroupId) -> Schema { + self.memo.get(&group_id).unwrap().schema.clone() + } + + fn get_attribute_info(&self, group_id: GroupId, attr_ref_idx: u64) -> Attribute { + self.memo.get(&group_id).unwrap().schema.attributes[attr_ref_idx as usize].clone() + } + + fn get_attribute_refs(&self, group_id: GroupId) -> GroupAttrRefs { + self.memo.get(&group_id).unwrap().attr_refs.clone() + } + + fn get_attribute_ref(&self, group_id: GroupId, attr_ref_idx: u64) -> AttrRef { + self.memo.get(&group_id).unwrap().attr_refs.attr_refs()[attr_ref_idx as usize].clone() + } + } + + impl From> for MockMemoExtImpl { + fn from(memo: HashMap) -> Self { + Self { memo } + } + } + + pub const TEST_TABLE1_ID: TableId = TableId(0); + pub const TEST_TABLE2_ID: TableId = TableId(1); + pub const TEST_TABLE3_ID: TableId = TableId(2); + pub const TEST_TABLE4_ID: TableId = TableId(3); + + pub const TEST_GROUP1_ID: GroupId = GroupId(0); + pub const TEST_GROUP2_ID: GroupId = GroupId(1); + pub const TEST_GROUP3_ID: GroupId = GroupId(2); + pub const TEST_GROUP4_ID: GroupId = GroupId(3); + + // This is base index rather than ref index. + pub const TEST_ATTR1_BASE_INDEX: u64 = 0; + pub const TEST_ATTR2_BASE_INDEX: u64 = 1; + pub const TEST_ATTR3_BASE_INDEX: u64 = 2; + + pub const TEST_ATTR1_NAME: &str = "attr1"; + pub const TEST_ATTR2_NAME: &str = "attr2"; + pub const TEST_ATTR3_NAME: &str = "attr3"; + pub const TEST_ATTR4_NAME: &str = "attr4"; + + pub type TestPerAttributeStats = AttributeCombValueStats; + // TODO: add tests for non-mock storage manager + pub type TestOptCostModelMock = CostModelImpl; + + // Use this method, we only create one group `TEST_GROUP1_ID` in the memo. + // We put the first attribute in the first table as the ref index 0 in the group. + // And put the second attribute in the first table as the ref index 1 in the group. + // etc. + // The orders of attributes and tables are defined by the order of their ids (smaller first). + pub fn create_mock_cost_model( + table_id: Vec, + // u64 should be base attribute index. + per_attribute_stats: Vec>, + row_counts: Vec>, + ) -> TestOptCostModelMock { + let attr_ids: Vec<(TableId, u64, Option)> = per_attribute_stats + .iter() + .enumerate() + .map(|(idx, m)| (table_id[idx], m)) + .flat_map(|(table_id, m)| { + m.iter() + .map(|(attr_idx, _)| (table_id, *attr_idx, None)) + .collect_vec() + }) + .sorted_by_key(|(table_id, attr_idx, _)| (*table_id, *attr_idx)) + .collect(); + create_mock_cost_model_with_memo( + table_id.clone(), + per_attribute_stats, + row_counts, + create_one_group_all_base_attributes_mock_memo(attr_ids), + ) + } + + pub fn create_mock_cost_model_with_attr_types( + table_id: Vec, + // u64 should be base attribute index. + per_attribute_stats: Vec>, + attributes: Vec>, + row_counts: Vec>, + ) -> TestOptCostModelMock { + let attr_ids: Vec<(TableId, u64, Option)> = attributes + .iter() + .enumerate() + .map(|(idx, m)| (table_id[idx], m)) + .flat_map(|(table_id, m)| { + m.iter() + .map(|(attr_idx, typ)| (table_id, *attr_idx, Some(*typ))) + .collect_vec() + }) + .sorted_by_key(|(table_id, attr_idx, _)| (*table_id, *attr_idx)) + .collect(); + create_mock_cost_model_with_memo( + table_id.clone(), + per_attribute_stats, + row_counts, + create_one_group_all_base_attributes_mock_memo(attr_ids), + ) + } + + pub fn create_mock_cost_model_with_memo( + table_id: Vec, + per_attribute_stats: Vec>, + row_counts: Vec>, + memo: MockMemoExtImpl, + ) -> TestOptCostModelMock { + let storage_manager = CostModelStorageMockManagerImpl::new( + table_id + .into_iter() + .zip(per_attribute_stats) + .zip(row_counts) + .map(|((table_id, per_attr_stats), row_count)| { + ( + table_id, + TableStats::new( + row_count.unwrap_or(100), + per_attr_stats + .into_iter() + .map(|(attr_idx, stats)| (vec![attr_idx], stats)) + .collect(), + ), + ) + }) + .collect(), + ); + CostModelImpl::new(storage_manager, CatalogSource::Mock, Arc::new(memo)) + } + + // attributes: Vec<(TableId, AttrBaseIndex)> + pub fn create_one_group_all_base_attributes_mock_memo( + attr_ids: Vec<(TableId, u64, Option)>, + ) -> MockMemoExtImpl { + let group_info = MemoGroupInfo::new( + Schema::new( + attr_ids + .clone() + .into_iter() + .map(|(_, _, typ)| Attribute { + name: "attr".to_string(), + typ: typ.unwrap_or(ConstantType::Int64), + nullable: false, + }) + .collect(), + ), + GroupAttrRefs::new( + attr_ids + .into_iter() + .map(|(table_id, attr_base_index, _)| { + AttrRef::new_base_table_attr_ref(table_id, attr_base_index) + }) + .collect(), + None, + ), + ); + MockMemoExtImpl::from(HashMap::from([(TEST_GROUP1_ID, group_info)])) + } + + /// Create a cost model two tables, each with one attribute. Each attribute has 100 values. + pub fn create_two_table_mock_cost_model( + tbl1_per_attr_stats: TestPerAttributeStats, + tbl2_per_attr_stats: TestPerAttributeStats, + additional_memo: Option>, + ) -> TestOptCostModelMock { + create_two_table_mock_cost_model_custom_row_cnts( + tbl1_per_attr_stats, + tbl2_per_attr_stats, + 100, + 100, + additional_memo, + ) + } + + /// Create a cost model three tables, each with one attribute. Each attribute has 100 values. + pub fn create_three_table_mock_cost_model( + tbl1_per_column_stats: TestPerAttributeStats, + tbl2_per_column_stats: TestPerAttributeStats, + tbl3_per_column_stats: TestPerAttributeStats, + ) -> TestOptCostModelMock { + let storage_manager = CostModelStorageMockManagerImpl::new( + vec![ + ( + TEST_TABLE1_ID, + TableStats::new( + 100, + vec![(vec![0], tbl1_per_column_stats)].into_iter().collect(), + ), + ), + ( + TEST_TABLE2_ID, + TableStats::new( + 100, + vec![(vec![0], tbl2_per_column_stats)].into_iter().collect(), + ), + ), + ( + TEST_TABLE3_ID, + TableStats::new( + 100, + vec![(vec![0], tbl3_per_column_stats)].into_iter().collect(), + ), + ), + ] + .into_iter() + .collect(), + ); + let memo = HashMap::from([ + ( + TEST_GROUP1_ID, + MemoGroupInfo::new( + vec![Attribute::new_non_null_int64(TEST_ATTR1_NAME.to_string())].into(), + GroupAttrRefs::new( + vec![AttrRef::new_base_table_attr_ref(TEST_TABLE1_ID, 0)], + None, + ), + ), + ), + ( + TEST_GROUP2_ID, + MemoGroupInfo::new( + vec![Attribute::new_non_null_int64(TEST_ATTR2_NAME.to_string())].into(), + GroupAttrRefs::new( + vec![AttrRef::new_base_table_attr_ref(TEST_TABLE2_ID, 0)], + None, + ), + ), + ), + ( + TEST_GROUP3_ID, + MemoGroupInfo::new( + vec![Attribute::new_non_null_int64(TEST_ATTR3_NAME.to_string())].into(), + GroupAttrRefs::new( + vec![AttrRef::new_base_table_attr_ref(TEST_TABLE3_ID, 0)], + None, + ), + ), + ), + ]); + CostModelImpl::new( + storage_manager, + CatalogSource::Mock, + Arc::new(MockMemoExtImpl::from(memo)), + ) + } + + /// Create a cost model four tables, each with one attribute. Each attribute has 100 values. + pub fn create_four_table_mock_cost_model( + tbl1_per_column_stats: TestPerAttributeStats, + tbl2_per_column_stats: TestPerAttributeStats, + tbl3_per_column_stats: TestPerAttributeStats, + tbl4_per_column_stats: TestPerAttributeStats, + ) -> TestOptCostModelMock { + let storage_manager = CostModelStorageMockManagerImpl::new( + vec![ + ( + TEST_TABLE1_ID, + TableStats::new( + 100, + vec![(vec![0], tbl1_per_column_stats)].into_iter().collect(), + ), + ), + ( + TEST_TABLE2_ID, + TableStats::new( + 100, + vec![(vec![0], tbl2_per_column_stats)].into_iter().collect(), + ), + ), + ( + TEST_TABLE3_ID, + TableStats::new( + 100, + vec![(vec![0], tbl3_per_column_stats)].into_iter().collect(), + ), + ), + ( + TEST_TABLE4_ID, + TableStats::new( + 100, + vec![(vec![0], tbl4_per_column_stats)].into_iter().collect(), + ), + ), + ] + .into_iter() + .collect(), + ); + let memo = HashMap::from([ + ( + TEST_GROUP1_ID, + MemoGroupInfo::new( + vec![Attribute::new_non_null_int64(TEST_ATTR1_NAME.to_string())].into(), + GroupAttrRefs::new( + vec![AttrRef::new_base_table_attr_ref(TEST_TABLE1_ID, 0)], + None, + ), + ), + ), + ( + TEST_GROUP2_ID, + MemoGroupInfo::new( + vec![Attribute::new_non_null_int64(TEST_ATTR2_NAME.to_string())].into(), + GroupAttrRefs::new( + vec![AttrRef::new_base_table_attr_ref(TEST_TABLE2_ID, 0)], + None, + ), + ), + ), + ( + TEST_GROUP3_ID, + MemoGroupInfo::new( + vec![Attribute::new_non_null_int64(TEST_ATTR3_NAME.to_string())].into(), + GroupAttrRefs::new( + vec![AttrRef::new_base_table_attr_ref(TEST_TABLE3_ID, 0)], + None, + ), + ), + ), + ( + TEST_GROUP4_ID, + MemoGroupInfo::new( + vec![Attribute::new_non_null_int64(TEST_ATTR4_NAME.to_string())].into(), + GroupAttrRefs::new( + vec![AttrRef::new_base_table_attr_ref(TEST_TABLE4_ID, 0)], + None, + ), + ), + ), + ]); + CostModelImpl::new( + storage_manager, + CatalogSource::Mock, + Arc::new(MockMemoExtImpl::from(memo)), + ) + } + + /// We need custom row counts because some join algorithms rely on the row cnt + pub fn create_two_table_mock_cost_model_custom_row_cnts( + tbl1_per_column_stats: TestPerAttributeStats, + tbl2_per_column_stats: TestPerAttributeStats, + tbl1_row_cnt: u64, + tbl2_row_cnt: u64, + additional_memo: Option>, + ) -> TestOptCostModelMock { + let storage_manager = CostModelStorageMockManagerImpl::new( + vec![ + ( + TEST_TABLE1_ID, + TableStats::new( + tbl1_row_cnt, + vec![(vec![0], tbl1_per_column_stats)].into_iter().collect(), + ), + ), + ( + TEST_TABLE2_ID, + TableStats::new( + tbl2_row_cnt, + vec![(vec![0], tbl2_per_column_stats)].into_iter().collect(), + ), + ), + ] + .into_iter() + .collect(), + ); + let mut memo = HashMap::from([ + ( + TEST_GROUP1_ID, + MemoGroupInfo::new( + vec![Attribute::new_non_null_int64(TEST_ATTR1_NAME.to_string())].into(), + GroupAttrRefs::new( + vec![AttrRef::new_base_table_attr_ref(TEST_TABLE1_ID, 0)], + None, + ), + ), + ), + ( + TEST_GROUP2_ID, + MemoGroupInfo::new( + vec![Attribute::new_non_null_int64(TEST_ATTR2_NAME.to_string())].into(), + GroupAttrRefs::new( + vec![AttrRef::new_base_table_attr_ref(TEST_TABLE2_ID, 0)], + None, + ), + ), + ), + ]); + if let Some(additional_memo) = additional_memo { + memo.extend(additional_memo); + } + CostModelImpl::new( + storage_manager, + CatalogSource::Mock, + Arc::new(MockMemoExtImpl::from(memo)), + ) + } + + impl TestOptCostModelMock { + pub fn get_row_count(&self, table_id: TableId) -> u64 { + self.storage_manager + .per_table_stats_map + .get(&table_id) + .map(|stats| stats.row_cnt) + .unwrap_or(0) + } + + pub fn get_attr_refs(&self, group_id: GroupId) -> GroupAttrRefs { + self.memo.get_attribute_refs(group_id) + } + } + + pub fn attr_index(attr_index: u64) -> ArcPredicateNode { + AttrIndexPred::new(attr_index).into_pred_node() + } + + pub fn cnst(value: Value) -> ArcPredicateNode { + ConstantPred::new(value).into_pred_node() + } + + pub fn cast(child: ArcPredicateNode, cast_type: DataType) -> ArcPredicateNode { + CastPred::new(child, cast_type).into_pred_node() + } + + pub fn bin_op( + op_type: BinOpType, + left: ArcPredicateNode, + right: ArcPredicateNode, + ) -> ArcPredicateNode { + BinOpPred::new(left, right, op_type).into_pred_node() + } + + pub fn log_op(op_type: LogOpType, children: Vec) -> ArcPredicateNode { + LogOpPred::new(op_type, children).into_pred_node() + } + + pub fn un_op(op_type: UnOpType, child: ArcPredicateNode) -> ArcPredicateNode { + UnOpPred::new(child, op_type).into_pred_node() + } + + pub fn empty_list() -> ArcPredicateNode { + ListPred::new(vec![]).into_pred_node() + } + + pub fn list(children: Vec) -> ArcPredicateNode { + ListPred::new(children).into_pred_node() + } + + pub fn in_list(attr_idx: u64, list: Vec, negated: bool) -> InListPred { + InListPred::new( + attr_index(attr_idx), + ListPred::new(list.into_iter().map(cnst).collect_vec()), + negated, + ) + } + + pub fn like(attr_idx: u64, pattern: &str, negated: bool) -> LikePred { + LikePred::new( + negated, + false, + attr_index(attr_idx), + cnst(Value::String(pattern.into())), + ) + } + + pub(crate) fn empty_per_attr_stats() -> TestPerAttributeStats { + TestPerAttributeStats::new( + MostCommonValues::empty(), + Some(Distribution::empty()), + 0, + 0.0, + ) + } + + pub(crate) fn per_attr_stats_with_ndistinct(ndistinct: u64) -> TestPerAttributeStats { + TestPerAttributeStats::new( + MostCommonValues::empty(), + Some(Distribution::empty()), + ndistinct, + 0.0, + ) + } + + pub(crate) fn per_attr_stats_with_dist_and_ndistinct( + dist: Vec<(Value, f64)>, + ndistinct: u64, + ) -> TestPerAttributeStats { + TestPerAttributeStats::new( + MostCommonValues::empty(), + Some(Distribution::SimpleDistribution(SimpleMap::new(dist))), + ndistinct, + 0.0, + ) + } +}