Skip to content

Commit efd01f8

Browse files
lanlou1554 and xx01cyx authored
feat(cost-model): Basic cost model computation (#40)
* add basic cost computation --------- Co-authored-by: Yuanxin Cao <[email protected]>
1 parent db9dbbe commit efd01f8

File tree

22 files changed

+2431
-102
lines changed

22 files changed

+2431
-102
lines changed

Cargo.lock

Lines changed: 705 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

optd-cost-model/Cargo.lock

Lines changed: 656 additions & 19 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

optd-cost-model/Cargo.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
name = "optd-cost-model"
33
version = "0.1.0"
44
edition = "2021"
5+
authors = ["Yuanxin Cao", "Lan Lou", "Kunle Li"]
56

67
[dependencies]
78
optd-persistent = { path = "../optd-persistent", version = "0.1" }
@@ -10,10 +11,15 @@ serde_json = "1.0"
1011
serde_with = { version = "3.7.0", features = ["json"] }
1112
arrow-schema = "53.2.0"
1213
datafusion-expr = "32.0.0"
14+
datafusion = "32.0.0"
1315
ordered-float = "4.0"
1416
chrono = "0.4"
1517
itertools = "0.13"
18+
assert_approx_eq = "1.1.0"
19+
trait-variant = "0.1.2"
20+
tokio = { version = "1.0.1", features = ["macros", "rt-multi-thread"] }
1621

1722
[dev-dependencies]
1823
crossbeam = "0.8"
1924
rand = "0.8"
25+
test-case = "3.3"

optd-cost-model/src/cost/agg.rs

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,203 @@
1+
use crate::{
2+
common::{
3+
nodes::{ArcPredicateNode, PredicateType, ReprPredicateNode},
4+
predicates::{attr_index_pred::AttrIndexPred, list_pred::ListPred},
5+
properties::attr_ref::{AttrRef, BaseTableAttrRef},
6+
types::GroupId,
7+
},
8+
cost_model::CostModelImpl,
9+
stats::DEFAULT_NUM_DISTINCT,
10+
storage::CostModelStorageManager,
11+
CostModelError, CostModelResult, EstimatedStatistic, SemanticError,
12+
};
113

14+
impl<S: CostModelStorageManager> CostModelImpl<S> {
15+
pub async fn get_agg_row_cnt(
16+
&self,
17+
group_id: GroupId,
18+
group_by: ArcPredicateNode,
19+
) -> CostModelResult<EstimatedStatistic> {
20+
let group_by = ListPred::from_pred_node(group_by).unwrap();
21+
if group_by.is_empty() {
22+
Ok(EstimatedStatistic(1.0))
23+
} else {
24+
// Multiply the n-distinct of all the group by columns.
25+
// TODO: improve with multi-dimensional n-distinct
26+
let mut row_cnt = 1;
27+
28+
for node in &group_by.0.children {
29+
match node.typ {
30+
PredicateType::AttrIndex => {
31+
let attr_ref =
32+
AttrIndexPred::from_pred_node(node.clone()).ok_or_else(|| {
33+
SemanticError::InvalidPredicate(
34+
"Expected AttributeRef predicate".to_string(),
35+
)
36+
})?;
37+
if let AttrRef::BaseTableAttrRef(BaseTableAttrRef { table_id, attr_idx }) =
38+
self.memo.get_attribute_ref(group_id, attr_ref.attr_index())
39+
{
40+
// TODO: Only query ndistinct instead of all kinds of stats.
41+
let stats_option =
42+
self.get_attribute_comb_stats(table_id, &[attr_idx]).await?;
43+
44+
let ndistinct = match stats_option {
45+
Some(stats) => stats.ndistinct,
46+
None => {
47+
// The column type is not supported or stats are missing.
48+
DEFAULT_NUM_DISTINCT
49+
}
50+
};
51+
row_cnt *= ndistinct;
52+
} else {
53+
// TOOD: Handle derived attributes.
54+
row_cnt *= DEFAULT_NUM_DISTINCT;
55+
}
56+
}
57+
_ => {
58+
// TODO: Consider the case where `GROUP BY 1`.
59+
panic!("GROUP BY must have attribute ref predicate");
60+
}
61+
}
62+
}
63+
Ok(EstimatedStatistic(row_cnt as f64))
64+
}
65+
}
66+
}
67+
68+
#[cfg(test)]
69+
mod tests {
70+
use std::{collections::HashMap, ops::Deref};
71+
72+
use crate::{
73+
common::{
74+
predicates::constant_pred::ConstantType,
75+
properties::Attribute,
76+
types::{GroupId, TableId},
77+
values::Value,
78+
},
79+
cost_model::tests::{
80+
attr_index, cnst, create_mock_cost_model, create_mock_cost_model_with_attr_types,
81+
empty_list, empty_per_attr_stats, list, TestPerAttributeStats, TEST_ATTR1_BASE_INDEX,
82+
TEST_ATTR2_BASE_INDEX, TEST_ATTR3_BASE_INDEX, TEST_GROUP1_ID, TEST_TABLE1_ID,
83+
},
84+
stats::{utilities::simple_map::SimpleMap, MostCommonValues, DEFAULT_NUM_DISTINCT},
85+
EstimatedStatistic,
86+
};
87+
88+
#[tokio::test]
89+
async fn test_agg_no_stats() {
90+
let cost_model = create_mock_cost_model_with_attr_types(
91+
vec![TEST_TABLE1_ID],
92+
vec![],
93+
vec![HashMap::from([
94+
(TEST_ATTR1_BASE_INDEX, ConstantType::Int32),
95+
(TEST_ATTR2_BASE_INDEX, ConstantType::Int32),
96+
])],
97+
vec![None],
98+
);
99+
100+
// Group by empty list should return 1.
101+
let group_bys = empty_list();
102+
assert_eq!(
103+
cost_model
104+
.get_agg_row_cnt(TEST_GROUP1_ID, group_bys)
105+
.await
106+
.unwrap(),
107+
EstimatedStatistic(1.0)
108+
);
109+
110+
// Group by single column should return the default value since there are no stats.
111+
let group_bys = list(vec![attr_index(0)]);
112+
assert_eq!(
113+
cost_model
114+
.get_agg_row_cnt(TEST_GROUP1_ID, group_bys)
115+
.await
116+
.unwrap(),
117+
EstimatedStatistic(DEFAULT_NUM_DISTINCT as f64)
118+
);
119+
120+
// Group by two columns should return the default value squared since there are no stats.
121+
let group_bys = list(vec![attr_index(0), attr_index(1)]);
122+
assert_eq!(
123+
cost_model
124+
.get_agg_row_cnt(TEST_GROUP1_ID, group_bys)
125+
.await
126+
.unwrap(),
127+
EstimatedStatistic((DEFAULT_NUM_DISTINCT * DEFAULT_NUM_DISTINCT) as f64)
128+
);
129+
}
130+
131+
#[tokio::test]
132+
async fn test_agg_with_stats() {
133+
let attr1_ndistinct = 12;
134+
let attr2_ndistinct = 645;
135+
let attr1_stats = TestPerAttributeStats::new(
136+
MostCommonValues::SimpleFrequency(SimpleMap::default()),
137+
None,
138+
attr1_ndistinct,
139+
0.0,
140+
);
141+
let attr2_stats = TestPerAttributeStats::new(
142+
MostCommonValues::SimpleFrequency(SimpleMap::default()),
143+
None,
144+
attr2_ndistinct,
145+
0.0,
146+
);
147+
148+
let cost_model = create_mock_cost_model_with_attr_types(
149+
vec![TEST_TABLE1_ID],
150+
vec![HashMap::from([
151+
(TEST_ATTR1_BASE_INDEX, attr1_stats),
152+
(TEST_ATTR2_BASE_INDEX, attr2_stats),
153+
])],
154+
vec![HashMap::from([
155+
(TEST_ATTR1_BASE_INDEX, ConstantType::Int32),
156+
(TEST_ATTR2_BASE_INDEX, ConstantType::Int32),
157+
(TEST_ATTR3_BASE_INDEX, ConstantType::Int32),
158+
])],
159+
vec![None],
160+
);
161+
162+
// Group by empty list should return 1.
163+
let group_bys = empty_list();
164+
assert_eq!(
165+
cost_model
166+
.get_agg_row_cnt(TEST_GROUP1_ID, group_bys)
167+
.await
168+
.unwrap(),
169+
EstimatedStatistic(1.0)
170+
);
171+
172+
// Group by single column should return the n-distinct of the column.
173+
let group_bys = list(vec![attr_index(0)]);
174+
assert_eq!(
175+
cost_model
176+
.get_agg_row_cnt(TEST_GROUP1_ID, group_bys)
177+
.await
178+
.unwrap(),
179+
EstimatedStatistic(attr1_ndistinct as f64)
180+
);
181+
182+
// Group by two columns should return the product of the n-distinct of the columns.
183+
let group_bys = list(vec![attr_index(0), attr_index(1)]);
184+
assert_eq!(
185+
cost_model
186+
.get_agg_row_cnt(TEST_GROUP1_ID, group_bys)
187+
.await
188+
.unwrap(),
189+
EstimatedStatistic((attr1_ndistinct * attr2_ndistinct) as f64)
190+
);
191+
192+
// Group by multiple columns should return the product of the n-distinct of the columns. If one of the columns
193+
// does not have stats, it should use the default value instead.
194+
let group_bys = list(vec![attr_index(0), attr_index(1), attr_index(2)]);
195+
assert_eq!(
196+
cost_model
197+
.get_agg_row_cnt(TEST_GROUP1_ID, group_bys)
198+
.await
199+
.unwrap(),
200+
EstimatedStatistic((attr1_ndistinct * attr2_ndistinct * DEFAULT_NUM_DISTINCT) as f64)
201+
);
202+
}
203+
}

optd-cost-model/src/cost/filter.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +0,0 @@
1-

optd-cost-model/src/cost/join.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +0,0 @@
1-

optd-cost-model/src/cost/limit.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
use crate::{
2+
common::{
3+
nodes::{ArcPredicateNode, ReprPredicateNode},
4+
predicates::constant_pred::ConstantPred,
5+
},
6+
cost_model::CostModelImpl,
7+
storage::CostModelStorageManager,
8+
CostModelResult, EstimatedStatistic,
9+
};
10+
11+
impl<S: CostModelStorageManager> CostModelImpl<S> {
12+
pub(crate) fn get_limit_row_cnt(
13+
&self,
14+
child_row_cnt: EstimatedStatistic,
15+
fetch_expr: ArcPredicateNode,
16+
) -> CostModelResult<EstimatedStatistic> {
17+
let fetch = ConstantPred::from_pred_node(fetch_expr)
18+
.unwrap()
19+
.value()
20+
.as_u64();
21+
// u64::MAX represents None
22+
if fetch == u64::MAX {
23+
Ok(child_row_cnt)
24+
} else {
25+
Ok(EstimatedStatistic(child_row_cnt.0.min(fetch as f64)))
26+
}
27+
}
28+
}

optd-cost-model/src/cost/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
#![allow(unused)]
2+
13
pub mod agg;
24
pub mod filter;
35
pub mod join;
6+
pub mod limit;

0 commit comments

Comments
 (0)