Skip to content

Commit 2015d97

Browse files
committed
Add get_attribute_comb_stats
1 parent 73244e9 commit 2015d97

File tree

5 files changed

+93
-68
lines changed

5 files changed

+93
-68
lines changed

optd-cost-model/src/common/nodes.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@ pub struct PredicateNode {
7979
/// Child predicate nodes, always materialized
8080
pub children: Vec<ArcPredicateNode>,
8181
/// Data associated with the predicate, if any
82+
/// TODO: If it is PredicateType::AttributeRef, then
83+
/// the data is the attribute index. But we need more information
84+
/// to represent this attribute in case it is a derived attribute.
8285
pub data: Option<Value>,
8386
}
8487

optd-cost-model/src/cost/filter.rs

Lines changed: 56 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -263,43 +263,36 @@ impl<S: CostModelStorageLayer> CostModelImpl<S> {
263263
is_eq: bool,
264264
) -> CostModelResult<f64> {
265265
// TODO: The attribute could be a derived attribute
266-
todo!()
267-
// let ret_sel = if let Some(attribute_stats) =
268-
// self.get_attribute_comb_stats(table_id, &[attr_base_index])
269-
// {
270-
// let eq_freq = if let Some(freq) = attribute_stats.mcvs.freq(&vec![Some(value.clone())]) {
271-
// freq
272-
// } else {
273-
// let non_mcv_freq = 1.0 - attribute_stats.mcvs.total_freq();
274-
// // always safe because usize is at least as large as i32
275-
// let ndistinct_as_usize = attribute_stats.ndistinct as usize;
276-
// let non_mcv_cnt = ndistinct_as_usize - attribute_stats.mcvs.cnt();
277-
// if non_mcv_cnt == 0 {
278-
// return 0.0;
279-
// }
280-
// // note that nulls are not included in ndistinct so we don't need to do non_mcv_cnt
281-
// // - 1 if null_frac > 0
282-
// (non_mcv_freq - attribute_stats.null_frac) / (non_mcv_cnt as f64)
283-
// };
284-
// if is_eq {
285-
// eq_freq
286-
// } else {
287-
// 1.0 - eq_freq - attribute_stats.null_frac
288-
// }
289-
// } else {
290-
// #[allow(clippy::collapsible_else_if)]
291-
// if is_eq {
292-
// DEFAULT_EQ_SEL
293-
// } else {
294-
// 1.0 - DEFAULT_EQ_SEL
295-
// }
296-
// };
297-
// assert!(
298-
// (0.0..=1.0).contains(&ret_sel),
299-
// "ret_sel ({}) should be in [0, 1]",
300-
// ret_sel
301-
// );
302-
// ret_sel
266+
let ret_sel = {
267+
let attribute_stats = self.get_attribute_comb_stats(table_id, &[attr_base_index])?;
268+
let eq_freq = if let Some(freq) = attribute_stats.mcvs.freq(&vec![Some(value.clone())])
269+
{
270+
freq
271+
} else {
272+
let non_mcv_freq = 1.0 - attribute_stats.mcvs.total_freq();
273+
// always safe because usize is at least as large as i32
274+
let ndistinct_as_usize = attribute_stats.ndistinct as usize;
275+
let non_mcv_cnt = ndistinct_as_usize - attribute_stats.mcvs.cnt();
276+
if non_mcv_cnt == 0 {
277+
return Ok(0.0);
278+
}
279+
// note that nulls are not included in ndistinct so we don't need to do non_mcv_cnt
280+
// - 1 if null_frac > 0
281+
(non_mcv_freq - attribute_stats.null_frac) / (non_mcv_cnt as f64)
282+
};
283+
if is_eq {
284+
eq_freq
285+
} else {
286+
1.0 - eq_freq - attribute_stats.null_frac
287+
}
288+
};
289+
290+
assert!(
291+
(0.0..=1.0).contains(&ret_sel),
292+
"ret_sel ({}) should be in [0, 1]",
293+
ret_sel
294+
);
295+
Ok(ret_sel)
303296
}
304297

305298
/// Get the selectivity of an expression of the form "attribute </<=/>=/> value" (or "value
@@ -315,33 +308,33 @@ impl<S: CostModelStorageLayer> CostModelImpl<S> {
315308
end: Bound<&Value>,
316309
) -> CostModelResult<f64> {
317310
// TODO: Consider the case where the attribute is a derived attribute
311+
let attribute_stats = self.get_attribute_comb_stats(table_id, &[attr_base_index])?;
318312
todo!()
319-
// if let Some(attribute_stats) = self.get_attribute_comb_stats(table, &[attr_idx]) {
320-
// // Left and right quantile contain both Distribution and MCVs.
321-
// let left_quantile = match start {
322-
// Bound::Unbounded => 0.0,
323-
// Bound::Included(value) => {
324-
// self.get_attribute_lt_value_freq(attribute_stats, table, attr_idx, value)
325-
// }
326-
// Bound::Excluded(value) => Self::get_attribute_leq_value_freq(attribute_stats, value),
327-
// };
328-
// let right_quantile = match end {
329-
// Bound::Unbounded => 1.0,
330-
// Bound::Included(value) => Self::get_attribute_leq_value_freq(attribute_stats, value),
331-
// Bound::Excluded(value) => {
332-
// self.get_attribute_lt_value_freq(attribute_stats, table, attr_idx, value)
333-
// }
334-
// };
335-
// assert!(
336-
// left_quantile <= right_quantile,
337-
// "left_quantile ({}) should be <= right_quantile ({})",
338-
// left_quantile,
339-
// right_quantile
340-
// );
341-
// right_quantile - left_quantile
342-
// } else {
343-
// DEFAULT_INEQ_SEL
344-
// }
313+
// let left_quantile = match start {
314+
// Bound::Unbounded => 0.0,
315+
// Bound::Included(value) => {
316+
// self.get_attribute_lt_value_freq(attribute_stats, table, attr_idx, value)
317+
// }
318+
// Bound::Excluded(value) => {
319+
// Self::get_attribute_leq_value_freq(attribute_stats, value)
320+
// }
321+
// };
322+
// let right_quantile = match end {
323+
// Bound::Unbounded => 1.0,
324+
// Bound::Included(value) => {
325+
// Self::get_attribute_leq_value_freq(attribute_stats, value)
326+
// }
327+
// Bound::Excluded(value) => {
328+
// self.get_attribute_lt_value_freq(attribute_stats, table, attr_idx, value)
329+
// }
330+
// };
331+
// assert!(
332+
// left_quantile <= right_quantile,
333+
// "left_quantile ({}) should be <= right_quantile ({})",
334+
// left_quantile,
335+
// right_quantile
336+
// );
337+
// right_quantile - left_quantile
345338
}
346339

347340
/// Compute the selectivity of a (NOT) LIKE expression.

optd-cost-model/src/cost_model.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ use crate::{
1212
nodes::{ArcPredicateNode, PhysicalNodeType},
1313
types::{AttrId, EpochId, ExprId, TableId},
1414
},
15+
stats::AttributeCombValueStats,
1516
storage::CostModelStorageManager,
1617
ComputeCostContext, Cost, CostModel, CostModelResult, EstimatedStatistic, StatValue,
1718
};
@@ -67,7 +68,6 @@ impl<S: CostModelStorageLayer + std::marker::Sync + 'static> CostModel for CostM
6768

6869
fn get_table_statistic_for_analysis(
6970
&self,
70-
// TODO: i32 should be changed to TableId.
7171
table_id: TableId,
7272
stat_type: StatType,
7373
epoch_id: Option<EpochId>,
@@ -92,3 +92,17 @@ impl<S: CostModelStorageLayer + std::marker::Sync + 'static> CostModel for CostM
9292
todo!()
9393
}
9494
}
95+
96+
impl<S: CostModelStorageLayer> CostModelImpl<S> {
97+
/// TODO: documentation
98+
/// TODO: once we have an in-memory cache, this should
99+
/// return a reference (&AttributeCombValueStats) instead of an owned value.
100+
pub(crate) fn get_attribute_comb_stats(
101+
&self,
102+
table_id: TableId,
103+
attr_comb: &[usize],
104+
) -> CostModelResult<AttributeCombValueStats> {
105+
self.storage_manager
106+
.get_attributes_comb_statistics(table_id, attr_comb)
107+
}
108+
}

optd-cost-model/src/stats/mod.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,19 @@ impl MostCommonValues {
2323
// is O(1) instead of O(n) and total_freq() can be cached)
2424
// additionally, it makes sense to return an Option<f64> for freq() instead of just 0 if value
2525
// doesn't exist; thus, I expose three different functions
26-
fn freq(&self, value: &AttributeCombValue) -> Option<f64> {
26+
pub fn freq(&self, value: &AttributeCombValue) -> Option<f64> {
2727
match self {
2828
MostCommonValues::Counter(counter) => counter.frequencies().get(value).copied(),
2929
}
3030
}
31-
fn total_freq(&self) -> f64 {
31+
32+
pub fn total_freq(&self) -> f64 {
3233
match self {
3334
MostCommonValues::Counter(counter) => counter.frequencies().values().sum(),
3435
}
3536
}
3637

37-
fn freq_over_pred(&self, pred: Box<dyn Fn(&AttributeCombValue) -> bool>) -> f64 {
38+
pub fn freq_over_pred(&self, pred: Box<dyn Fn(&AttributeCombValue) -> bool>) -> f64 {
3839
match self {
3940
MostCommonValues::Counter(counter) => counter
4041
.frequencies()
@@ -46,7 +47,7 @@ impl MostCommonValues {
4647
}
4748

4849
// returns the # of entries (i.e. value + freq) in the most common values structure
49-
fn cnt(&self) -> usize {
50+
pub fn cnt(&self) -> usize {
5051
match self {
5152
MostCommonValues::Counter(counter) => counter.frequencies().len(),
5253
}

optd-cost-model/src/storage.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ use serde::{Deserialize, Serialize};
66

77
use crate::{
88
common::{predicates::constant_pred::ConstantType, types::TableId},
9+
stats::AttributeCombValueStats,
910
CostModelResult,
1011
};
1112

@@ -39,11 +40,24 @@ impl<S: CostModelStorageLayer> CostModelStorageManager<S> {
3940
}
4041

4142
/// TODO: documentation
43+
/// TODO: once we have an in-memory cache, this should
44+
/// return a reference (&Field) instead of an owned value.
4245
pub fn get_attribute_info(
4346
&self,
4447
table_id: TableId,
4548
attribute_base_index: i32,
4649
) -> CostModelResult<Field> {
4750
todo!()
4851
}
52+
53+
/// TODO: documentation
54+
/// TODO: once we have an in-memory cache, this should
55+
/// return a reference (&AttributeCombValueStats) instead of an owned value.
56+
pub fn get_attributes_comb_statistics(
57+
&self,
58+
table_id: TableId,
59+
attr_comb: &[usize],
60+
) -> CostModelResult<AttributeCombValueStats> {
61+
todo!()
62+
}
4963
}

0 commit comments

Comments
 (0)