Skip to content

Commit cbddfd0

Browse files
authored
feat(cost-model): Add filter implementation (#41)
* Add filter implementation * modify the comment
1 parent efd01f8 commit cbddfd0

File tree

9 files changed

+1795
-0
lines changed

9 files changed

+1795
-0
lines changed

optd-cost-model/src/cost/filter.rs

Whitespace-only changes.
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
use std::ops::Bound;
2+
3+
use crate::{
4+
common::{types::TableId, values::Value},
5+
cost_model::CostModelImpl,
6+
stats::{AttributeCombValue, AttributeCombValueStats, DEFAULT_EQ_SEL, DEFAULT_INEQ_SEL},
7+
storage::CostModelStorageManager,
8+
CostModelResult,
9+
};
10+
11+
impl<S: CostModelStorageManager> CostModelImpl<S> {
12+
/// Get the selectivity of an expression of the form "attribute equals value" (or "value equals
13+
/// attribute") Will handle the case of statistics missing
14+
/// Equality predicates are handled entirely differently from range predicates so this is its
15+
/// own function
16+
/// Also, get_attribute_equality_selectivity is a subroutine when computing range selectivity,
17+
/// which is another reason for separating these into two functions is_eq means whether it's == or !=
18+
///
19+
/// Currently, we only support calculating the equality selectivity for an existed attribute,
20+
/// not a derived attribute.
21+
/// TODO: Support derived attributes.
22+
pub(crate) async fn get_attribute_equality_selectivity(
23+
&self,
24+
table_id: TableId,
25+
attr_base_index: u64,
26+
value: &Value,
27+
is_eq: bool,
28+
) -> CostModelResult<f64> {
29+
let ret_sel = {
30+
if let Some(attribute_stats) = self
31+
.get_attribute_comb_stats(table_id, &[attr_base_index])
32+
.await?
33+
{
34+
let eq_freq =
35+
if let Some(freq) = attribute_stats.mcvs.freq(&vec![Some(value.clone())]) {
36+
freq
37+
} else {
38+
let non_mcv_freq = 1.0 - attribute_stats.mcvs.total_freq();
39+
// always safe because usize is at least as large as i32
40+
let ndistinct_as_usize = attribute_stats.ndistinct as usize;
41+
let non_mcv_cnt = ndistinct_as_usize - attribute_stats.mcvs.cnt();
42+
if non_mcv_cnt == 0 {
43+
return Ok(0.0);
44+
}
45+
// note that nulls are not included in ndistinct so we don't need to do non_mcv_cnt
46+
// - 1 if null_frac > 0
47+
(non_mcv_freq - attribute_stats.null_frac) / (non_mcv_cnt as f64)
48+
};
49+
if is_eq {
50+
eq_freq
51+
} else {
52+
1.0 - eq_freq - attribute_stats.null_frac
53+
}
54+
} else {
55+
#[allow(clippy::collapsible_else_if)]
56+
if is_eq {
57+
DEFAULT_EQ_SEL
58+
} else {
59+
1.0 - DEFAULT_EQ_SEL
60+
}
61+
}
62+
};
63+
64+
assert!(
65+
(0.0..=1.0).contains(&ret_sel),
66+
"ret_sel ({}) should be in [0, 1]",
67+
ret_sel
68+
);
69+
Ok(ret_sel)
70+
}
71+
72+
/// Compute the frequency of values in a attribute less than or equal to the given value.
73+
fn get_attribute_leq_value_freq(
74+
per_attribute_stats: &AttributeCombValueStats,
75+
value: &Value,
76+
) -> f64 {
77+
// because distr does not include the values in MCVs, we need to compute the CDFs there as
78+
// well because nulls return false in any comparison, they are never included when
79+
// computing range selectivity
80+
let distr_leq_freq = per_attribute_stats.distr.as_ref().unwrap().cdf(value);
81+
let value = value.clone();
82+
let pred = Box::new(move |val: &AttributeCombValue| *val[0].as_ref().unwrap() <= value);
83+
let mcvs_leq_freq = per_attribute_stats.mcvs.freq_over_pred(pred);
84+
let ret_freq = distr_leq_freq + mcvs_leq_freq;
85+
assert!(
86+
(0.0..=1.0).contains(&ret_freq),
87+
"ret_freq ({}) should be in [0, 1]",
88+
ret_freq
89+
);
90+
ret_freq
91+
}
92+
93+
/// Compute the frequency of values in a attribute less than the given value.
94+
///
95+
/// Currently, we only support calculating the equality selectivity for an existed attribute,
96+
/// not a derived attribute.
97+
/// TODO: Support derived attributes.
98+
async fn get_attribute_lt_value_freq(
99+
&self,
100+
attribute_stats: &AttributeCombValueStats,
101+
table_id: TableId,
102+
attr_base_index: u64,
103+
value: &Value,
104+
) -> CostModelResult<f64> {
105+
// depending on whether value is in mcvs or not, we use different logic to turn total_lt_cdf
106+
// into total_leq_cdf this logic just so happens to be the exact same logic as
107+
// get_attribute_equality_selectivity implements
108+
let ret_freq = Self::get_attribute_leq_value_freq(attribute_stats, value)
109+
- self
110+
.get_attribute_equality_selectivity(table_id, attr_base_index, value, true)
111+
.await?;
112+
assert!(
113+
(0.0..=1.0).contains(&ret_freq),
114+
"ret_freq ({}) should be in [0, 1]",
115+
ret_freq
116+
);
117+
Ok(ret_freq)
118+
}
119+
120+
/// Get the selectivity of an expression of the form "attribute </<=/>=/> value" (or "value
121+
/// </<=/>=/> attribute"). Computes selectivity based off of statistics.
122+
/// Range predicates are handled entirely differently from equality predicates so this is its
123+
/// own function. If it is unable to find the statistics, it returns DEFAULT_INEQ_SEL.
124+
/// The selectivity is computed as quantile of the right bound minus quantile of the left bound.
125+
///
126+
/// Currently, we only support calculating the equality selectivity for an existed attribute,
127+
/// not a derived attribute.
128+
/// TODO: Support derived attributes.
129+
pub(crate) async fn get_attribute_range_selectivity(
130+
&self,
131+
table_id: TableId,
132+
attr_base_index: u64,
133+
start: Bound<&Value>,
134+
end: Bound<&Value>,
135+
) -> CostModelResult<f64> {
136+
// TODO: Consider attribute is a derived attribute
137+
if let Some(attribute_stats) = self
138+
.get_attribute_comb_stats(table_id, &[attr_base_index])
139+
.await?
140+
{
141+
let left_quantile = match start {
142+
Bound::Unbounded => 0.0,
143+
Bound::Included(value) => {
144+
self.get_attribute_lt_value_freq(
145+
&attribute_stats,
146+
table_id,
147+
attr_base_index,
148+
value,
149+
)
150+
.await?
151+
}
152+
Bound::Excluded(value) => {
153+
Self::get_attribute_leq_value_freq(&attribute_stats, value)
154+
}
155+
};
156+
let right_quantile = match end {
157+
Bound::Unbounded => 1.0,
158+
Bound::Included(value) => {
159+
Self::get_attribute_leq_value_freq(&attribute_stats, value)
160+
}
161+
Bound::Excluded(value) => {
162+
self.get_attribute_lt_value_freq(
163+
&attribute_stats,
164+
table_id,
165+
attr_base_index,
166+
value,
167+
)
168+
.await?
169+
}
170+
};
171+
assert!(
172+
left_quantile <= right_quantile,
173+
"left_quantile ({}) should be <= right_quantile ({})",
174+
left_quantile,
175+
right_quantile
176+
);
177+
Ok(right_quantile - left_quantile)
178+
} else {
179+
Ok(DEFAULT_INEQ_SEL)
180+
}
181+
}
182+
}

0 commit comments

Comments
 (0)