Skip to content

Commit 795ca1f

Browse files
committed
Add filter implementation
1 parent efd01f8 commit 795ca1f

File tree

9 files changed

+1796
-0
lines changed

9 files changed

+1796
-0
lines changed

optd-cost-model/src/cost/filter.rs

Whitespace-only changes.
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
use std::ops::Bound;
2+
3+
use crate::{
4+
common::{types::TableId, values::Value},
5+
cost_model::CostModelImpl,
6+
stats::{AttributeCombValue, AttributeCombValueStats, DEFAULT_EQ_SEL, DEFAULT_INEQ_SEL},
7+
storage::CostModelStorageManager,
8+
CostModelResult,
9+
};
10+
11+
impl<S: CostModelStorageManager> CostModelImpl<S> {
12+
/// Get the selectivity of an expression of the form "attribute equals value" (or "value equals
13+
/// attribute") Will handle the case of statistics missing
14+
/// Equality predicates are handled entirely differently from range predicates so this is its
15+
/// own function
16+
/// Also, get_attribute_equality_selectivity is a subroutine when computing range
17+
/// selectivity, which is another reason for separating these into two functions
18+
/// is_eq means whether it's == or !=
19+
///
20+
/// Currently, we only support calculating the equality selectivity for an existed attribute,
21+
/// not a derived attribute.
22+
/// TODO: Support derived attributes.
23+
pub(crate) async fn get_attribute_equality_selectivity(
24+
&self,
25+
table_id: TableId,
26+
attr_base_index: u64,
27+
value: &Value,
28+
is_eq: bool,
29+
) -> CostModelResult<f64> {
30+
let ret_sel = {
31+
if let Some(attribute_stats) = self
32+
.get_attribute_comb_stats(table_id, &[attr_base_index])
33+
.await?
34+
{
35+
let eq_freq =
36+
if let Some(freq) = attribute_stats.mcvs.freq(&vec![Some(value.clone())]) {
37+
freq
38+
} else {
39+
let non_mcv_freq = 1.0 - attribute_stats.mcvs.total_freq();
40+
// always safe because usize is at least as large as i32
41+
let ndistinct_as_usize = attribute_stats.ndistinct as usize;
42+
let non_mcv_cnt = ndistinct_as_usize - attribute_stats.mcvs.cnt();
43+
if non_mcv_cnt == 0 {
44+
return Ok(0.0);
45+
}
46+
// note that nulls are not included in ndistinct so we don't need to do non_mcv_cnt
47+
// - 1 if null_frac > 0
48+
(non_mcv_freq - attribute_stats.null_frac) / (non_mcv_cnt as f64)
49+
};
50+
if is_eq {
51+
eq_freq
52+
} else {
53+
1.0 - eq_freq - attribute_stats.null_frac
54+
}
55+
} else {
56+
#[allow(clippy::collapsible_else_if)]
57+
if is_eq {
58+
DEFAULT_EQ_SEL
59+
} else {
60+
1.0 - DEFAULT_EQ_SEL
61+
}
62+
}
63+
};
64+
65+
assert!(
66+
(0.0..=1.0).contains(&ret_sel),
67+
"ret_sel ({}) should be in [0, 1]",
68+
ret_sel
69+
);
70+
Ok(ret_sel)
71+
}
72+
73+
/// Compute the frequency of values in a attribute less than or equal to the given value.
74+
fn get_attribute_leq_value_freq(
75+
per_attribute_stats: &AttributeCombValueStats,
76+
value: &Value,
77+
) -> f64 {
78+
// because distr does not include the values in MCVs, we need to compute the CDFs there as
79+
// well because nulls return false in any comparison, they are never included when
80+
// computing range selectivity
81+
let distr_leq_freq = per_attribute_stats.distr.as_ref().unwrap().cdf(value);
82+
let value = value.clone();
83+
let pred = Box::new(move |val: &AttributeCombValue| *val[0].as_ref().unwrap() <= value);
84+
let mcvs_leq_freq = per_attribute_stats.mcvs.freq_over_pred(pred);
85+
let ret_freq = distr_leq_freq + mcvs_leq_freq;
86+
assert!(
87+
(0.0..=1.0).contains(&ret_freq),
88+
"ret_freq ({}) should be in [0, 1]",
89+
ret_freq
90+
);
91+
ret_freq
92+
}
93+
94+
/// Compute the frequency of values in a attribute less than the given value.
95+
///
96+
/// Currently, we only support calculating the equality selectivity for an existed attribute,
97+
/// not a derived attribute.
98+
/// TODO: Support derived attributes.
99+
async fn get_attribute_lt_value_freq(
100+
&self,
101+
attribute_stats: &AttributeCombValueStats,
102+
table_id: TableId,
103+
attr_base_index: u64,
104+
value: &Value,
105+
) -> CostModelResult<f64> {
106+
// depending on whether value is in mcvs or not, we use different logic to turn total_lt_cdf
107+
// into total_leq_cdf this logic just so happens to be the exact same logic as
108+
// get_attribute_equality_selectivity implements
109+
let ret_freq = Self::get_attribute_leq_value_freq(attribute_stats, value)
110+
- self
111+
.get_attribute_equality_selectivity(table_id, attr_base_index, value, true)
112+
.await?;
113+
assert!(
114+
(0.0..=1.0).contains(&ret_freq),
115+
"ret_freq ({}) should be in [0, 1]",
116+
ret_freq
117+
);
118+
Ok(ret_freq)
119+
}
120+
121+
/// Get the selectivity of an expression of the form "attribute </<=/>=/> value" (or "value
122+
/// </<=/>=/> attribute"). Computes selectivity based off of statistics.
123+
/// Range predicates are handled entirely differently from equality predicates so this is its
124+
/// own function. If it is unable to find the statistics, it returns DEFAULT_INEQ_SEL.
125+
/// The selectivity is computed as quantile of the right bound minus quantile of the left bound.
126+
///
127+
/// Currently, we only support calculating the equality selectivity for an existed attribute,
128+
/// not a derived attribute.
129+
/// TODO: Support derived attributes.
130+
pub(crate) async fn get_attribute_range_selectivity(
131+
&self,
132+
table_id: TableId,
133+
attr_base_index: u64,
134+
start: Bound<&Value>,
135+
end: Bound<&Value>,
136+
) -> CostModelResult<f64> {
137+
// TODO: Consider attribute is a derived attribute
138+
if let Some(attribute_stats) = self
139+
.get_attribute_comb_stats(table_id, &[attr_base_index])
140+
.await?
141+
{
142+
let left_quantile = match start {
143+
Bound::Unbounded => 0.0,
144+
Bound::Included(value) => {
145+
self.get_attribute_lt_value_freq(
146+
&attribute_stats,
147+
table_id,
148+
attr_base_index,
149+
value,
150+
)
151+
.await?
152+
}
153+
Bound::Excluded(value) => {
154+
Self::get_attribute_leq_value_freq(&attribute_stats, value)
155+
}
156+
};
157+
let right_quantile = match end {
158+
Bound::Unbounded => 1.0,
159+
Bound::Included(value) => {
160+
Self::get_attribute_leq_value_freq(&attribute_stats, value)
161+
}
162+
Bound::Excluded(value) => {
163+
self.get_attribute_lt_value_freq(
164+
&attribute_stats,
165+
table_id,
166+
attr_base_index,
167+
value,
168+
)
169+
.await?
170+
}
171+
};
172+
assert!(
173+
left_quantile <= right_quantile,
174+
"left_quantile ({}) should be <= right_quantile ({})",
175+
left_quantile,
176+
right_quantile
177+
);
178+
Ok(right_quantile - left_quantile)
179+
} else {
180+
Ok(DEFAULT_INEQ_SEL)
181+
}
182+
}
183+
}

0 commit comments

Comments
 (0)