Skip to content

Commit 5c5a40f

Browse files
committed
Refactor filter implementation
1 parent cafd01c commit 5c5a40f

File tree

9 files changed

+764
-690
lines changed

9 files changed

+764
-690
lines changed

optd-cost-model/src/cost/filter.rs

Lines changed: 0 additions & 690 deletions
This file was deleted.
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
use std::ops::Bound;
2+
3+
use optd_persistent::CostModelStorageLayer;
4+
5+
use crate::{
6+
common::{types::TableId, values::Value},
7+
cost_model::CostModelImpl,
8+
// TODO: If we return the default value, consider telling the upper level that we cannot
9+
// compute the selectivity.
10+
stats::{AttributeCombValue, AttributeCombValueStats, DEFAULT_EQ_SEL, DEFAULT_INEQ_SEL},
11+
CostModelResult,
12+
};
13+
14+
impl<S: CostModelStorageLayer> CostModelImpl<S> {
15+
/// Get the selectivity of an expression of the form "attribute equals value" (or "value equals
16+
/// attribute") Will handle the case of statistics missing
17+
/// Equality predicates are handled entirely differently from range predicates so this is its
18+
/// own function
19+
/// Also, get_attribute_equality_selectivity is a subroutine when computing range
20+
/// selectivity, which is another reason for separating these into two functions
21+
/// is_eq means whether it's == or !=
22+
pub(crate) fn get_attribute_equality_selectivity(
23+
&self,
24+
table_id: TableId,
25+
attr_base_index: usize,
26+
value: &Value,
27+
is_eq: bool,
28+
) -> CostModelResult<f64> {
29+
// TODO: The attribute could be a derived attribute
30+
let ret_sel = {
31+
if let Some(attribute_stats) =
32+
self.get_attribute_comb_stats(table_id, &[attr_base_index])?
33+
{
34+
let eq_freq =
35+
if let Some(freq) = attribute_stats.mcvs.freq(&vec![Some(value.clone())]) {
36+
freq
37+
} else {
38+
let non_mcv_freq = 1.0 - attribute_stats.mcvs.total_freq();
39+
// always safe because usize is at least as large as i32
40+
let ndistinct_as_usize = attribute_stats.ndistinct as usize;
41+
let non_mcv_cnt = ndistinct_as_usize - attribute_stats.mcvs.cnt();
42+
if non_mcv_cnt == 0 {
43+
return Ok(0.0);
44+
}
45+
// note that nulls are not included in ndistinct so we don't need to do non_mcv_cnt
46+
// - 1 if null_frac > 0
47+
(non_mcv_freq - attribute_stats.null_frac) / (non_mcv_cnt as f64)
48+
};
49+
if is_eq {
50+
eq_freq
51+
} else {
52+
1.0 - eq_freq - attribute_stats.null_frac
53+
}
54+
} else {
55+
#[allow(clippy::collapsible_else_if)]
56+
if is_eq {
57+
DEFAULT_EQ_SEL
58+
} else {
59+
1.0 - DEFAULT_EQ_SEL
60+
}
61+
}
62+
};
63+
64+
assert!(
65+
(0.0..=1.0).contains(&ret_sel),
66+
"ret_sel ({}) should be in [0, 1]",
67+
ret_sel
68+
);
69+
Ok(ret_sel)
70+
}
71+
72+
/// Compute the frequency of values in a attribute less than or equal to the given value.
73+
fn get_attribute_leq_value_freq(
74+
per_attribute_stats: &AttributeCombValueStats,
75+
value: &Value,
76+
) -> f64 {
77+
// because distr does not include the values in MCVs, we need to compute the CDFs there as
78+
// well because nulls return false in any comparison, they are never included when
79+
// computing range selectivity
80+
let distr_leq_freq = per_attribute_stats.distr.as_ref().unwrap().cdf(value);
81+
let value = value.clone();
82+
let pred = Box::new(move |val: &AttributeCombValue| *val[0].as_ref().unwrap() <= value);
83+
let mcvs_leq_freq = per_attribute_stats.mcvs.freq_over_pred(pred);
84+
let ret_freq = distr_leq_freq + mcvs_leq_freq;
85+
assert!(
86+
(0.0..=1.0).contains(&ret_freq),
87+
"ret_freq ({}) should be in [0, 1]",
88+
ret_freq
89+
);
90+
ret_freq
91+
}
92+
93+
/// Compute the frequency of values in a attribute less than the given value.
94+
fn get_attribute_lt_value_freq(
95+
&self,
96+
attribute_stats: &AttributeCombValueStats,
97+
table_id: TableId,
98+
attr_base_index: usize,
99+
value: &Value,
100+
) -> CostModelResult<f64> {
101+
// depending on whether value is in mcvs or not, we use different logic to turn total_lt_cdf
102+
// into total_leq_cdf this logic just so happens to be the exact same logic as
103+
// get_attribute_equality_selectivity implements
104+
let ret_freq = Self::get_attribute_leq_value_freq(attribute_stats, value)
105+
- self.get_attribute_equality_selectivity(table_id, attr_base_index, value, true)?;
106+
assert!(
107+
(0.0..=1.0).contains(&ret_freq),
108+
"ret_freq ({}) should be in [0, 1]",
109+
ret_freq
110+
);
111+
Ok(ret_freq)
112+
}
113+
114+
/// Get the selectivity of an expression of the form "attribute </<=/>=/> value" (or "value
115+
/// </<=/>=/> attribute"). Computes selectivity based off of statistics.
116+
/// Range predicates are handled entirely differently from equality predicates so this is its
117+
/// own function. If it is unable to find the statistics, it returns DEFAULT_INEQ_SEL.
118+
/// The selectivity is computed as quantile of the right bound minus quantile of the left bound.
119+
pub(crate) fn get_attribute_range_selectivity(
120+
&self,
121+
table_id: TableId,
122+
attr_base_index: usize,
123+
start: Bound<&Value>,
124+
end: Bound<&Value>,
125+
) -> CostModelResult<f64> {
126+
// TODO: Consider attribute is a derived attribute
127+
if let Some(attribute_stats) =
128+
self.get_attribute_comb_stats(table_id, &[attr_base_index])?
129+
{
130+
let left_quantile = match start {
131+
Bound::Unbounded => 0.0,
132+
Bound::Included(value) => self.get_attribute_lt_value_freq(
133+
&attribute_stats,
134+
table_id,
135+
attr_base_index,
136+
value,
137+
)?,
138+
Bound::Excluded(value) => {
139+
Self::get_attribute_leq_value_freq(&attribute_stats, value)
140+
}
141+
};
142+
let right_quantile = match end {
143+
Bound::Unbounded => 1.0,
144+
Bound::Included(value) => {
145+
Self::get_attribute_leq_value_freq(&attribute_stats, value)
146+
}
147+
Bound::Excluded(value) => self.get_attribute_lt_value_freq(
148+
&attribute_stats,
149+
table_id,
150+
attr_base_index,
151+
value,
152+
)?,
153+
};
154+
assert!(
155+
left_quantile <= right_quantile,
156+
"left_quantile ({}) should be <= right_quantile ({})",
157+
left_quantile,
158+
right_quantile
159+
);
160+
Ok(right_quantile - left_quantile)
161+
} else {
162+
Ok(DEFAULT_INEQ_SEL)
163+
}
164+
}
165+
}

0 commit comments

Comments
 (0)