Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
c92a3af
Partial migration of filter
lanlou1554 Nov 14, 2024
7b0158c
Change col to attr
lanlou1554 Nov 14, 2024
81f8d50
implement cost computation for limit
xx01cyx Nov 14, 2024
2a5740e
add author
xx01cyx Nov 14, 2024
f7f6857
introduce ColumnCombValueStats
xx01cyx Nov 14, 2024
be430ac
refactor AttributeCombValueStats and introduce statistic-related data…
xx01cyx Nov 14, 2024
089cfef
Change col to attr in filter
lanlou1554 Nov 14, 2024
69607f1
Complete partial implementation of filter
lanlou1554 Nov 14, 2024
6518e00
Add get_attribute_comb_stats
lanlou1554 Nov 14, 2024
59a8889
Finish first draft version of filter functionality
lanlou1554 Nov 14, 2024
5070a78
Add comment for the guideline of re-designing PredicateNode
lanlou1554 Nov 15, 2024
740ab11
introduce IdPred and make AttributeRefPred store table id and attr index
xx01cyx Nov 15, 2024
85cd0d1
add get method for id pred and add comments
xx01cyx Nov 15, 2024
7775b88
add check for derived column in AttributeRefPred
xx01cyx Nov 15, 2024
b60c632
make get_attributes_comb_statistics return Option
xx01cyx Nov 15, 2024
3646eca
implement agg cost computation
xx01cyx Nov 15, 2024
db555ff
move filter-related constants to stats crate
xx01cyx Nov 15, 2024
64f4a10
fix clippy
xx01cyx Nov 15, 2024
cafd01c
Resolve the optional comb stats, remove table id in filter
lanlou1554 Nov 15, 2024
5c5a40f
Refactor filter implementation
lanlou1554 Nov 15, 2024
dd6598a
Resolve conflict with main
lanlou1554 Nov 16, 2024
03b6ec3
Refactor cost model storage
lanlou1554 Nov 16, 2024
a3b8088
Move storage attribute to mod
lanlou1554 Nov 16, 2024
c07b9fc
Add initial test framework in cost_model.rs
lanlou1554 Nov 16, 2024
86f6fc2
Fix typo in initial test framework
lanlou1554 Nov 16, 2024
2c1f09b
Modify initial test framework
lanlou1554 Nov 17, 2024
ebab829
Finish most tests for filter
lanlou1554 Nov 17, 2024
a8f92c3
Finish all tests for filter
lanlou1554 Nov 17, 2024
2c9240f
Add important tricky todo
lanlou1554 Nov 17, 2024
d6e1825
Improve filter tests
lanlou1554 Nov 17, 2024
082f0be
refine test infra
xx01cyx Nov 17, 2024
e183f02
add test for cost model agg
xx01cyx Nov 17, 2024
0059141
make all data types u64 instead of usize
xx01cyx Nov 17, 2024
303d73c
merge main and resolve conflicts
xx01cyx Nov 18, 2024
ec0afa6
copy paste join cardinality calculation
xx01cyx Nov 18, 2024
6d50843
make join compile
xx01cyx Nov 18, 2024
a4ff526
rename col -> attr
xx01cyx Nov 18, 2024
0ba4132
refactor join to not pass in logical props
xx01cyx Nov 18, 2024
ab15f05
make statistics f64 instead of u64
xx01cyx Nov 18, 2024
b682c73
split join into multiple files
xx01cyx Nov 18, 2024
5d73141
reorganize join
xx01cyx Nov 18, 2024
51f917d
refine test infra
xx01cyx Nov 18, 2024
5197090
add test infra for join
xx01cyx Nov 18, 2024
68b2885
refine mock interface
xx01cyx Nov 18, 2024
36b93b9
make CostModelStorageManagerImpl::get_attribute_info unimplemented
xx01cyx Nov 18, 2024
11a3a4e
modify MemoExt interface
xx01cyx Nov 18, 2024
8c4191f
rename AttrRefPred -> AttrIndexPred and revert back to initial design
xx01cyx Nov 18, 2024
1569fc5
Modify the tests of filter and agg
lanlou1554 Nov 19, 2024
489ff48
add join test
xx01cyx Nov 19, 2024
be71afb
pass group id to join and fix filter-related tests
xx01cyx Nov 19, 2024
624d040
fix all join tests
xx01cyx Nov 19, 2024
f8a0e70
Change filter controller name
lanlou1554 Nov 19, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions optd-cost-model/src/common/predicates/constant_pred.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::sync::Arc;

use arrow_schema::{DataType, IntervalUnit};
use optd_persistent::cost_model::interface::AttrType;
use serde::{Deserialize, Serialize};

use crate::common::{
Expand Down Expand Up @@ -90,6 +91,15 @@ impl ConstantType {
ConstantType::Utf8String => DataType::Utf8,
}
}

/// Converts an attribute type reported by the persistent cost-model
/// interface into the matching [`ConstantType`].
///
/// Every `AttrType` variant handled here has exactly one corresponding
/// constant type, so the conversion cannot fail.
// NOTE(review): `AttrType::Integer` is mapped to `ConstantType::Int32` —
// confirm this matches the integer width used by the persistent layer.
pub fn from_persistent_attr_type(attr_type: AttrType) -> Self {
    match attr_type {
        AttrType::Boolean => ConstantType::Bool,
        AttrType::Integer => ConstantType::Int32,
        AttrType::Float => ConstantType::Float64,
        AttrType::Varchar => ConstantType::Utf8String,
    }
}
}

#[derive(Clone, Debug)]
Expand Down
20 changes: 11 additions & 9 deletions optd-cost-model/src/cost/agg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ use crate::{
},
cost_model::CostModelImpl,
stats::DEFAULT_NUM_DISTINCT,
CostModelError, CostModelResult, EstimatedStatistic,
CostModelError, CostModelResult, EstimatedStatistic, SemanticError,
};

impl<S: CostModelStorageLayer> CostModelImpl<S> {
pub fn get_agg_row_cnt(
pub async fn get_agg_row_cnt(
&self,
group_by: ArcPredicateNode,
) -> CostModelResult<EstimatedStatistic> {
Expand All @@ -22,22 +22,24 @@ impl<S: CostModelStorageLayer> CostModelImpl<S> {
} else {
// Multiply the n-distinct of all the group by columns.
// TODO: improve with multi-dimensional n-distinct
let row_cnt = group_by.0.children.iter().try_fold(1, |acc, node| {
let mut row_cnt = 1;

for node in &group_by.0.children {
match node.typ {
PredicateType::AttributeRef => {
let attr_ref =
AttributeRefPred::from_pred_node(node.clone()).ok_or_else(|| {
CostModelError::InvalidPredicate(
SemanticError::InvalidPredicate(
"Expected AttributeRef predicate".to_string(),
)
})?;
if attr_ref.is_derived() {
Ok(acc * DEFAULT_NUM_DISTINCT)
row_cnt *= DEFAULT_NUM_DISTINCT;
} else {
let table_id = attr_ref.table_id();
let attr_idx = attr_ref.attr_index();
let stats_option =
self.get_attribute_comb_stats(table_id, &[attr_idx])?;
self.get_attribute_comb_stats(table_id, &[attr_idx]).await?;

let ndistinct = match stats_option {
Some(stats) => stats.ndistinct,
Expand All @@ -46,15 +48,15 @@ impl<S: CostModelStorageLayer> CostModelImpl<S> {
DEFAULT_NUM_DISTINCT
}
};
Ok(acc * ndistinct)
row_cnt *= ndistinct;
}
}
_ => {
// TODO: Consider the case where `GROUP BY 1`.
panic!("GROUP BY must have attribute ref predicate")
panic!("GROUP BY must have attribute ref predicate");
}
}
})?;
}
Ok(EstimatedStatistic(row_cnt))
}
}
Expand Down
50 changes: 30 additions & 20 deletions optd-cost-model/src/cost/filter/attribute.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ impl<S: CostModelStorageLayer> CostModelImpl<S> {
/// Also, get_attribute_equality_selectivity is a subroutine when computing range
/// selectivity, which is another reason for separating these into two functions
/// `is_eq` selects the operator: `true` means `==`, `false` means `!=`.
pub(crate) fn get_attribute_equality_selectivity(
pub(crate) async fn get_attribute_equality_selectivity(
&self,
table_id: TableId,
attr_base_index: usize,
Expand All @@ -28,8 +28,9 @@ impl<S: CostModelStorageLayer> CostModelImpl<S> {
) -> CostModelResult<f64> {
// TODO: The attribute could be a derived attribute
let ret_sel = {
if let Some(attribute_stats) =
self.get_attribute_comb_stats(table_id, &[attr_base_index])?
if let Some(attribute_stats) = self
.get_attribute_comb_stats(table_id, &[attr_base_index])
.await?
{
let eq_freq =
if let Some(freq) = attribute_stats.mcvs.freq(&vec![Some(value.clone())]) {
Expand Down Expand Up @@ -91,7 +92,7 @@ impl<S: CostModelStorageLayer> CostModelImpl<S> {
}

/// Compute the frequency of values in an attribute less than the given value.
fn get_attribute_lt_value_freq(
async fn get_attribute_lt_value_freq(
&self,
attribute_stats: &AttributeCombValueStats,
table_id: TableId,
Expand All @@ -102,7 +103,9 @@ impl<S: CostModelStorageLayer> CostModelImpl<S> {
// into total_leq_cdf this logic just so happens to be the exact same logic as
// get_attribute_equality_selectivity implements
let ret_freq = Self::get_attribute_leq_value_freq(attribute_stats, value)
- self.get_attribute_equality_selectivity(table_id, attr_base_index, value, true)?;
- self
.get_attribute_equality_selectivity(table_id, attr_base_index, value, true)
.await?;
assert!(
(0.0..=1.0).contains(&ret_freq),
"ret_freq ({}) should be in [0, 1]",
Expand All @@ -116,25 +119,29 @@ impl<S: CostModelStorageLayer> CostModelImpl<S> {
/// Range predicates are handled entirely differently from equality predicates so this is its
/// own function. If it is unable to find the statistics, it returns DEFAULT_INEQ_SEL.
/// The selectivity is computed as quantile of the right bound minus quantile of the left bound.
pub(crate) fn get_attribute_range_selectivity(
pub(crate) async fn get_attribute_range_selectivity(
&self,
table_id: TableId,
attr_base_index: usize,
start: Bound<&Value>,
end: Bound<&Value>,
) -> CostModelResult<f64> {
// TODO: Consider attribute is a derived attribute
if let Some(attribute_stats) =
self.get_attribute_comb_stats(table_id, &[attr_base_index])?
if let Some(attribute_stats) = self
.get_attribute_comb_stats(table_id, &[attr_base_index])
.await?
{
let left_quantile = match start {
Bound::Unbounded => 0.0,
Bound::Included(value) => self.get_attribute_lt_value_freq(
&attribute_stats,
table_id,
attr_base_index,
value,
)?,
Bound::Included(value) => {
self.get_attribute_lt_value_freq(
&attribute_stats,
table_id,
attr_base_index,
value,
)
.await?
}
Bound::Excluded(value) => {
Self::get_attribute_leq_value_freq(&attribute_stats, value)
}
Expand All @@ -144,12 +151,15 @@ impl<S: CostModelStorageLayer> CostModelImpl<S> {
Bound::Included(value) => {
Self::get_attribute_leq_value_freq(&attribute_stats, value)
}
Bound::Excluded(value) => self.get_attribute_lt_value_freq(
&attribute_stats,
table_id,
attr_base_index,
value,
)?,
Bound::Excluded(value) => {
self.get_attribute_lt_value_freq(
&attribute_stats,
table_id,
attr_base_index,
value,
)
.await?
}
};
assert!(
left_quantile <= right_quantile,
Expand Down
42 changes: 28 additions & 14 deletions optd-cost-model/src/cost/filter/comp_op.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@ use crate::{
// compute the selectivity.
stats::{DEFAULT_EQ_SEL, DEFAULT_INEQ_SEL, UNIMPLEMENTED_SEL},
CostModelResult,
SemanticError,
};

impl<S: CostModelStorageLayer> CostModelImpl<S> {
/// Comparison operators are the base case for recursion in get_filter_selectivity()
pub(crate) fn get_comp_op_selectivity(
pub(crate) async fn get_comp_op_selectivity(
&self,
comp_bin_op_typ: BinOpType,
left: ArcPredicateNode,
Expand All @@ -30,8 +31,11 @@ impl<S: CostModelStorageLayer> CostModelImpl<S> {

// I intentionally performed moves on left and right. This way, we don't accidentally use
// them after this block
let (attr_ref_exprs, values, non_attr_ref_exprs, is_left_attr_ref) =
self.get_semantic_nodes(left, right)?;
let semantic_res = self.get_semantic_nodes(left, right).await;
if semantic_res.is_err() {
return Ok(Self::get_default_comparison_op_selectivity(comp_bin_op_typ));
}
let (attr_ref_exprs, values, non_attr_ref_exprs, is_left_attr_ref) = semantic_res.unwrap();

// Handle the different cases of semantic nodes.
if attr_ref_exprs.is_empty() {
Expand All @@ -51,13 +55,17 @@ impl<S: CostModelStorageLayer> CostModelImpl<S> {
match comp_bin_op_typ {
BinOpType::Eq => {
self.get_attribute_equality_selectivity(table_id, attr_ref_idx, value, true)
.await
}
BinOpType::Neq => {
self.get_attribute_equality_selectivity(
table_id,
attr_ref_idx,
value,
false,
)
.await
}
BinOpType::Neq => self.get_attribute_equality_selectivity(
table_id,
attr_ref_idx,
value,
false,
),
BinOpType::Lt | BinOpType::Leq | BinOpType::Gt | BinOpType::Geq => {
let start = match (comp_bin_op_typ, is_left_attr_ref) {
(BinOpType::Lt, true) | (BinOpType::Geq, false) => Bound::Unbounded,
Expand All @@ -74,6 +82,7 @@ impl<S: CostModelStorageLayer> CostModelImpl<S> {
_ => unreachable!("all comparison BinOpTypes were enumerated. this should be unreachable"),
};
self.get_attribute_range_selectivity(table_id, attr_ref_idx, start, end)
.await
}
_ => unreachable!(
"all comparison BinOpTypes were enumerated. this should be unreachable"
Expand Down Expand Up @@ -109,7 +118,7 @@ impl<S: CostModelStorageLayer> CostModelImpl<S> {
/// This is convenient to avoid repeating the same logic just with "left" and "right" swapped.
/// The last return value is true when the input node (left) is an AttributeRefPred.
#[allow(clippy::type_complexity)]
fn get_semantic_nodes(
async fn get_semantic_nodes(
&self,
left: ArcPredicateNode,
right: ArcPredicateNode,
Expand Down Expand Up @@ -175,11 +184,16 @@ impl<S: CostModelStorageLayer> CostModelImpl<S> {
// The "invert" cast is to invert the cast so that we're casting the
// non_cast_node to the attribute's original type.
// TODO(migration): double check
let invert_cast_data_type = &(self
// TODO: Consider attribute info is None.
let attribute_info = self
.storage_manager
.get_attribute_info(table_id, attr_ref_idx as i32)?
.typ
.into_data_type());
.get_attribute_info(table_id, attr_ref_idx as i32)
.await?
.ok_or({
SemanticError::AttributeNotFound(table_id, attr_ref_idx as i32)
})?;

let invert_cast_data_type = &attribute_info.typ.into_data_type();

match non_cast_node.typ {
PredicateType::AttributeRef => {
Expand Down
Loading