Skip to content
Closed
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
c92a3af
Partial migration of filter
lanlou1554 Nov 14, 2024
7b0158c
Change col to attr
lanlou1554 Nov 14, 2024
81f8d50
implement cost computation for limit
xx01cyx Nov 14, 2024
2a5740e
add author
xx01cyx Nov 14, 2024
f7f6857
introduce ColumnCombValueStats
xx01cyx Nov 14, 2024
be430ac
refactor AttributeCombValueStats and introduce statistic-related data…
xx01cyx Nov 14, 2024
089cfef
Change col to attr in filter
lanlou1554 Nov 14, 2024
69607f1
Complete partial implementation of filter
lanlou1554 Nov 14, 2024
6518e00
Add get_attribute_comb_stats
lanlou1554 Nov 14, 2024
59a8889
Finish first draft version of filter functionality
lanlou1554 Nov 14, 2024
5070a78
Add comment for the guideline of re-designing PredicateNode
lanlou1554 Nov 15, 2024
740ab11
introduce IdPred and make AttributeRefPred store table id and attr index
xx01cyx Nov 15, 2024
85cd0d1
add get method for id pred and add comments
xx01cyx Nov 15, 2024
7775b88
add check for derived column in AttributeRefPred
xx01cyx Nov 15, 2024
b60c632
make get_attributes_comb_statistics return Option
xx01cyx Nov 15, 2024
3646eca
implement agg cost computation
xx01cyx Nov 15, 2024
db555ff
move filter-related constants to stats crate
xx01cyx Nov 15, 2024
64f4a10
fix clippy
xx01cyx Nov 15, 2024
cafd01c
Resolve the optional comb stats, remove table id in filter
lanlou1554 Nov 15, 2024
5c5a40f
Refactor filter implementation
lanlou1554 Nov 15, 2024
dd6598a
Resolve conflict with main
lanlou1554 Nov 16, 2024
03b6ec3
Refactor cost model storage
lanlou1554 Nov 16, 2024
a3b8088
Move storage attribute to mod
lanlou1554 Nov 16, 2024
c07b9fc
Add initial test framework in cost_model.rs
lanlou1554 Nov 16, 2024
86f6fc2
Fix typo in initial test framework
lanlou1554 Nov 16, 2024
2c1f09b
Modify initial test framework
lanlou1554 Nov 17, 2024
ebab829
Finish most tests for filter
lanlou1554 Nov 17, 2024
a8f92c3
Finish all tests for filter
lanlou1554 Nov 17, 2024
2c9240f
Add important tricky todo
lanlou1554 Nov 17, 2024
d6e1825
Improve filter tests
lanlou1554 Nov 17, 2024
082f0be
refine test infra
xx01cyx Nov 17, 2024
e183f02
add test for cost model agg
xx01cyx Nov 17, 2024
0059141
make all data types u64 instead of usize
xx01cyx Nov 17, 2024
303d73c
merge main and resolve conflicts
xx01cyx Nov 18, 2024
ec0afa6
copy paste join cardinality calculation
xx01cyx Nov 18, 2024
6d50843
make join compile
xx01cyx Nov 18, 2024
a4ff526
rename col -> attr
xx01cyx Nov 18, 2024
0ba4132
refactor join to not pass in logical props
xx01cyx Nov 18, 2024
ab15f05
make statistics f64 instead of u64
xx01cyx Nov 18, 2024
b682c73
split join into multiple files
xx01cyx Nov 18, 2024
5d73141
reorganize join
xx01cyx Nov 18, 2024
51f917d
refine test infra
xx01cyx Nov 18, 2024
5197090
add test infra for join
xx01cyx Nov 18, 2024
68b2885
refine mock interface
xx01cyx Nov 18, 2024
36b93b9
make CostModelStorageManagerImpl::get_attribute_info unimplemented
xx01cyx Nov 18, 2024
11a3a4e
modify MemoExt interface
xx01cyx Nov 18, 2024
8c4191f
rename AttrRefPred -> AttrIndexPred and revert back to initial design
xx01cyx Nov 18, 2024
1569fc5
Modify the tests of filter and agg
lanlou1554 Nov 19, 2024
489ff48
add join test
xx01cyx Nov 19, 2024
be71afb
pass group id to join and fix filter-related tests
xx01cyx Nov 19, 2024
624d040
fix all join tests
xx01cyx Nov 19, 2024
f8a0e70
Change filter controller name
lanlou1554 Nov 19, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
665 changes: 662 additions & 3 deletions Cargo.lock

Large diffs are not rendered by default.

675 changes: 656 additions & 19 deletions optd-cost-model/Cargo.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions optd-cost-model/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
name = "optd-cost-model"
version = "0.1.0"
edition = "2021"
authors = ["Yuanxin Cao", "Lan Lou", "Kunle Li"]

[dependencies]
optd-persistent = { path = "../optd-persistent", version = "0.1" }
Expand All @@ -10,6 +11,7 @@ serde_json = "1.0"
serde_with = { version = "3.7.0", features = ["json"] }
arrow-schema = "53.2.0"
datafusion-expr = "32.0.0"
datafusion = "32.0.0"
ordered-float = "4.0"
chrono = "0.4"
itertools = "0.13"
Expand Down
29 changes: 28 additions & 1 deletion optd-cost-model/src/common/nodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ pub enum PredicateType {
Constant(ConstantType),
AttributeRef,
ExternAttributeRef,
// TODO(lanlou): Id -> Id(IdType)
Id,
UnOp(UnOpType),
BinOp(BinOpType),
LogOp(LogOpType),
Expand All @@ -77,7 +79,7 @@ pub struct PredicateNode {
/// A generic predicate node type
pub typ: PredicateType,
/// Child predicate nodes, always materialized
pub children: Vec<PredicateNode>,
pub children: Vec<ArcPredicateNode>,
/// Data associated with the predicate, if any
pub data: Option<Value>,
}
Expand All @@ -94,3 +96,28 @@ impl std::fmt::Display for PredicateNode {
write!(f, ")")
}
}

impl PredicateNode {
    /// Returns a clone of the child handle stored at position `idx`.
    ///
    /// # Panics
    ///
    /// Panics if `idx` is out of bounds for `children`.
    pub fn child(&self, idx: usize) -> ArcPredicateNode {
        let handle = &self.children[idx];
        handle.clone()
    }

    /// Returns a clone of the data attached to this node.
    ///
    /// # Panics
    ///
    /// Panics if the node carries no data (`data` is `None`).
    pub fn unwrap_data(&self) -> Value {
        self.data.as_ref().unwrap().clone()
    }
}
/// Conversion between a typed predicate wrapper and the generic [`ArcPredicateNode`].
///
/// Implementors must be `Clone` and `'static`.
pub trait ReprPredicateNode: 'static + Clone {
/// Consumes `self` and returns it as a generic [`ArcPredicateNode`].
fn into_pred_node(self) -> ArcPredicateNode;

/// Attempts to interpret `pred_node` as `Self`, returning `None` when it cannot be.
fn from_pred_node(pred_node: ArcPredicateNode) -> Option<Self>;
}

/// Identity implementation: an [`ArcPredicateNode`] is its own representation.
impl ReprPredicateNode for ArcPredicateNode {
/// Returns `self` unchanged.
fn into_pred_node(self) -> ArcPredicateNode {
self
}

/// Always succeeds: wraps `pred_node` in `Some` without inspecting it.
fn from_pred_node(pred_node: ArcPredicateNode) -> Option<Self> {
Some(pred_node)
}
}
69 changes: 69 additions & 0 deletions optd-cost-model/src/common/predicates/attr_ref_pred.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
use crate::common::{
nodes::{ArcPredicateNode, PredicateNode, PredicateType, ReprPredicateNode},
types::TableId,
};

use super::id_pred::IdPred;

/// [`AttributeRefPred`] represents a reference to an attribute (column) of a relation.
///
/// The wrapped node has two children:
/// 1. The table id, represented by an [`IdPred`].
/// 2. The index of the attribute, represented by an [`IdPred`].
///
/// **TODO**: For now we assume that every [`IdPred`] is the same as the ids used in the
/// ORM layer.
///
/// Currently, [`AttributeRefPred`] only holds base table attributes, i.e. attributes
/// that already exist in the table. More complex structures may be introduced in the
/// future to represent derived attributes (e.g. t.v1 + t.v2).
///
/// TODO: Support derived attributes in `AttributeRefPred`.
/// Proposal: the data field can store the attribute kind (base or derived).
#[derive(Clone, Debug)]
pub struct AttributeRefPred(pub ArcPredicateNode);

impl AttributeRefPred {
    /// Builds an attribute reference from a table id and an attribute index.
    ///
    /// Both arguments are stored as [`IdPred`] children (table id first, attribute index
    /// second); the node itself carries no data.
    pub fn new(table_id: usize, attribute_idx: usize) -> AttributeRefPred {
        let table_id_node = IdPred::new(table_id).into_pred_node();
        let attr_idx_node = IdPred::new(attribute_idx).into_pred_node();
        let node = PredicateNode {
            typ: PredicateType::AttributeRef,
            children: vec![table_id_node, attr_idx_node],
            data: None,
        };
        AttributeRefPred(node.into())
    }

    /// Gets the table id, read from the data of the first child.
    ///
    /// # Panics
    ///
    /// Panics if the first child is missing or carries no data.
    pub fn table_id(&self) -> TableId {
        let id_node = self.0.child(0);
        let raw_id = id_node.data.as_ref().unwrap().as_u64();
        TableId(raw_id as usize)
    }

    /// Gets the attribute index, read from the data of the second child.
    /// Note: The attribute index is the **base** index, which is table specific.
    ///
    /// # Panics
    ///
    /// Panics if the second child is missing or carries no data.
    pub fn attr_index(&self) -> usize {
        let idx_node = self.0.child(1);
        let raw_idx = idx_node.data.as_ref().unwrap().as_u64();
        raw_idx as usize
    }

    /// Reports whether the attribute is a derived attribute. Derived attributes are not
    /// supported yet, so this currently always returns `false`.
    pub fn is_derived(&self) -> bool {
        false
    }
}

impl ReprPredicateNode for AttributeRefPred {
    /// Unwraps the underlying generic predicate node.
    fn into_pred_node(self) -> ArcPredicateNode {
        self.0
    }

    /// Wraps `pred_node` when its type is [`PredicateType::AttributeRef`]; otherwise
    /// returns `None`.
    fn from_pred_node(pred_node: ArcPredicateNode) -> Option<Self> {
        if pred_node.typ == PredicateType::AttributeRef {
            Some(Self(pred_node))
        } else {
            None
        }
    }
}
49 changes: 49 additions & 0 deletions optd-cost-model/src/common/predicates/cast_pred.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
use arrow_schema::DataType;

use crate::common::nodes::{ArcPredicateNode, PredicateNode, PredicateType, ReprPredicateNode};

use super::data_type_pred::DataTypePred;

/// [`CastPred`] casts a column from one data type to another.
///
/// The wrapped node has two children:
/// 1. The original data to cast.
/// 2. The target data type to cast to, stored as a [`DataTypePred`].
#[derive(Clone, Debug)]
pub struct CastPred(pub ArcPredicateNode);

impl CastPred {
    /// Builds a cast of `child` to the data type `cast_to`.
    pub fn new(child: ArcPredicateNode, cast_to: DataType) -> Self {
        let target_type_node = DataTypePred::new(cast_to).into_pred_node();
        let node = PredicateNode {
            typ: PredicateType::Cast,
            children: vec![child, target_type_node],
            data: None,
        };
        Self(node.into())
    }

    /// Returns the predicate being cast (the first child).
    pub fn child(&self) -> ArcPredicateNode {
        self.0.child(0)
    }

    /// Returns the target data type (read from the second child).
    ///
    /// # Panics
    ///
    /// Panics if the second child cannot be interpreted as a [`DataTypePred`].
    pub fn cast_to(&self) -> DataType {
        let type_node = self.0.child(1);
        DataTypePred::from_pred_node(type_node).unwrap().data_type()
    }
}

impl ReprPredicateNode for CastPred {
    /// Unwraps the underlying generic predicate node.
    fn into_pred_node(self) -> ArcPredicateNode {
        self.0
    }

    /// Wraps `pred_node` when its type is [`PredicateType::Cast`]; otherwise returns `None`.
    fn from_pred_node(pred_node: ArcPredicateNode) -> Option<Self> {
        if matches!(pred_node.typ, PredicateType::Cast) {
            Some(Self(pred_node))
        } else {
            None
        }
    }
}
199 changes: 199 additions & 0 deletions optd-cost-model/src/common/predicates/constant_pred.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
use std::sync::Arc;

use arrow_schema::{DataType, IntervalUnit};
use optd_persistent::cost_model::interface::AttrType;
use serde::{Deserialize, Serialize};

use crate::common::{
nodes::{ArcPredicateNode, PredicateNode, PredicateType, ReprPredicateNode},
values::{SerializableOrderedF64, Value},
};

/// TODO: documentation
#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug, Serialize, Deserialize)]
pub enum ConstantType {
Expand All @@ -19,3 +28,193 @@ pub enum ConstantType {
Decimal,
Binary,
}

impl ConstantType {
pub fn get_data_type_from_value(value: &Value) -> Self {
match value {
Value::Bool(_) => ConstantType::Bool,
Value::String(_) => ConstantType::Utf8String,
Value::UInt8(_) => ConstantType::UInt8,
Value::UInt16(_) => ConstantType::UInt16,
Value::UInt32(_) => ConstantType::UInt32,
Value::UInt64(_) => ConstantType::UInt64,
Value::Int8(_) => ConstantType::Int8,
Value::Int16(_) => ConstantType::Int16,
Value::Int32(_) => ConstantType::Int32,
Value::Int64(_) => ConstantType::Int64,
Value::Float(_) => ConstantType::Float64,
Value::Date32(_) => ConstantType::Date,
_ => unimplemented!("get_data_type_from_value() not implemented for value {value}"),
}
}

// TODO: current DataType and ConstantType are not 1 to 1 mapping
// optd schema stores constantType from data type in catalog.get
// for decimal128, the precision is lost
pub fn from_data_type(data_type: DataType) -> Self {
match data_type {
DataType::Binary => ConstantType::Binary,
DataType::Boolean => ConstantType::Bool,
DataType::UInt8 => ConstantType::UInt8,
DataType::UInt16 => ConstantType::UInt16,
DataType::UInt32 => ConstantType::UInt32,
DataType::UInt64 => ConstantType::UInt64,
DataType::Int8 => ConstantType::Int8,
DataType::Int16 => ConstantType::Int16,
DataType::Int32 => ConstantType::Int32,
DataType::Int64 => ConstantType::Int64,
DataType::Float64 => ConstantType::Float64,
DataType::Date32 => ConstantType::Date,
DataType::Interval(IntervalUnit::MonthDayNano) => ConstantType::IntervalMonthDateNano,
DataType::Utf8 => ConstantType::Utf8String,
DataType::Decimal128(_, _) => ConstantType::Decimal,
_ => unimplemented!("no conversion to ConstantType for DataType {data_type}"),
}
}

pub fn into_data_type(&self) -> DataType {
match self {
ConstantType::Binary => DataType::Binary,
ConstantType::Bool => DataType::Boolean,
ConstantType::UInt8 => DataType::UInt8,
ConstantType::UInt16 => DataType::UInt16,
ConstantType::UInt32 => DataType::UInt32,
ConstantType::UInt64 => DataType::UInt64,
ConstantType::Int8 => DataType::Int8,
ConstantType::Int16 => DataType::Int16,
ConstantType::Int32 => DataType::Int32,
ConstantType::Int64 => DataType::Int64,
ConstantType::Float64 => DataType::Float64,
ConstantType::Date => DataType::Date32,
ConstantType::IntervalMonthDateNano => DataType::Interval(IntervalUnit::MonthDayNano),
ConstantType::Decimal => DataType::Float64,
ConstantType::Utf8String => DataType::Utf8,
}
}

pub fn from_persistent_attr_type(attr_type: AttrType) -> Self {
match attr_type {
AttrType::Integer => ConstantType::Int32,
AttrType::Float => ConstantType::Float64,
AttrType::Varchar => ConstantType::Utf8String,
AttrType::Boolean => ConstantType::Bool,
}
}
}

/// [`ConstantPred`] is a leaf predicate holding a constant value.
///
/// The wrapped node has type [`PredicateType::Constant`], no children, and the constant
/// itself in its data field.
#[derive(Clone, Debug)]
pub struct ConstantPred(pub ArcPredicateNode);

impl ConstantPred {
    /// Builds a constant whose [`ConstantType`] is inferred from the variant of `value`.
    ///
    /// # Panics
    ///
    /// Panics if [`ConstantType::get_data_type_from_value`] does not support `value`.
    pub fn new(value: Value) -> Self {
        let inferred = ConstantType::get_data_type_from_value(&value);
        Self::new_with_type(value, inferred)
    }

    /// Builds a constant from `value` with an explicitly chosen type `typ`.
    pub fn new_with_type(value: Value, typ: ConstantType) -> Self {
        let node = PredicateNode {
            typ: PredicateType::Constant(typ),
            children: Vec::new(),
            data: Some(value),
        };
        Self(node.into())
    }

    /// Builds a boolean constant.
    pub fn bool(value: bool) -> Self {
        Self::new_with_type(Value::Bool(value), ConstantType::Bool)
    }

    /// Builds a UTF-8 string constant from anything that can be viewed as a `str`.
    pub fn string(value: impl AsRef<str>) -> Self {
        let text = Value::String(value.as_ref().into());
        Self::new_with_type(text, ConstantType::Utf8String)
    }

    /// Builds an unsigned 8-bit integer constant.
    pub fn uint8(value: u8) -> Self {
        Self::new_with_type(Value::UInt8(value), ConstantType::UInt8)
    }

    /// Builds an unsigned 16-bit integer constant.
    pub fn uint16(value: u16) -> Self {
        Self::new_with_type(Value::UInt16(value), ConstantType::UInt16)
    }

    /// Builds an unsigned 32-bit integer constant.
    pub fn uint32(value: u32) -> Self {
        Self::new_with_type(Value::UInt32(value), ConstantType::UInt32)
    }

    /// Builds an unsigned 64-bit integer constant.
    pub fn uint64(value: u64) -> Self {
        Self::new_with_type(Value::UInt64(value), ConstantType::UInt64)
    }

    /// Builds a signed 8-bit integer constant.
    pub fn int8(value: i8) -> Self {
        Self::new_with_type(Value::Int8(value), ConstantType::Int8)
    }

    /// Builds a signed 16-bit integer constant.
    pub fn int16(value: i16) -> Self {
        Self::new_with_type(Value::Int16(value), ConstantType::Int16)
    }

    /// Builds a signed 32-bit integer constant.
    pub fn int32(value: i32) -> Self {
        Self::new_with_type(Value::Int32(value), ConstantType::Int32)
    }

    /// Builds a signed 64-bit integer constant.
    pub fn int64(value: i64) -> Self {
        Self::new_with_type(Value::Int64(value), ConstantType::Int64)
    }

    /// Builds a month-day-nano interval constant; the value is stored as a 128-bit integer.
    pub fn interval_month_day_nano(value: i128) -> Self {
        Self::new_with_type(Value::Int128(value), ConstantType::IntervalMonthDateNano)
    }

    /// Builds a 64-bit floating point constant.
    pub fn float64(value: f64) -> Self {
        let ordered = SerializableOrderedF64(value.into());
        Self::new_with_type(Value::Float(ordered), ConstantType::Float64)
    }

    /// Builds a date constant; the value is stored as a 64-bit integer.
    pub fn date(value: i64) -> Self {
        Self::new_with_type(Value::Int64(value), ConstantType::Date)
    }

    /// Builds a decimal constant; the value is stored as a 64-bit float.
    pub fn decimal(value: f64) -> Self {
        let ordered = SerializableOrderedF64(value.into());
        Self::new_with_type(Value::Float(ordered), ConstantType::Decimal)
    }

    /// Builds a binary constant from serialized bytes.
    pub fn serialized(value: Arc<[u8]>) -> Self {
        Self::new_with_type(Value::Serialized(value), ConstantType::Binary)
    }

    /// Gets the constant value.
    ///
    /// # Panics
    ///
    /// Panics if the wrapped node carries no data.
    pub fn value(&self) -> Value {
        self.0.data.as_ref().unwrap().clone()
    }

    /// Gets the [`ConstantType`] recorded in the node's predicate type.
    ///
    /// # Panics
    ///
    /// Panics if the wrapped node is not of type [`PredicateType::Constant`].
    pub fn constant_type(&self) -> ConstantType {
        match self.0.typ {
            PredicateType::Constant(typ) => typ,
            _ => panic!("not a constant"),
        }
    }
}

impl ReprPredicateNode for ConstantPred {
    /// Unwraps the underlying generic predicate node.
    fn into_pred_node(self) -> ArcPredicateNode {
        self.0
    }

    /// Wraps `rel_node` when its type is [`PredicateType::Constant`] (of any constant
    /// type); otherwise returns `None`.
    fn from_pred_node(rel_node: ArcPredicateNode) -> Option<Self> {
        if !matches!(rel_node.typ, PredicateType::Constant(_)) {
            return None;
        }
        Some(Self(rel_node))
    }
}
Loading