Skip to content

Commit 281d81f

Browse files
committed
introduce ColumnCombValueStats
1 parent 0bf1717 commit 281d81f

File tree

4 files changed

+46
-1
lines changed

4 files changed

+46
-1
lines changed

optd-cost-model/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

optd-cost-model/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,4 @@ arrow-schema = "53.2.0"
1111
datafusion-expr = "32.0.0"
1212
ordered-float = "4.0"
1313
chrono = "0.4"
14-
14+
serde_json = "1.0"

optd-cost-model/src/cost/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ pub mod agg;
44
pub mod filter;
55
pub mod join;
66
pub mod limit;
7+
pub mod stats;

optd-cost-model/src/cost/stats.rs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
use serde::{Deserialize, Serialize};
2+
3+
use crate::common::values::Value;
4+
5+
/// A list of optional `Value`s. Presumably one slot per column of a column combination, with
/// `None` standing for NULL — TODO confirm.
pub type ColumnCombValue = Vec<Option<Value>>;
6+
7+
/// Interface to a most-common-values structure holding (value, frequency) entries. Ideally, it would
8+
/// have Serialize and Deserialize trait bounds, but it is unresolved how to combine a Deserialize
9+
/// bound with the Deserialize derive macro, because the Deserialize trait involves lifetimes.
10+
pub trait MostCommonValues: 'static + Send + Sync {
11+
// Design note: we could expose only freq_over_pred() and derive freq() and total_freq()
12+
// from it. However, freq() and total_freq() each have potential optimizations: freq()
13+
// can be O(1) instead of O(n), and total_freq() can be cached.
14+
// Additionally, it makes sense for freq() to return an Option<f64> instead of just 0 if the
15+
// value doesn't exist. Thus, three different functions are exposed.
16+
fn freq(&self, value: &ColumnCombValue) -> Option<f64>; // `None` when `value` is not present.
17+
fn total_freq(&self) -> f64; // Presumably the summed frequency of all entries — TODO confirm.
18+
fn freq_over_pred(&self, pred: Box<dyn Fn(&ColumnCombValue) -> bool>) -> f64; // Presumably the summed frequency of entries for which `pred` returns true — TODO confirm.
19+
20+
/// Returns the number of entries (i.e. value + freq) in the most common values structure.
21+
fn cnt(&self) -> usize;
22+
}
23+
24+
/// A general interface for statistics that perform the task of a histogram.
25+
///
26+
/// It remains compatible with histograms while also allowing
27+
/// more powerful statistics such as TDigest.
28+
///
29+
/// Ideally, Distribution would have trait bounds for Serialize and Deserialize.
30+
/// However, it is unresolved how to both have Deserialize as a trait bound and use
31+
/// the Deserialize derive macro, because the Deserialize trait involves lifetimes.
32+
pub trait Distribution: 'static + Send + Sync {
33+
/// Returns the probability that a random value sampled from the distribution is <= `value`.
34+
fn cdf(&self, value: &Value) -> f64;
35+
}
36+
37+
/// Statistics bundle: a most-common-values structure, an optional distribution, a distinct count
/// (`ndistinct`), and a null fraction (`null_frac`).
///
/// NOTE(review): the derives presumably only apply when `M` and `D` themselves implement
/// Serialize/Deserialize/Debug, since neither trait requires them — confirm against the impls.
#[derive(Serialize, Deserialize, Debug)]
38+
pub struct ColumnCombValueStats<M: MostCommonValues, D: Distribution> {
39+
pub mcvs: M, // Most common values; does NOT contain full nulls.
40+
pub distr: Option<D>, // Distribution; does NOT contain the values in `mcvs`. Optional (may be `None`).
41+
pub ndistinct: u64, // Distinct count; does NOT contain full nulls.
42+
pub null_frac: f64, // Share of full nulls (originally noted as "%"; presumably a fraction in [0, 1] — TODO confirm).
43+
}

0 commit comments

Comments
 (0)