|
| 1 | +use serde::{Deserialize, Serialize}; |
| 2 | + |
| 3 | +use crate::common::values::Value; |
| 4 | + |
| 5 | +pub type ColumnCombValue = Vec<Option<Value>>; /* A list of optional `Value`s; presumably one slot per column of the combination, with `None` standing for NULL (TODO confirm). */ |
| 6 | + |
| 7 | +/// Frequency lookups over the most common values of a column combination. Ideally this trait would |
| 8 | +/// also have trait bounds for Serialize and Deserialize. However, it is unclear how to have Deserialize as a |
| 9 | +/// trait bound and still use the Deserialize derive macro, because Deserialize involves lifetimes. NOTE(review): serde's DeserializeOwned might work as a bound here; TODO confirm. |
| 10 | +pub trait MostCommonValues: 'static + Send + Sync { |
| 11 | + // Three query functions are exposed even though freq() and total_freq() could both be built on freq_over_pred(): |
| 12 | + // - freq() can be O(1) instead of O(n), and it returns None (rather than 0) when the value does not exist; |
| 13 | + // - total_freq() can be cached; |
| 14 | + // - freq_over_pred() is the general form (presumably the combined frequency of the entries that satisfy `pred`; |
| 15 | + //   TODO confirm against the implementors). |
| 16 | + fn freq(&self, value: &ColumnCombValue) -> Option<f64>; |
| 17 | + fn total_freq(&self) -> f64; |
| 18 | + fn freq_over_pred(&self, pred: Box<dyn Fn(&ColumnCombValue) -> bool>) -> f64; |
| 19 | + |
| 20 | + /// Returns the number of entries (i.e. value + freq pairs) in the most common values structure. |
| 21 | + fn cnt(&self) -> usize; |
| 22 | +} |
| 23 | + |
| 24 | +/// An interface that generalizes the role of a histogram. |
| 25 | +/// |
| 26 | +/// Histograms remain compatible with this interface, but it also allows |
| 27 | +/// more powerful statistics such as TDigest. |
| 28 | +/// |
| 29 | +/// Ideally, Distribution would also have trait bounds for Serialize and Deserialize. |
| 30 | +/// However, it is unclear how to have Deserialize as a trait bound and still use |
| 31 | +/// the Deserialize derive macro, because the Deserialize trait involves lifetimes. |
| 32 | +pub trait Distribution: 'static + Send + Sync { |
| 33 | + /// Returns the probability that a random value sampled from the distribution is <= `value`. |
| 34 | + fn cdf(&self, value: &Value) -> f64; |
| 35 | +} |
| 36 | + |
| 37 | +#[derive(Serialize, Deserialize, Debug)] /* Statistics bundle for one column combination: its most common values, an optional distribution, a distinct count and a null share. */ |
| 38 | +pub struct ColumnCombValueStats<M: MostCommonValues, D: Distribution> { |
| 39 | + pub mcvs: M, // Most common values. Does NOT contain full nulls (presumably combinations in which every column is NULL; confirm). |
| 40 | + pub distr: Option<D>, // Distribution of the values outside `mcvs` (does NOT contain mcvs); optional, so it may be None. |
| 41 | + pub ndistinct: u64, // Presumably the number of distinct values (confirm). Does NOT contain full nulls. |
| 42 | + pub null_frac: f64, // Share of full nulls. NOTE(review): the name suggests a 0..1 fraction but the earlier comment said "%"; confirm the scale. |
| 43 | +} |
0 commit comments