Skip to content

Commit 91deb07

Browse files
authored
feat(cost-model): migrate adv-stats (#33)
1 parent 21d01ae commit 91deb07

File tree

7 files changed

+899
-4
lines changed

7 files changed

+899
-4
lines changed

optd-cost-model/Cargo.lock

Lines changed: 103 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

optd-cost-model/Cargo.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,15 @@ edition = "2021"
66
[dependencies]
77
optd-persistent = { path = "../optd-persistent", version = "0.1" }
88
serde = { version = "1.0", features = ["derive"] }
9+
serde_json = "1.0"
10+
serde_with = { version = "3.7.0", features = ["json"] }
911
arrow-schema = "53.2.0"
1012
datafusion-expr = "32.0.0"
1113
ordered-float = "4.0"
1214
chrono = "0.4"
15+
itertools = "0.13"
16+
lazy_static = "1.5"
1317

18+
[dev-dependencies]
19+
crossbeam = "0.8"
20+
rand = "0.8"

optd-cost-model/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use optd_persistent::cost_model::interface::{Stat, StatType};
77
pub mod common;
88
pub mod cost;
99
pub mod cost_model;
10+
pub mod stats;
1011
pub mod storage;
1112

1213
pub enum StatValue {
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
//! This module provides an encoder that converts alpha-numeric strings
2+
//! into f64 values, designed to maintain the natural ordering of strings.
3+
//!
4+
//! While the encoding is theoretically lossless, in practice, it may suffer
5+
//! from precision loss due to floating-point errors.
6+
//!
7+
//! Characters outside the supported printable-ASCII set all share the last slot of the ordering,
8+
//! rendering them indistinguishable from one another in this context.
9+
10+
use std::collections::HashMap;
11+
12+
// TODO: Use lazy cell instead of lazy static.
13+
use lazy_static::lazy_static;
14+
15+
// The alphanumerical ordering.
16+
/// The characters the encoder can tell apart, listed from the one that sorts
/// first to the one that sorts last: space and ASCII punctuation, then digits,
/// then uppercase letters, then lowercase letters.
#[rustfmt::skip]
const ALPHANUMERIC_ORDER: [char; 95] = [
    // Space and punctuation.
    ' ', '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
    ':', ';', '<', '=', '>', '?', '@',
    '[', '\\', ']', '^', '_', '`',
    '{', '|', '}', '~',
    // Digits.
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // Uppercase letters.
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // Lowercase letters.
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
];
23+
24+
const PMF: f64 = 1.0 / (ALPHANUMERIC_ORDER.len() as f64);
25+
26+
lazy_static! {
27+
static ref CDF: HashMap<char, f64> = {
28+
let length = ALPHANUMERIC_ORDER.len() + 1; // To account for non-alpha-numeric characters.
29+
let mut cdf = HashMap::with_capacity(length);
30+
for (index, &char) in ALPHANUMERIC_ORDER.iter().enumerate() {
31+
cdf.insert(char, (index as f64) / (length as f64));
32+
}
33+
cdf
34+
};
35+
}
36+
37+
pub fn encode(string: &str) -> f64 {
38+
let mut left = 0.0;
39+
// 10_000.0 is fairly arbitrary. don't make it f64::MAX though because it causes overflow in
40+
// other places of the code
41+
let mut right = 10_000.0;
42+
43+
for char in string.chars() {
44+
let cdf = CDF.get(&char).unwrap_or(&1.0);
45+
let distance = right - left;
46+
right = left + distance * (cdf + PMF);
47+
left += distance * cdf;
48+
}
49+
50+
left
51+
}
52+
53+
// Start of unit testing section.
54+
#[cfg(test)]
mod tests {
    use super::encode;

    /// Asserts that `smaller` encodes to a strictly lower value than `larger`,
    /// reporting both encoded values on failure.
    fn assert_encodes_below(smaller: &str, larger: &str) {
        let (low, high) = (encode(smaller), encode(larger));
        assert!(
            low < high,
            "expected encode({:?}) = {} < encode({:?}) = {}",
            smaller,
            low,
            larger,
            high
        );
    }

    /// The numeric order of the encoded values follows the order of the strings.
    #[test]
    fn encoding_preserves_string_order() {
        let ordered_pairs = [
            ("", "abc"),
            ("abc", "bcd"),
            ("a", "aaa"),
            ("!a", "a!"),
            ("Alexis", "Schlomer"),
            ("Gungnir Rules!", "Schlomer"),
        ];

        for (smaller, larger) in ordered_pairs {
            assert_encodes_below(smaller, larger);
        }
    }

    /// Encoding the same string twice yields the same value.
    #[test]
    fn encoding_is_deterministic() {
        for string in [" ", "Same"] {
            assert_eq!(encode(string), encode(string));
        }
    }

    /// Characters that are not part of the supported set rank after the
    /// supported ones and are indistinguishable from one another.
    #[test]
    fn unsupported_characters_rank_last_and_collapse() {
        assert_encodes_below("Nicolas ", "Nicolas💰💼");
        assert_eq!(encode("Nicolas💰"), encode("Nicolas💼"));
    }
}

0 commit comments

Comments
 (0)