Skip to content

Commit a0860d4

Browse files
committed
graph: Add utility for handling cumulative histograms
1 parent 229d95e commit a0860d4

File tree

2 files changed

+281
-0
lines changed

2 files changed

+281
-0
lines changed

graph/src/util/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ pub mod error;
1212

1313
pub mod stats;
1414

15+
pub mod ogive;
16+
1517
pub mod cache_weight;
1618

1719
pub mod timed_rw_lock;

graph/src/util/ogive.rs

Lines changed: 279 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,279 @@
1+
use std::ops::RangeInclusive;
2+
3+
use crate::{constraint_violation, prelude::StoreError};
4+
5+
/// A helper to deal with cumulative histograms, also known as ogives. This
6+
/// implementation is restricted to histograms where each bin has the same
7+
/// size. As a cumulative function of a histogram, an ogive is a piecewise
8+
/// linear function `f` and since it is strictly monotonically increasing,
9+
/// it has an inverse `g`.
10+
///
11+
/// For the given `points`, `f(points[i]) = i * bin_size` and `f` is the
12+
/// piecewise linear interpolant between those points. The inverse `g` is
13+
/// the piecewise linear interpolant of `g(i * bin_size) = points[i]`. Note
14+
/// that that means that `f` divides the y-axis into `points.len()` equal
15+
/// parts.
16+
///
17+
/// The word 'ogive' is somewhat obscure, but has a lot fewer letters than
18+
/// 'piecewise linear function'. Copolit also claims that it is also a lot
19+
/// more fun to say.
20+
pub struct Ogive {
21+
/// The breakpoints of the piecewise linear function
22+
points: Vec<f64>,
23+
/// The size of each bin; the linear piece from `points[i]` to
24+
/// `points[i+1]` rises by this much
25+
bin_size: f64,
26+
/// The range of the ogive, i.e., the minimum and maximum entries from
27+
/// points
28+
range: RangeInclusive<i64>,
29+
}
30+
31+
impl Ogive {
32+
/// Create an ogive from a histogram with breaks at the given points and
33+
/// a total count of `total` entries. As a function, the ogive is 0 at
34+
/// `points[0]` and `total` at `points[points.len() - 1]`.
35+
///
36+
/// The `points` must have at least one entry. The `points` are sorted
37+
/// and deduplicated, i.e., they don't have to be in ascending order.
38+
pub fn from_equi_histogram(mut points: Vec<i64>, total: usize) -> Result<Self, StoreError> {
39+
if points.is_empty() {
40+
return Err(constraint_violation!(
41+
"histogram must have at least one point"
42+
));
43+
}
44+
45+
points.sort_unstable();
46+
points.dedup();
47+
48+
let bins = points.len() - 1;
49+
let bin_size = total as f64 / bins as f64;
50+
let range = points[0]..=points[bins];
51+
let points = points.into_iter().map(|p| p as f64).collect();
52+
Ok(Self {
53+
points,
54+
bin_size,
55+
range,
56+
})
57+
}
58+
59+
pub fn start(&self) -> i64 {
60+
*self.range.start()
61+
}
62+
63+
pub fn end(&self) -> i64 {
64+
*self.range.end()
65+
}
66+
67+
/// Find the next point `next` such that there are `size` entries
68+
/// between `point` and `next`, i.e., such that `f(next) - f(point) =
69+
/// size`.
70+
///
71+
/// It is an error if `point` is smaller than `points[0]`. If `point` is
72+
/// bigger than `points.last()`, that is returned instead.
73+
///
74+
/// The method calculates `g(f(point) + size)`
75+
pub fn next_point(&self, point: i64, size: usize) -> Result<i64, StoreError> {
76+
if point >= *self.range.end() {
77+
return Ok(*self.range.end());
78+
}
79+
// This can only fail if point < self.range.start
80+
self.check_in_range(point)?;
81+
82+
let point_value = self.value(point)?;
83+
let next_value = point_value + size as i64;
84+
let next_point = self.inverse(next_value)?;
85+
Ok(next_point)
86+
}
87+
88+
/// Return the index of the support point immediately preceding `point`.
89+
/// It is an error if `point` is outside the range of points of this
90+
/// ogive; this also implies that the returned index is always strictly
91+
/// less than `self.points.len() - 1`
92+
fn interval_start(&self, point: i64) -> Result<usize, StoreError> {
93+
self.check_in_range(point)?;
94+
95+
let point = point as f64;
96+
let idx = self
97+
.points
98+
.iter()
99+
.position(|&p| point < p)
100+
.unwrap_or(self.points.len() - 1)
101+
- 1;
102+
Ok(idx)
103+
}
104+
105+
/// Return the value of the ogive at `point`, i.e., `f(point)`. It is an
106+
/// error if `point` is outside the range of points of this ogive.
107+
fn value(&self, point: i64) -> Result<i64, StoreError> {
108+
if self.points.len() == 1 {
109+
return Ok(*self.range.end());
110+
}
111+
112+
let idx = self.interval_start(point)?;
113+
let bin_size = self.bin_size as f64;
114+
let (a, b) = (self.points[idx], self.points[idx + 1]);
115+
let point = point as f64;
116+
let value = (idx as f64 + (point - a) / (b - a)) * bin_size;
117+
Ok(value as i64)
118+
}
119+
120+
/// Return the value of the inverse ogive at `value`, i.e., `g(value)`.
121+
/// It is an error if `value` is negative. If `value` is greater than
122+
/// the total count of the ogive, the maximum point of the ogive is
123+
/// returned.
124+
fn inverse(&self, value: i64) -> Result<i64, StoreError> {
125+
let value = value as f64;
126+
if value < 0.0 {
127+
return Err(constraint_violation!("value {} can not be negative", value));
128+
}
129+
let idx = (value / self.bin_size) as usize;
130+
if idx >= self.points.len() - 1 {
131+
return Ok(*self.range.end());
132+
}
133+
let (a, b) = (self.points[idx] as f64, self.points[idx + 1] as f64);
134+
let lambda = (value - idx as f64 * self.bin_size) / self.bin_size;
135+
let x = (1.0 - lambda) * a + lambda * b;
136+
Ok(x as i64)
137+
}
138+
139+
fn check_in_range(&self, point: i64) -> Result<(), StoreError> {
140+
if !self.range.contains(&point) {
141+
return Err(constraint_violation!(
142+
"point {} is outside of the range [{}, {}]",
143+
point,
144+
self.range.start(),
145+
self.range.end(),
146+
));
147+
}
148+
Ok(())
149+
}
150+
}
151+
152+
#[cfg(test)]
153+
mod tests {
154+
use super::*;
155+
156+
#[test]
157+
fn simple() {
158+
// This is just the linear function y = (70 / 5) * (x - 10)
159+
let points: Vec<i64> = vec![10, 20, 30, 40, 50, 60];
160+
let ogive = Ogive::from_equi_histogram(points, 700).unwrap();
161+
162+
// The function represented by `points`
163+
fn f(x: i64) -> i64 {
164+
70 * (x - 10) / 5
165+
}
166+
167+
// The inverse of `f`
168+
fn g(x: i64) -> i64 {
169+
x * 5 / 70 + 10
170+
}
171+
172+
// Check that the ogive is correct
173+
assert_eq!(ogive.bin_size, 700 as f64 / 5 as f64);
174+
assert_eq!(ogive.range, 10..=60);
175+
176+
// Test value method
177+
for point in vec![20, 30, 45, 50, 60] {
178+
assert_eq!(ogive.value(point).unwrap(), f(point), "value for {}", point);
179+
}
180+
181+
// Test next_point method
182+
for step in vec![50, 140, 200] {
183+
for value in vec![10, 20, 30, 35, 45, 50, 60] {
184+
assert_eq!(
185+
ogive.next_point(value, step).unwrap(),
186+
g(f(value) + step as i64).min(60),
187+
"inverse for {} with step {}",
188+
value,
189+
step
190+
);
191+
}
192+
}
193+
194+
// Exceeding the range caps it at the maximum point
195+
assert_eq!(ogive.next_point(50, 140).unwrap(), 60);
196+
assert_eq!(ogive.next_point(50, 500).unwrap(), 60);
197+
198+
// Point to the left of the range should return an error
199+
assert!(ogive.next_point(9, 140).is_err());
200+
// Point to the right of the range gets capped
201+
assert_eq!(ogive.next_point(61, 140).unwrap(), 60);
202+
}
203+
204+
#[test]
205+
fn single_bin() {
206+
// A histogram with only one bin
207+
let points: Vec<i64> = vec![10, 20];
208+
let ogive = Ogive::from_equi_histogram(points, 700).unwrap();
209+
210+
// The function represented by `points`
211+
fn f(x: i64) -> i64 {
212+
700 * (x - 10) / 10
213+
}
214+
215+
// The inverse of `f`
216+
fn g(x: i64) -> i64 {
217+
x * 10 / 700 + 10
218+
}
219+
220+
// Check that the ogive is correct
221+
assert_eq!(ogive.bin_size, 700 as f64 / 1 as f64);
222+
assert_eq!(ogive.range, 10..=20);
223+
224+
// Test value method
225+
for point in vec![10, 15, 20] {
226+
assert_eq!(ogive.value(point).unwrap(), f(point), "value for {}", point);
227+
}
228+
229+
// Test next_point method
230+
for step in vec![50, 140, 200] {
231+
for value in vec![10, 15, 20] {
232+
assert_eq!(
233+
ogive.next_point(value, step).unwrap(),
234+
g(f(value) + step as i64).min(20),
235+
"inverse for {} with step {}",
236+
value,
237+
step
238+
);
239+
}
240+
}
241+
242+
// Exceeding the range caps it at the maximum point
243+
assert_eq!(ogive.next_point(20, 140).unwrap(), 20);
244+
assert_eq!(ogive.next_point(20, 500).unwrap(), 20);
245+
246+
// Point to the left of the range should return an error
247+
assert!(ogive.next_point(9, 140).is_err());
248+
// Point to the right of the range gets capped
249+
assert_eq!(ogive.next_point(21, 140).unwrap(), 20);
250+
}
251+
252+
#[test]
253+
fn one_bin() {
254+
let points: Vec<i64> = vec![10];
255+
let ogive = Ogive::from_equi_histogram(points, 700).unwrap();
256+
257+
assert_eq!(ogive.next_point(10, 1).unwrap(), 10);
258+
assert_eq!(ogive.next_point(10, 4).unwrap(), 10);
259+
assert_eq!(ogive.next_point(15, 1).unwrap(), 10);
260+
261+
assert!(ogive.next_point(9, 1).is_err());
262+
}
263+
264+
#[test]
265+
fn exponential() {
266+
let points: Vec<i64> = vec![32, 48, 56, 60, 62, 64];
267+
let ogive = Ogive::from_equi_histogram(points, 100).unwrap();
268+
269+
assert_eq!(ogive.value(50).unwrap(), 25);
270+
assert_eq!(ogive.value(56).unwrap(), 40);
271+
assert_eq!(ogive.value(58).unwrap(), 50);
272+
assert_eq!(ogive.value(63).unwrap(), 90);
273+
274+
assert_eq!(ogive.next_point(32, 40).unwrap(), 56);
275+
assert_eq!(ogive.next_point(50, 10).unwrap(), 54);
276+
assert_eq!(ogive.next_point(50, 50).unwrap(), 61);
277+
assert_eq!(ogive.next_point(40, 40).unwrap(), 58);
278+
}
279+
}

0 commit comments

Comments
 (0)