|
| 1 | +use std::ops::RangeInclusive; |
| 2 | + |
| 3 | +use crate::{constraint_violation, prelude::StoreError}; |
| 4 | + |
| 5 | +/// A helper to deal with cumulative histograms, also known as ogives. This |
| 6 | +/// implementation is restricted to histograms where each bin has the same |
| 7 | +/// size. As a cumulative function of a histogram, an ogive is a piecewise |
| 8 | +/// linear function `f` and since it is strictly monotonically increasing, |
| 9 | +/// it has an inverse `g`. |
| 10 | +/// |
| 11 | +/// For the given `points`, `f(points[i]) = i * bin_size` and `f` is the |
| 12 | +/// piecewise linear interpolant between those points. The inverse `g` is |
| 13 | +/// the piecewise linear interpolant of `g(i * bin_size) = points[i]`. Note |
| 14 | +/// that that means that `f` divides the y-axis into `points.len()` equal |
| 15 | +/// parts. |
| 16 | +/// |
| 17 | +/// The word 'ogive' is somewhat obscure, but has a lot fewer letters than |
| 18 | +/// 'piecewise linear function'. Copolit also claims that it is also a lot |
| 19 | +/// more fun to say. |
| 20 | +pub struct Ogive { |
| 21 | + /// The breakpoints of the piecewise linear function |
| 22 | + points: Vec<f64>, |
| 23 | + /// The size of each bin; the linear piece from `points[i]` to |
| 24 | + /// `points[i+1]` rises by this much |
| 25 | + bin_size: f64, |
| 26 | + /// The range of the ogive, i.e., the minimum and maximum entries from |
| 27 | + /// points |
| 28 | + range: RangeInclusive<i64>, |
| 29 | +} |
| 30 | + |
| 31 | +impl Ogive { |
| 32 | + /// Create an ogive from a histogram with breaks at the given points and |
| 33 | + /// a total count of `total` entries. As a function, the ogive is 0 at |
| 34 | + /// `points[0]` and `total` at `points[points.len() - 1]`. |
| 35 | + /// |
| 36 | + /// The `points` must have at least one entry. The `points` are sorted |
| 37 | + /// and deduplicated, i.e., they don't have to be in ascending order. |
| 38 | + pub fn from_equi_histogram(mut points: Vec<i64>, total: usize) -> Result<Self, StoreError> { |
| 39 | + if points.is_empty() { |
| 40 | + return Err(constraint_violation!( |
| 41 | + "histogram must have at least one point" |
| 42 | + )); |
| 43 | + } |
| 44 | + |
| 45 | + points.sort_unstable(); |
| 46 | + points.dedup(); |
| 47 | + |
| 48 | + let bins = points.len() - 1; |
| 49 | + let bin_size = total as f64 / bins as f64; |
| 50 | + let range = points[0]..=points[bins]; |
| 51 | + let points = points.into_iter().map(|p| p as f64).collect(); |
| 52 | + Ok(Self { |
| 53 | + points, |
| 54 | + bin_size, |
| 55 | + range, |
| 56 | + }) |
| 57 | + } |
| 58 | + |
| 59 | + pub fn start(&self) -> i64 { |
| 60 | + *self.range.start() |
| 61 | + } |
| 62 | + |
| 63 | + pub fn end(&self) -> i64 { |
| 64 | + *self.range.end() |
| 65 | + } |
| 66 | + |
| 67 | + /// Find the next point `next` such that there are `size` entries |
| 68 | + /// between `point` and `next`, i.e., such that `f(next) - f(point) = |
| 69 | + /// size`. |
| 70 | + /// |
| 71 | + /// It is an error if `point` is smaller than `points[0]`. If `point` is |
| 72 | + /// bigger than `points.last()`, that is returned instead. |
| 73 | + /// |
| 74 | + /// The method calculates `g(f(point) + size)` |
| 75 | + pub fn next_point(&self, point: i64, size: usize) -> Result<i64, StoreError> { |
| 76 | + if point >= *self.range.end() { |
| 77 | + return Ok(*self.range.end()); |
| 78 | + } |
| 79 | + // This can only fail if point < self.range.start |
| 80 | + self.check_in_range(point)?; |
| 81 | + |
| 82 | + let point_value = self.value(point)?; |
| 83 | + let next_value = point_value + size as i64; |
| 84 | + let next_point = self.inverse(next_value)?; |
| 85 | + Ok(next_point) |
| 86 | + } |
| 87 | + |
| 88 | + /// Return the index of the support point immediately preceding `point`. |
| 89 | + /// It is an error if `point` is outside the range of points of this |
| 90 | + /// ogive; this also implies that the returned index is always strictly |
| 91 | + /// less than `self.points.len() - 1` |
| 92 | + fn interval_start(&self, point: i64) -> Result<usize, StoreError> { |
| 93 | + self.check_in_range(point)?; |
| 94 | + |
| 95 | + let point = point as f64; |
| 96 | + let idx = self |
| 97 | + .points |
| 98 | + .iter() |
| 99 | + .position(|&p| point < p) |
| 100 | + .unwrap_or(self.points.len() - 1) |
| 101 | + - 1; |
| 102 | + Ok(idx) |
| 103 | + } |
| 104 | + |
| 105 | + /// Return the value of the ogive at `point`, i.e., `f(point)`. It is an |
| 106 | + /// error if `point` is outside the range of points of this ogive. |
| 107 | + fn value(&self, point: i64) -> Result<i64, StoreError> { |
| 108 | + if self.points.len() == 1 { |
| 109 | + return Ok(*self.range.end()); |
| 110 | + } |
| 111 | + |
| 112 | + let idx = self.interval_start(point)?; |
| 113 | + let bin_size = self.bin_size as f64; |
| 114 | + let (a, b) = (self.points[idx], self.points[idx + 1]); |
| 115 | + let point = point as f64; |
| 116 | + let value = (idx as f64 + (point - a) / (b - a)) * bin_size; |
| 117 | + Ok(value as i64) |
| 118 | + } |
| 119 | + |
| 120 | + /// Return the value of the inverse ogive at `value`, i.e., `g(value)`. |
| 121 | + /// It is an error if `value` is negative. If `value` is greater than |
| 122 | + /// the total count of the ogive, the maximum point of the ogive is |
| 123 | + /// returned. |
| 124 | + fn inverse(&self, value: i64) -> Result<i64, StoreError> { |
| 125 | + let value = value as f64; |
| 126 | + if value < 0.0 { |
| 127 | + return Err(constraint_violation!("value {} can not be negative", value)); |
| 128 | + } |
| 129 | + let idx = (value / self.bin_size) as usize; |
| 130 | + if idx >= self.points.len() - 1 { |
| 131 | + return Ok(*self.range.end()); |
| 132 | + } |
| 133 | + let (a, b) = (self.points[idx] as f64, self.points[idx + 1] as f64); |
| 134 | + let lambda = (value - idx as f64 * self.bin_size) / self.bin_size; |
| 135 | + let x = (1.0 - lambda) * a + lambda * b; |
| 136 | + Ok(x as i64) |
| 137 | + } |
| 138 | + |
| 139 | + fn check_in_range(&self, point: i64) -> Result<(), StoreError> { |
| 140 | + if !self.range.contains(&point) { |
| 141 | + return Err(constraint_violation!( |
| 142 | + "point {} is outside of the range [{}, {}]", |
| 143 | + point, |
| 144 | + self.range.start(), |
| 145 | + self.range.end(), |
| 146 | + )); |
| 147 | + } |
| 148 | + Ok(()) |
| 149 | + } |
| 150 | +} |
| 151 | + |
| 152 | +#[cfg(test)] |
| 153 | +mod tests { |
| 154 | + use super::*; |
| 155 | + |
| 156 | + #[test] |
| 157 | + fn simple() { |
| 158 | + // This is just the linear function y = (70 / 5) * (x - 10) |
| 159 | + let points: Vec<i64> = vec![10, 20, 30, 40, 50, 60]; |
| 160 | + let ogive = Ogive::from_equi_histogram(points, 700).unwrap(); |
| 161 | + |
| 162 | + // The function represented by `points` |
| 163 | + fn f(x: i64) -> i64 { |
| 164 | + 70 * (x - 10) / 5 |
| 165 | + } |
| 166 | + |
| 167 | + // The inverse of `f` |
| 168 | + fn g(x: i64) -> i64 { |
| 169 | + x * 5 / 70 + 10 |
| 170 | + } |
| 171 | + |
| 172 | + // Check that the ogive is correct |
| 173 | + assert_eq!(ogive.bin_size, 700 as f64 / 5 as f64); |
| 174 | + assert_eq!(ogive.range, 10..=60); |
| 175 | + |
| 176 | + // Test value method |
| 177 | + for point in vec![20, 30, 45, 50, 60] { |
| 178 | + assert_eq!(ogive.value(point).unwrap(), f(point), "value for {}", point); |
| 179 | + } |
| 180 | + |
| 181 | + // Test next_point method |
| 182 | + for step in vec![50, 140, 200] { |
| 183 | + for value in vec![10, 20, 30, 35, 45, 50, 60] { |
| 184 | + assert_eq!( |
| 185 | + ogive.next_point(value, step).unwrap(), |
| 186 | + g(f(value) + step as i64).min(60), |
| 187 | + "inverse for {} with step {}", |
| 188 | + value, |
| 189 | + step |
| 190 | + ); |
| 191 | + } |
| 192 | + } |
| 193 | + |
| 194 | + // Exceeding the range caps it at the maximum point |
| 195 | + assert_eq!(ogive.next_point(50, 140).unwrap(), 60); |
| 196 | + assert_eq!(ogive.next_point(50, 500).unwrap(), 60); |
| 197 | + |
| 198 | + // Point to the left of the range should return an error |
| 199 | + assert!(ogive.next_point(9, 140).is_err()); |
| 200 | + // Point to the right of the range gets capped |
| 201 | + assert_eq!(ogive.next_point(61, 140).unwrap(), 60); |
| 202 | + } |
| 203 | + |
| 204 | + #[test] |
| 205 | + fn single_bin() { |
| 206 | + // A histogram with only one bin |
| 207 | + let points: Vec<i64> = vec![10, 20]; |
| 208 | + let ogive = Ogive::from_equi_histogram(points, 700).unwrap(); |
| 209 | + |
| 210 | + // The function represented by `points` |
| 211 | + fn f(x: i64) -> i64 { |
| 212 | + 700 * (x - 10) / 10 |
| 213 | + } |
| 214 | + |
| 215 | + // The inverse of `f` |
| 216 | + fn g(x: i64) -> i64 { |
| 217 | + x * 10 / 700 + 10 |
| 218 | + } |
| 219 | + |
| 220 | + // Check that the ogive is correct |
| 221 | + assert_eq!(ogive.bin_size, 700 as f64 / 1 as f64); |
| 222 | + assert_eq!(ogive.range, 10..=20); |
| 223 | + |
| 224 | + // Test value method |
| 225 | + for point in vec![10, 15, 20] { |
| 226 | + assert_eq!(ogive.value(point).unwrap(), f(point), "value for {}", point); |
| 227 | + } |
| 228 | + |
| 229 | + // Test next_point method |
| 230 | + for step in vec![50, 140, 200] { |
| 231 | + for value in vec![10, 15, 20] { |
| 232 | + assert_eq!( |
| 233 | + ogive.next_point(value, step).unwrap(), |
| 234 | + g(f(value) + step as i64).min(20), |
| 235 | + "inverse for {} with step {}", |
| 236 | + value, |
| 237 | + step |
| 238 | + ); |
| 239 | + } |
| 240 | + } |
| 241 | + |
| 242 | + // Exceeding the range caps it at the maximum point |
| 243 | + assert_eq!(ogive.next_point(20, 140).unwrap(), 20); |
| 244 | + assert_eq!(ogive.next_point(20, 500).unwrap(), 20); |
| 245 | + |
| 246 | + // Point to the left of the range should return an error |
| 247 | + assert!(ogive.next_point(9, 140).is_err()); |
| 248 | + // Point to the right of the range gets capped |
| 249 | + assert_eq!(ogive.next_point(21, 140).unwrap(), 20); |
| 250 | + } |
| 251 | + |
| 252 | + #[test] |
| 253 | + fn one_bin() { |
| 254 | + let points: Vec<i64> = vec![10]; |
| 255 | + let ogive = Ogive::from_equi_histogram(points, 700).unwrap(); |
| 256 | + |
| 257 | + assert_eq!(ogive.next_point(10, 1).unwrap(), 10); |
| 258 | + assert_eq!(ogive.next_point(10, 4).unwrap(), 10); |
| 259 | + assert_eq!(ogive.next_point(15, 1).unwrap(), 10); |
| 260 | + |
| 261 | + assert!(ogive.next_point(9, 1).is_err()); |
| 262 | + } |
| 263 | + |
| 264 | + #[test] |
| 265 | + fn exponential() { |
| 266 | + let points: Vec<i64> = vec![32, 48, 56, 60, 62, 64]; |
| 267 | + let ogive = Ogive::from_equi_histogram(points, 100).unwrap(); |
| 268 | + |
| 269 | + assert_eq!(ogive.value(50).unwrap(), 25); |
| 270 | + assert_eq!(ogive.value(56).unwrap(), 40); |
| 271 | + assert_eq!(ogive.value(58).unwrap(), 50); |
| 272 | + assert_eq!(ogive.value(63).unwrap(), 90); |
| 273 | + |
| 274 | + assert_eq!(ogive.next_point(32, 40).unwrap(), 56); |
| 275 | + assert_eq!(ogive.next_point(50, 10).unwrap(), 54); |
| 276 | + assert_eq!(ogive.next_point(50, 50).unwrap(), 61); |
| 277 | + assert_eq!(ogive.next_point(40, 40).unwrap(), 58); |
| 278 | + } |
| 279 | +} |
0 commit comments