|
| 1 | +//! An implementation of the local maximum coverage algorithm |
| 2 | +//! described in the paper "Effective Construction of Relative Lempel-Ziv Dictionaries", |
| 3 | +//! by Liao, Petri, Moffat, and Wirth, published under the University of Melbourne. |
| 4 | +//! |
| 5 | +//! See: <https://people.eng.unimelb.edu.au/ammoffat/abstracts/lpmw16www.pdf> |
| 6 | +//! |
| 7 | +//! Facebook's implementation was also used as a reference. |
| 8 | +//! <https://github.com/facebook/zstd/tree/dev/lib/dictBuilder> |
| 9 | +
|
| 10 | +use super::DictParams; |
| 11 | +use crate::dictionary::frequency::estimate_frequency; |
| 12 | +use core::convert::TryInto; |
| 13 | +use std::collections::HashMap; |
| 14 | +use std::vec::Vec; |
| 15 | + |
| 16 | +/// The size of each k-mer |
| 17 | +pub(super) const K: usize = 16; |
| 18 | + |
| 19 | +///As found under "4: Experiments - Varying k-mer Size" in the original paper, |
| 20 | +/// "when k = 16, across all our text collections, there is a reasonable spread" |
| 21 | +/// |
| 22 | +/// Reasonable range: [6, 16] |
| 23 | +pub(super) type KMer = [u8; K]; |
| 24 | + |
/// A chunk of sample data paired with a score describing how worthwhile it
/// would be to include in the dictionary.
///
/// Equality and ordering look **only** at [`Segment::score`]; the bytes in
/// [`Segment::raw`] are ignored. Two segments with different contents but the
/// same score therefore compare as equal.
#[derive(Debug, Clone)]
pub struct Segment {
    /// The actual contents of the segment.
    pub raw: Vec<u8>,
    /// A measure of how "ideal" a given segment would be to include in the dictionary.
    ///
    /// Higher is better, there's no upper limit. This number is determined by
    /// estimating the number of occurrences in a given epoch.
    pub score: usize,
}

impl PartialEq for Segment {
    fn eq(&self, other: &Self) -> bool {
        // We only really care about score in regards to heap order,
        // so `raw` deliberately does not take part in the comparison.
        self.score == other.score
    }
}

// Sound because `eq` compares plain `usize` scores, which is a total equivalence.
impl Eq for Segment {}

impl PartialOrd for Segment {
    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
        // Delegate to `Ord` so both orderings can never disagree.
        Some(self.cmp(other))
    }
}

impl Ord for Segment {
    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
        // Higher score sorts greater.
        self.score.cmp(&other.score)
    }
}
| 55 | + |
| 56 | +/// A re-usable allocation containing large allocations |
| 57 | +/// that are used multiple times during dictionary construction (once per epoch) |
| 58 | +pub struct Context { |
| 59 | + /// Keeps track of the number of occurances of a particular k-mer within an epoch. |
| 60 | + /// |
| 61 | + /// Reset for each epoch. |
| 62 | + pub frequencies: HashMap<KMer, usize>, |
| 63 | +} |
| 64 | + |
| 65 | +/// Returns the highest scoring segment in an epoch |
| 66 | +/// as a slice of that epoch. |
| 67 | +pub fn pick_best_segment( |
| 68 | + params: &DictParams, |
| 69 | + ctx: &mut Context, |
| 70 | + collection_sample: &'_ [u8], |
| 71 | +) -> Segment { |
| 72 | + let mut segments = collection_sample |
| 73 | + .chunks(params.segment_size as usize) |
| 74 | + .peekable(); |
| 75 | + let mut best_segment: &[u8] = segments.peek().expect("at least one segment"); |
| 76 | + let mut top_segment_score: usize = 0; |
| 77 | + // Iterate over segments and score each segment, keeping track of the best segment |
| 78 | + for segment in segments { |
| 79 | + let segment_score = score_segment(ctx, collection_sample, segment); |
| 80 | + if segment_score > top_segment_score { |
| 81 | + best_segment = segment; |
| 82 | + top_segment_score = segment_score; |
| 83 | + } |
| 84 | + } |
| 85 | + |
| 86 | + Segment { |
| 87 | + raw: best_segment.into(), |
| 88 | + score: top_segment_score, |
| 89 | + } |
| 90 | +} |
| 91 | + |
| 92 | +/// Given a segment, compute the score (or usefulness) of that segment against the entire epoch. |
| 93 | +/// |
| 94 | +/// `score_segment` modifies `ctx.frequencies`. |
| 95 | +fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) -> usize { |
| 96 | + let mut segment_score = 0; |
| 97 | + // Determine the score of each overlapping k-mer |
| 98 | + for i in 0..(segment.len() - K - 1) { |
| 99 | + let kmer: &KMer = (&segment[i..i + K]) |
| 100 | + .try_into() |
| 101 | + .expect("Failed to make kmer"); |
| 102 | + // if the kmer is already in the pool, it recieves a score of zero |
| 103 | + if ctx.frequencies.contains_key(kmer) { |
| 104 | + continue; |
| 105 | + } |
| 106 | + let kmer_score = estimate_frequency(kmer, collection_sample); |
| 107 | + ctx.frequencies.insert(*kmer, kmer_score); |
| 108 | + segment_score += kmer_score; |
| 109 | + } |
| 110 | + |
| 111 | + segment_score |
| 112 | +} |
| 113 | + |
| 114 | +/// Computes the number of epochs and the size of each epoch. |
| 115 | +/// |
| 116 | +/// Returns a (number of epochs, epoch size) tuple. |
| 117 | +/// |
| 118 | +/// A translation of `COVER_epoch_info_t COVER_computeEpochs()` from facebook/zstd. |
| 119 | +pub fn compute_epoch_info( |
| 120 | + params: &DictParams, |
| 121 | + max_dict_size: usize, |
| 122 | + num_kmers: usize, |
| 123 | +) -> (usize, usize) { |
| 124 | + let min_epoch_size = 10_000; // 10 KiB |
| 125 | + let mut num_epochs: usize = usize::max(1, max_dict_size / params.segment_size as usize); |
| 126 | + let mut epoch_size: usize = num_kmers / num_epochs; |
| 127 | + if epoch_size >= min_epoch_size { |
| 128 | + assert!(epoch_size * num_epochs <= num_kmers); |
| 129 | + return (num_epochs, epoch_size); |
| 130 | + } |
| 131 | + epoch_size = usize::min(min_epoch_size, num_kmers); |
| 132 | + num_epochs = num_kmers / epoch_size; |
| 133 | + (num_epochs, epoch_size) |
| 134 | +} |
0 commit comments