Commit 2cebcfc

Dictionary Creation (#91)
* refactor: move compress_fastest to a new file
* sync
* feat(dict): bare structure of dictionary creation
* .
* .
* dict: more scaffolding for file processing
* dict: rudimentary implementation
* sync
* dict: rudimentary implementation complete
* dict: pre-clippy auto apply
* refactor: specify raw content dictionary creation
* lint: fixing clippy
* docs: update readme.md to include dict builder
* docs: include some rustdoc metadata
* lint: fixing clippy
* pr(cleanup): apply feedback from pull/91
  - Fix typo in cargo.toml
  - set VERBOSE to false and add a test to verify it's false
  - remove commented out bench code from zstd_dict.rs

Co-authored-by: arc <zleyyij@users.noreply.github.com>
1 parent fb92a10 commit 2cebcfc

File tree

19 files changed (+648 lines, −30 lines)


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@
 **/*.rs.bk
 Cargo.lock
 /local_corpus_files
+/local_dict_corpus_files
 /orig-zstd
 fuzz_decodecorpus
 perf.data*

Cargo.toml

Lines changed: 12 additions & 1 deletion
@@ -12,6 +12,10 @@ readme = "Readme.md"
 keywords = ["zstd", "zstandard", "decompression"]
 categories = ["compression"]
 
+[package.metadata.docs.rs]
+all-features = true
+rustdoc-args = ["--cfg", "docsrs"]
+
 [dependencies]
 twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"], optional = true }
 
@@ -20,17 +24,20 @@ twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"]
 compiler_builtins = { version = "0.1.2", optional = true }
 core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" }
 alloc = { version = "1.0.0", optional = true, package = "rustc-std-workspace-alloc" }
+fastrand = "2.3.0"
+
 
 [dev-dependencies]
 criterion = "0.5"
 rand = { version = "0.8.5", features = ["small_rng"] }
-zstd = "0.13.2"
+zstd = { version = "0.13.2", features = ["zstdmt"]}
 
 [features]
 default = ["hash", "std"]
 hash = ["dep:twox-hash"]
 fuzz_exports = []
 std = []
+dict_builder = ["std"]
 
 # Internal feature, only used when building as part of libstd, not part of the
 # stable interface of this crate.
@@ -47,3 +54,7 @@ required-features = ["std"]
 [[bin]]
 name = "zstd_stream"
 required-features = ["std"]
+
+[[bin]]
+name = "zstd_dict"
+required-features = ["std", "dict_builder"]

Readme.md

Lines changed: 32 additions & 5 deletions
@@ -15,22 +15,49 @@ This crate is currently actively maintained.
 
 # Current Status
 
-Feature complete on the decoder side.
+## Decompression
+The `decoding` module provides a complete
+implementation of a Zstandard decompressor.
+
+In terms of speed, `ruzstd` is behind the original C implementation
+which has a rust binding located [here](https://github.com/gyscos/zstd-rs).
+
+Measuring with the 'time' utility the original zstd and my decoder both
+decoding the same enwik9.zst file from a ramfs, my decoder is about 3.5
+times slower. Enwik9 is highly compressible, for less compressible data
+(like a ubuntu installation .iso) my decoder comes close to only being
+1.4 times slower.
 
+## Compression
 On the compression side:
 - Support for generating compressed blocks at any compression level
   - [x] Uncompressed
   - [x] Fastest (roughly level 1)
   - [ ] Default (roughly level 3)
   - [ ] Better (roughly level 7)
   - [ ] Best (roughly level 11)
-- [ ] Checksums
+- [x] Checksums
 - [ ] Dictionaries
 
-## Speed
-In terms of speed this library is behind the original C implementation which has a rust binding located [here](https://github.com/gyscos/zstd-rs).
+## Dictionary Generation
+When the `dict_builder` feature is enabled, the `dictionary` module
+provides the ability to create new dictionaries.
+
+On the `github-users` sample set, our implementation benchmarks within
+0.2% of the official implementation (as of commit
+`09e52d07340acdb2e13817b066e8be6e424f7258`):
+```no_build
+uncompressed: 100.00% (7484607 bytes)
+no dict: 34.99% of original size (2618872 bytes)
+reference dict: 16.16% of no dict size (2195672 bytes smaller)
+our dict: 16.28% of no dict size (2192400 bytes smaller)
+```
+
+The dictionary generator only provides support for creating "raw
+content" dictionaries. Tagged dictionaries are currently unsupported.
 
-Measuring with the 'time' utility the original zstd and my decoder both decoding the same enwik9.zst file from a ramfs, my decoder is about 3.5 times slower. Enwik9 is highly compressible, for less compressible data (like a ubuntu installation .iso) my decoder comes close to only being 1.4 times slower.
+See <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format>
+for clarification.
 
 
 # How can you use it?
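
To make the new Readme section concrete, here is a minimal sketch of building a raw content dictionary programmatically. It is modeled on the `create_raw_dict_from_source` call in the `zstd_dict` binary added later in this commit; the exact trait bounds, return type, and the unit of the size argument are not visible in this diff, so treat those details as assumptions.

```rust
// Sketch only: mirrors the call shape used in src/bin/zstd_dict.rs.
// Build with the `dict_builder` feature enabled.
use ruzstd::dictionary::create_raw_dict_from_source;
use std::fs::File;

fn main() -> std::io::Result<()> {
    // Concatenated training samples (hypothetical file name).
    let source = File::open("samples.bin")?;
    let source_size = source.metadata()?.len() as usize;
    let mut output = File::create("raw_content.dict")?;
    // Request a ~16 KiB raw content dictionary (size unit assumed to be bytes).
    create_raw_dict_from_source(source, source_size, &mut output, 16 * 1024);
    Ok(())
}
```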

src/bin/zstd.rs

Lines changed: 2 additions & 1 deletion
@@ -21,7 +21,7 @@ struct StateTracker {
     file_size: u64,
     old_percentage: i8,
 }
-
+#[allow(unused)]
 fn decompress(flags: &[String], file_paths: &[String]) {
     if !flags.contains(&"-d".to_owned()) {
         eprintln!("This zstd implementation only supports decompression. Please add a \"-d\" flag");
@@ -128,6 +128,7 @@ fn decompress(flags: &[String], file_paths: &[String]) {
     }
 }
 
+#[allow(unused)]
 struct PercentPrintReader<R: Read> {
     total: usize,
     counter: usize,

src/bin/zstd_dict.rs

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+use ruzstd::dictionary::{create_raw_dict_from_dir, create_raw_dict_from_source};
+use std::env::args;
+use std::fs::File;
+use std::path::Path;
+
+fn main() {
+    let args: Vec<String> = args().collect();
+    let input_path: &Path = args.get(1).expect("no input provided").as_ref();
+    let output_path: &Path = args.get(2).expect("no output path provided").as_ref();
+    let dict_size = args
+        .get(3)
+        .expect("no dict size provided (kb)")
+        .parse::<usize>()
+        .expect("dict size was not a valid num");
+
+    let mut output = File::create(output_path).unwrap();
+    if input_path.is_file() {
+        let source = File::open(input_path).expect("unable to open input path");
+        let source_size = source.metadata().unwrap().len();
+        create_raw_dict_from_source(source, source_size as usize, &mut output, dict_size);
+    } else {
+        create_raw_dict_from_dir(input_path, &mut output, dict_size).unwrap();
+    }
+}
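
With the argument order above (`<input> <output> <dict_size>`), a run over a sample directory would look like `cargo run --features dict_builder --bin zstd_dict -- ./local_dict_corpus_files out.dict 112640`. Note that the third argument's `expect` message says "(kb)", but the parsed value is handed to the builder unchanged, so its actual unit depends on how the `dictionary` module interprets it (not visible in this excerpt).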

src/bit_io/bit_reader.rs

Lines changed: 3 additions & 3 deletions
@@ -66,7 +66,7 @@ impl<'s> BitReader<'s> {
 
         let mut bit_shift = bits_left_in_current_byte; //this many bits are already set in value
 
-        assert!(self.idx % 8 == 0);
+        assert!(self.idx.is_multiple_of(8));
 
         //collect full bytes
         for _ in 0..full_bytes_needed {
@@ -116,7 +116,7 @@ impl core::fmt::Display for GetBitsError {
             } => {
                 write!(
                     f,
-                    "Cant serve this request. The reader is limited to {limit} bits, requested {num_requested_bits} bits",
+                    "Cant serve this request. The reader is limited to {limit} bits, requested {num_requested_bits} bits"
                 )
             }
             GetBitsError::NotEnoughRemainingBits {
@@ -125,7 +125,7 @@ impl core::fmt::Display for GetBitsError {
             } => {
                 write!(
                     f,
-                    "Can\'t read {requested} bits, only have {remaining} bits left",
+                    "Can\'t read {requested} bits, only have {remaining} bits left"
                 )
             }
         }

src/bit_io/bit_writer.rs

Lines changed: 5 additions & 5 deletions
@@ -45,7 +45,7 @@ impl<V: AsMut<Vec<u8>>> BitWriter<V> {
 
     /// Reset to an index. Currently only supports resetting to a byte aligned index
    pub fn reset_to(&mut self, index: usize) {
-        assert!(index % 8 == 0);
+        assert!(index.is_multiple_of(8));
         self.partial = 0;
         self.bits_in_partial = 0;
         self.bit_idx = index;
@@ -66,7 +66,7 @@ impl<V: AsMut<Vec<u8>>> BitWriter<V> {
 
         // We might be changing bits unaligned to byte borders.
         // This means the lower bits of the first byte we are touching must stay the same
-        if idx % 8 != 0 {
+        if !idx.is_multiple_of(8) {
             // How many (upper) bits will change in the first byte?
             let bits_in_first_byte = 8 - (idx % 8);
             // We don't support only changing a few bits in the middle of a byte
@@ -82,7 +82,7 @@ impl<V: AsMut<Vec<u8>>> BitWriter<V> {
             idx += bits_in_first_byte;
         }
 
-        assert!(idx % 8 == 0);
+        assert!(idx.is_multiple_of(8));
         // We are now byte aligned, change idx to byte resolution
         let mut idx = idx / 8;
 
@@ -113,7 +113,7 @@ impl<V: AsMut<Vec<u8>>> BitWriter<V> {
 
     /// Flush temporary internal buffers to the output buffer. Only works if this is currently byte aligned
     pub fn flush(&mut self) {
-        assert!(self.bits_in_partial % 8 == 0);
+        assert!(self.bits_in_partial.is_multiple_of(8));
         let full_bytes = self.bits_in_partial / 8;
         self.output
             .as_mut()
@@ -204,7 +204,7 @@ impl<V: AsMut<Vec<u8>>> BitWriter<V> {
     /// Returns how many bits are missing for an even byte
     pub fn misaligned(&self) -> usize {
         let idx = self.index();
-        if idx % 8 == 0 {
+        if idx.is_multiple_of(8) {
             0
         } else {
             8 - (idx % 8)

src/dictionary/cover.rs

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
+//! An implementation of the local maximum coverage algorithm
+//! described in the paper "Effective Construction of Relative Lempel-Ziv Dictionaries",
+//! by Liao, Petri, Moffat, and Wirth, published under the University of Melbourne.
+//!
+//! See: <https://people.eng.unimelb.edu.au/ammoffat/abstracts/lpmw16www.pdf>
+//!
+//! Facebook's implementation was also used as a reference.
+//! <https://github.com/facebook/zstd/tree/dev/lib/dictBuilder>
+
+use super::DictParams;
+use crate::dictionary::frequency::estimate_frequency;
+use core::convert::TryInto;
+use std::collections::HashMap;
+use std::vec::Vec;
+
+/// The size of each k-mer.
+///
+/// As found under "4: Experiments - Varying k-mer Size" in the original paper,
+/// "when k = 16, across all our text collections, there is a reasonable spread".
+/// Reasonable range: [6, 16]
+pub(super) const K: usize = 16;
+
+pub(super) type KMer = [u8; K];
+
+pub struct Segment {
+    /// The actual contents of the segment.
+    pub raw: Vec<u8>,
+    /// A measure of how "ideal" a given segment would be to include in the dictionary.
+    ///
+    /// Higher is better; there's no upper limit. This number is determined by
+    /// estimating the number of occurrences in a given epoch.
+    pub score: usize,
+}
+
+impl Eq for Segment {}
+
+impl PartialEq for Segment {
+    fn eq(&self, other: &Self) -> bool {
+        // We only really care about score in regard to heap order
+        self.score == other.score
+    }
+}
+
+impl PartialOrd for Segment {
+    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for Segment {
+    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
+        self.score.cmp(&other.score)
+    }
+}
+
+/// A re-usable set of large allocations
+/// that are used multiple times during dictionary construction (once per epoch).
+pub struct Context {
+    /// Keeps track of the number of occurrences of a particular k-mer within an epoch.
+    ///
+    /// Reset for each epoch.
+    pub frequencies: HashMap<KMer, usize>,
+}
+
+/// Returns the highest scoring segment in an epoch
+/// as a slice of that epoch.
+pub fn pick_best_segment(
+    params: &DictParams,
+    ctx: &mut Context,
+    collection_sample: &'_ [u8],
+) -> Segment {
+    let mut segments = collection_sample
+        .chunks(params.segment_size as usize)
+        .peekable();
+    let mut best_segment: &[u8] = segments.peek().expect("at least one segment");
+    let mut top_segment_score: usize = 0;
+    // Iterate over segments and score each segment, keeping track of the best segment
+    for segment in segments {
+        let segment_score = score_segment(ctx, collection_sample, segment);
+        if segment_score > top_segment_score {
+            best_segment = segment;
+            top_segment_score = segment_score;
+        }
+    }
+
+    Segment {
+        raw: best_segment.into(),
+        score: top_segment_score,
+    }
+}
+
+/// Given a segment, compute the score (or usefulness) of that segment against the entire epoch.
+///
+/// `score_segment` modifies `ctx.frequencies`.
+fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) -> usize {
+    let mut segment_score = 0;
+    // Determine the score of each overlapping k-mer
+    for i in 0..(segment.len() - K - 1) {
+        let kmer: &KMer = (&segment[i..i + K])
+            .try_into()
+            .expect("Failed to make kmer");
+        // If the k-mer is already in the pool, it receives a score of zero
+        if ctx.frequencies.contains_key(kmer) {
+            continue;
+        }
+        let kmer_score = estimate_frequency(kmer, collection_sample);
+        ctx.frequencies.insert(*kmer, kmer_score);
+        segment_score += kmer_score;
+    }
+
+    segment_score
+}
+
+/// Computes the number of epochs and the size of each epoch.
+///
+/// Returns a (number of epochs, epoch size) tuple.
+///
+/// A translation of `COVER_epoch_info_t COVER_computeEpochs()` from facebook/zstd.
+pub fn compute_epoch_info(
+    params: &DictParams,
+    max_dict_size: usize,
+    num_kmers: usize,
+) -> (usize, usize) {
+    let min_epoch_size = 10_000; // 10 KB
+    let mut num_epochs: usize = usize::max(1, max_dict_size / params.segment_size as usize);
+    let mut epoch_size: usize = num_kmers / num_epochs;
+    if epoch_size >= min_epoch_size {
+        assert!(epoch_size * num_epochs <= num_kmers);
+        return (num_epochs, epoch_size);
+    }
+    epoch_size = usize::min(min_epoch_size, num_kmers);
+    num_epochs = num_kmers / epoch_size;
+    (num_epochs, epoch_size)
+}
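
This module exposes the two COVER primitives, epoch partitioning (`compute_epoch_info`) and per-epoch selection (`pick_best_segment`), but not the driver that stitches epochs into a dictionary; that lives elsewhere in this commit. As a worked example of the epoch math: with `max_dict_size = 112_640` and `params.segment_size = 2048`, `num_epochs = 55` and `epoch_size = num_kmers / 55`. A hypothetical driver, assuming it sits in this same module and that `DictParams` carries the `segment_size` field used above, might look like:

```rust
// Hypothetical driver sketch; the commit's actual driver may differ.
fn build_dict_sketch(params: &DictParams, data: &[u8], max_dict_size: usize) -> Vec<u8> {
    // Every window of K consecutive bytes is one k-mer.
    let num_kmers = data.len().saturating_sub(K - 1);
    let (num_epochs, epoch_size) = compute_epoch_info(params, max_dict_size, num_kmers);

    let mut ctx = Context {
        frequencies: HashMap::new(),
    };
    let mut dict = Vec::with_capacity(max_dict_size);

    // Each epoch is a disjoint slice of the input; take its locally best segment
    // until the dictionary budget is exhausted.
    for epoch in 0..num_epochs {
        let start = epoch * epoch_size;
        let end = usize::min(start + epoch_size, data.len());
        let best = pick_best_segment(params, &mut ctx, &data[start..end]);
        if dict.len() + best.raw.len() > max_dict_size {
            break;
        }
        dict.extend_from_slice(&best.raw);
        // `Context::frequencies` is documented as reset per epoch.
        ctx.frequencies.clear();
    }
    dict
}
```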
