diff --git a/Cargo.toml b/Cargo.toml index 7547f1b..312f46d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,6 @@ members = [ "crates/*", "crates/bpe/benchmarks", "crates/bpe/tests", - "crates/hash-sorted-map/benchmarks", ] resolver = "2" diff --git a/Makefile b/Makefile index 1a19adb..742e077 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,7 @@ build: .PHONY: build-js build-js: + which wasm-pack || cargo install wasm-pack npm --prefix crates/string-offsets/js install npm --prefix crates/string-offsets/js run compile diff --git a/README.md b/README.md index ae3acce..0dbe85f 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ A collection of useful algorithms written in Rust. Currently contains: - [`geo_filters`](crates/geo_filters): probabilistic data structures that solve the [Distinct Count Problem](https://en.wikipedia.org/wiki/Count-distinct_problem) using geometric filters. - [`bpe`](crates/bpe): fast, correct, and novel algorithms for the [Byte Pair Encoding Algorithm](https://en.wikipedia.org/wiki/Large_language_model#BPE) which are particularly useful for chunking of documents. - [`bpe-openai`](crates/bpe-openai): Fast tokenizers for OpenAI token sets based on the `bpe` crate. +- [`sparse-ngrams`](crates/sparse-ngrams): fast sparse n-gram extraction from byte slices. Selects variable-length n-grams (2–8 bytes) deterministically using bigram frequency priorities, suitable for substring search indexes. - [`string-offsets`](crates/string-offsets): converts string positions between bytes, chars, UTF-16 code units, and line numbers. Useful when sending string indices across language boundaries. ## Background diff --git a/crates/sparse-ngrams/Cargo.toml b/crates/sparse-ngrams/Cargo.toml new file mode 100644 index 0000000..80ca0f7 --- /dev/null +++ b/crates/sparse-ngrams/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "sparse-ngrams" +version = "0.1.0" +edition = "2021" +description = "Fast sparse n-gram extraction from byte slices." 
+repository = "https://github.com/github/rust-gems" +license = "MIT" +keywords = ["ngram", "algorithm", "search", "index"] +categories = ["algorithms", "data-structures", "text-processing"] + +[lib] +bench = false + +[[bench]] +name = "performance" +path = "benchmarks/performance.rs" +harness = false + +[dev-dependencies] +criterion = "0.7" diff --git a/crates/sparse-ngrams/README.md b/crates/sparse-ngrams/README.md new file mode 100644 index 0000000..1772382 --- /dev/null +++ b/crates/sparse-ngrams/README.md @@ -0,0 +1,88 @@ +# sparse-ngrams + +Fast sparse n-gram extraction from byte slices. + +Sparse grams select variable-length n-grams (2–8 bytes) without extracting all possible substrings. The algorithm is deterministic: the same extraction logic applies to every substring, making it suitable for substring search indexes. + +For background, see: +- [The technology behind GitHub's new code search](https://github.blog/engineering/architecture-optimization/the-technology-behind-githubs-new-code-search/#fn-69904-bignote) +- [Sparse n-grams: smarter trigram selection](https://cursor.com/blog/fast-regex-search#sparse-n-grams-smarter-trigram-selection) + +## Caveats + +The integrated bigram table contains only lowercase ASCII bigrams. Callers should lowercase and normalize input before extraction (e.g. fold uppercase to lowercase, map non-ASCII bytes to a single sentinel value). This makes the implementation suitable for case-insensitive search indexes. + +## How it works + +Each consecutive byte pair (bigram) is assigned a frequency-based priority from a precomputed table. An n-gram boundary occurs wherever a bigram has lower priority than all bigrams between it and the previous boundary. This is computed efficiently using a monotone deque or a scan-based approach. + +For a document of N bytes, this produces at most 3(N−1) n-grams: N−1 bigrams, plus up to 2(N−1) algorithmically selected longer n-grams (up to 8 bytes). 
+ +### Selection criterion + +A substring of length 3–8 is emitted as a sparse n-gram if and only if every interior bigram priority is strictly greater than the maximum of the left and right boundary bigram priorities. + +## Usage + +```rust +use sparse_ngrams::{collect_sparse_grams, NGram, MAX_SPARSE_GRAM_SIZE}; + +let input = b"hello world"; +let grams = collect_sparse_grams(input); +for gram in &grams { + assert!(gram.len() >= 2); + assert!(gram.len() <= MAX_SPARSE_GRAM_SIZE as usize); +} +``` + +## Performance + +Benchmarks on an Apple M1 (15 KB input, `lib.rs` source file): + +| Variant | Throughput | +|---------|-----------| +| `deque` | ~3.5 GB/s | +| `scan` | ~4.9 GB/s | + +The `scan` variant is ~40% faster than the deque variant by replacing the monotone deque with a fixed-size circular buffer and a suffix-minimum scan. + +## Bigram table size + +The priority table maps byte pairs to frequency-based priorities. Increasing the table size (number of ranked bigrams) produces more distinct longer n-grams, but saturates quickly: + +![Unique n-grams vs. table size](images/unique_ngrams_vs_table_size.png) + +| Table size | Unique n-grams | % of max | +|-----------|-----------------|----------| +| 100 | 5.8M | 77.0% | +| 200 | 6.4M | 84.4% | +| 400 | 6.8M | 90.2% | +| 800 | 7.3M | 96.0% | +| 1,600 | 7.5M | 99.2% | +| 3,200 | 7.6M | 99.9% | +| 5,845 | 7.6M | 100% | + +The current bigram table contains the 5,845 most frequent bigrams from a large code corpus. +The table saturates quickly — the first ~1,600 bigrams already capture 99% of the unique n-grams. + +## Maximum n-gram length + +Increasing the maximum n-gram length produces more unique longer grams, with diminishing returns: + +![Unique n-grams vs. max length](images/unique_ngrams_vs_max_length.png) + +| Max length | Unique n-grams | vs. 
len=8 | +|-----------|---------------|-----------| +| 2 | 1.2M | 16% | +| 3 | 4.1M | 54% | +| 4 | 5.3M | 70% | +| 6 | 6.8M | 89% | +| 8 | 7.6M | 100% | +| 12 | 8.5M | 113% | +| 16 | 9.1M | 120% | +| 24 | 9.7M | 128% | +| 32 | 10.1M | 133% | +| 48 | 10.4M | 137% | +| 64 | 10.5M | 139% | + +The default of 8 captures most of the discriminative power. Going to 16 adds ~20% more unique grams but doubles the scan window; going to 64 adds only ~39% total. diff --git a/crates/sparse-ngrams/benchmarks/performance.rs b/crates/sparse-ngrams/benchmarks/performance.rs new file mode 100644 index 0000000..ac2b2c3 --- /dev/null +++ b/crates/sparse-ngrams/benchmarks/performance.rs @@ -0,0 +1,37 @@ +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use sparse_ngrams::{ + collect_sparse_grams_deque, collect_sparse_grams_scan, max_sparse_grams, NGram, +}; + +fn bench_collect(c: &mut Criterion) { + let inputs: Vec<(&str, Vec)> = vec![ + ("small_11B", b"hello world".to_vec()), + ( + "medium_900B", + "the quick brown fox jumps over the lazy dog. 
" + .repeat(20) + .into_bytes(), + ), + ( + "large_15KB", + include_str!("../src/lib.rs").as_bytes().to_vec(), + ), + ]; + + let mut group = c.benchmark_group("collect"); + for (name, input) in &inputs { + let mut buf = vec![NGram::from_bytes(b"xx"); max_sparse_grams(input.len())]; + group.throughput(Throughput::Bytes(input.len() as u64)); + + group.bench_with_input(BenchmarkId::new("deque", name), input, |b, input| { + b.iter(|| collect_sparse_grams_deque(black_box(input), &mut buf)) + }); + group.bench_with_input(BenchmarkId::new("scan", name), input, |b, input| { + b.iter(|| collect_sparse_grams_scan(black_box(input), &mut buf)) + }); + } + group.finish(); +} + +criterion_group!(benches, bench_collect); +criterion_main!(benches); diff --git a/crates/sparse-ngrams/images/unique_ngrams_vs_max_length.png b/crates/sparse-ngrams/images/unique_ngrams_vs_max_length.png new file mode 100644 index 0000000..9982a15 Binary files /dev/null and b/crates/sparse-ngrams/images/unique_ngrams_vs_max_length.png differ diff --git a/crates/sparse-ngrams/images/unique_ngrams_vs_table_size.png b/crates/sparse-ngrams/images/unique_ngrams_vs_table_size.png new file mode 100644 index 0000000..3876e3b Binary files /dev/null and b/crates/sparse-ngrams/images/unique_ngrams_vs_table_size.png differ diff --git a/crates/sparse-ngrams/src/bigrams.bin b/crates/sparse-ngrams/src/bigrams.bin new file mode 100644 index 0000000..d011c90 Binary files /dev/null and b/crates/sparse-ngrams/src/bigrams.bin differ diff --git a/crates/sparse-ngrams/src/deque.rs b/crates/sparse-ngrams/src/deque.rs new file mode 100644 index 0000000..3738ba0 --- /dev/null +++ b/crates/sparse-ngrams/src/deque.rs @@ -0,0 +1,71 @@ +//! Stack-allocated circular buffer (monotone deque). + +use std::mem::MaybeUninit; + +/// Deque element representing two neighboring bytes in the input. +#[derive(Debug, Clone, Copy)] +pub(crate) struct PosStateBytes { + /// Absolute index position between the two bigram characters. + /// I.e. 
1 references the very first bigram. + pub index: u32, + pub value: u16, +} + +/// Stack-allocated circular buffer holding up to `CAP` elements. +/// Replaces `VecDeque` — avoids heap allocation and fits in a +/// single cache line for small CAP values. +pub(crate) struct FixedDeque { + data: [MaybeUninit; CAP], + start: u8, + len: u8, +} + +impl FixedDeque { + pub fn new() -> Self { + Self { + data: [MaybeUninit::uninit(); CAP], + start: 0, + len: 0, + } + } + + #[inline] + pub fn front(&self) -> Option<&PosStateBytes> { + if self.len == 0 { + None + } else { + Some(unsafe { self.data[self.start as usize].assume_init_ref() }) + } + } + + #[inline] + pub fn back(&self) -> Option<&PosStateBytes> { + if self.len == 0 { + None + } else { + let idx = (self.start + self.len - 1) as usize % CAP; + Some(unsafe { self.data[idx].assume_init_ref() }) + } + } + + #[inline] + pub fn pop_front(&mut self) { + debug_assert!(self.len > 0); + self.start = (self.start + 1) % CAP as u8; + self.len -= 1; + } + + #[inline] + pub fn pop_back(&mut self) { + debug_assert!(self.len > 0); + self.len -= 1; + } + + #[inline] + pub fn push_back(&mut self, val: PosStateBytes) { + debug_assert!((self.len as usize) < CAP); + let idx = (self.start + self.len) as usize % CAP; + self.data[idx] = MaybeUninit::new(val); + self.len += 1; + } +} diff --git a/crates/sparse-ngrams/src/extract.rs b/crates/sparse-ngrams/src/extract.rs new file mode 100644 index 0000000..3b55c20 --- /dev/null +++ b/crates/sparse-ngrams/src/extract.rs @@ -0,0 +1,364 @@ +//! Core sparse n-gram extraction algorithm. + +use crate::deque::{FixedDeque, PosStateBytes}; +use crate::ngram::{NGram, POLY_HASH_PRIME, POLY_POWERS}; +use crate::table::get_bigram_table; +use crate::MAX_SPARSE_GRAM_SIZE; + +/// Returns the maximum number of sparse n-grams that can be produced from +/// `content_len` bytes of input. Use this to pre-allocate the output slice. 
+#[inline] +pub const fn max_sparse_grams(content_len: usize) -> usize { + if content_len < 2 { + 0 + } else { + (content_len - 1) * 3 + } +} + +/// Collect all sparse n-grams from the input byte slice into a new [`Vec`]. +pub fn collect_sparse_grams(content: &[u8]) -> Vec { + let mut buf = vec![NGram::from_rolling_hash(0, 0); max_sparse_grams(content.len())]; + let count = collect_sparse_grams_deque(content, &mut buf); + buf.truncate(count); + buf +} + +/// Deque-based extraction. Writes n-grams into `out` (must have at least +/// [`max_sparse_grams`]`(content.len())` slots). Returns the count written. +/// +/// # Panics +/// +/// Panics if `out` is too small. +pub fn collect_sparse_grams_deque(content: &[u8], out: &mut [NGram]) -> usize { + let n = content.len(); + if n < 2 { + return 0; + } + assert!(out.len() >= max_sparse_grams(n)); + let table = get_bigram_table(); + let mut queue = FixedDeque::::new(); + let mut prefix_hashes = [0u32; MAX_SPARSE_GRAM_SIZE]; + prefix_hashes[1] = content[0] as u32; + let mut w = 0usize; + + for idx in 1..n as u32 { + let mask = MAX_SPARSE_GRAM_SIZE - 1; + let end_hash = prefix_hashes[idx as usize & mask] + .wrapping_mul(POLY_HASH_PRIME) + .wrapping_add(content[idx as usize] as u32); + + // Bigram + let bigram_hash = end_hash + .wrapping_sub(prefix_hashes[(idx as usize - 1) & mask].wrapping_mul(POLY_POWERS[2])); + out[w] = NGram::from_rolling_hash(bigram_hash, 2); + w += 1; + + let v1 = table[content[idx as usize - 1] as usize * 256 + content[idx as usize] as usize]; + + if let Some(begin) = queue.front() { + if idx - begin.index + 1 >= MAX_SPARSE_GRAM_SIZE as u32 { + queue.pop_front(); + } + } + while let Some(begin) = queue.back() { + let start = begin.index as usize - 1; + let len = (idx - begin.index + 2) as usize; + let hash = + end_hash.wrapping_sub(prefix_hashes[start & mask].wrapping_mul(POLY_POWERS[len])); + out[w] = NGram::from_rolling_hash(hash, len); + w += 1; + if begin.value == v1 { + queue.pop_back(); + break; + 
} else if begin.value <= v1 { + break; + } + queue.pop_back(); + } + queue.push_back(PosStateBytes { + index: idx, + value: v1, + }); + prefix_hashes[(idx as usize + 1) & mask] = end_hash; + } + w +} + +/// Queue-free scan-based extraction. Writes n-grams into `out` (must have at least +/// [`max_sparse_grams`]`(content.len())` slots). Returns the count written. +/// +/// Produces identical output (same order) as [`collect_sparse_grams_deque`]. +/// +/// # Panics +/// +/// Panics if `out` is too small. +pub fn collect_sparse_grams_scan(content: &[u8], out: &mut [NGram]) -> usize { + let n = content.len(); + if n < 2 { + return 0; + } + assert!(out.len() >= max_sparse_grams(n)); + + let table = get_bigram_table(); + const MASK: usize = MAX_SPARSE_GRAM_SIZE - 1; + let mut w = 0usize; + let mut prefix_hashes = [0u32; MAX_SPARSE_GRAM_SIZE]; + prefix_hashes[1] = content[0] as u32; + let mut priorities = [u16::MAX; MAX_SPARSE_GRAM_SIZE]; + for idx in 1..n as u32 { + let end_hash = prefix_hashes[idx as usize & MASK] + .wrapping_mul(POLY_HASH_PRIME) + .wrapping_add(content[idx as usize] as u32); + // Bigram + let bigram_hash = end_hash + .wrapping_sub(prefix_hashes[(idx as usize - 1) & MASK].wrapping_mul(POLY_POWERS[2])); + out[w] = NGram::from_rolling_hash(bigram_hash, 2); + w += 1; + let v1 = table[content[idx as usize - 1] as usize * 256 + content[idx as usize] as usize]; + priorities[idx as usize & MASK] = v1; + let mut running_min = u16::MAX; + for d in 1..=(MAX_SPARSE_GRAM_SIZE as u32 - 2) { + if d >= idx { + break; + } + let p = idx.wrapping_sub(d) as usize & MASK; + let v_p = priorities[p]; + if v_p < running_min { + running_min = v_p; + let start = p.wrapping_sub(1) & MASK; + let len = d as usize + 2; + let hash = + end_hash.wrapping_sub(prefix_hashes[start].wrapping_mul(POLY_POWERS[len])); + out[w] = NGram::from_rolling_hash(hash, len); + w += 1; + if v_p <= v1 { + break; + } + } + } + prefix_hashes[(idx as usize + 1) & MASK] = end_hash; + } + w +} + 
+#[cfg(test)] +mod tests { + use super::*; + use crate::table::get_bigram_table; + use std::collections::HashSet; + + fn collect_to_vec(content: &[u8], f: fn(&[u8], &mut [NGram]) -> usize) -> Vec { + let mut buf = vec![NGram::from_rolling_hash(0, 0); max_sparse_grams(content.len())]; + let count = f(content, &mut buf); + buf.truncate(count); + buf + } + + /// Brute-force reference implementation. + /// + /// Enumerates all substrings of length 2..=MAX_SPARSE_GRAM_SIZE and emits those + /// where every interior bigram priority is strictly greater than `max(left, right)` + /// boundary bigram priority. All bigrams (len=2) are always emitted. + fn brute_force_sparse_grams(content: &[u8]) -> HashSet { + let table = get_bigram_table(); + let n = content.len(); + let mut result = HashSet::new(); + if n < 2 { + return result; + } + // All bigrams. + for i in 0..n - 1 { + result.insert(NGram::from_bytes(&content[i..i + 2])); + } + // Longer grams: length 3..=MAX_SPARSE_GRAM_SIZE. + for len in 3..=MAX_SPARSE_GRAM_SIZE { + 'outer: for start in 0..=n.saturating_sub(len) { + if start + len > n { + break; + } + let left = table[content[start] as usize * 256 + content[start + 1] as usize]; + let right = table + [content[start + len - 2] as usize * 256 + content[start + len - 1] as usize]; + let boundary = left.max(right); + // Inner bigrams: bytes [start+1,start+2], ..., [start+len-3,start+len-2] + for k in 1..len - 2 { + let p = + table[content[start + k] as usize * 256 + content[start + k + 1] as usize]; + if p <= boundary { + continue 'outer; + } + } + result.insert(NGram::from_bytes(&content[start..start + len])); + } + } + result + } + + #[test] + fn test_empty_input() { + assert!(collect_sparse_grams(b"").is_empty()); + } + + #[test] + fn test_single_byte() { + assert!(collect_sparse_grams(b"a").is_empty()); + } + + #[test] + fn test_two_bytes() { + let grams = collect_sparse_grams(b"ab"); + assert_eq!(grams.len(), 1); + assert_eq!(grams[0], NGram::from_bytes(b"ab")); + } 
+ + #[test] + fn test_three_bytes() { + let grams = collect_sparse_grams(b"abc"); + assert!(grams.len() >= 2); + assert_eq!(grams[0], NGram::from_bytes(b"ab")); + assert_eq!(grams[1], NGram::from_bytes(b"bc")); + } + + #[test] + fn test_gram_lengths_bounded() { + let input = b"self.reset_states(the_quick_brown_fox_jumps"; + let grams = collect_sparse_grams(input); + for gram in &grams { + assert!(gram.len() >= 2, "gram too short: {gram:?}"); + assert!( + gram.len() <= MAX_SPARSE_GRAM_SIZE, + "gram too long: {gram:?}" + ); + } + } + + #[test] + fn test_produces_longer_grams() { + let grams = collect_sparse_grams(b"self.reset_states("); + assert!(grams.iter().any(|g| g.len() > 2)); + } + + #[test] + fn test_max_gram_size_boundary() { + let grams = collect_sparse_grams(b"abcdefgh"); + for gram in &grams { + assert!(gram.len() <= MAX_SPARSE_GRAM_SIZE); + } + } + + #[test] + fn test_repeated_bytes() { + let grams = collect_sparse_grams(b"aaaaaaaaaa"); + assert!(grams.iter().filter(|g| g.len() == 2).count() >= 9); + } + + #[test] + fn test_gram_count_scales_linearly() { + let input: Vec = (0..1000).map(|i| (i % 256) as u8).collect(); + let grams = collect_sparse_grams(&input); + assert!(grams.len() >= input.len() - 1); + assert!(grams.len() <= input.len() * 3); + } + + // -- Equivalence: scan vs deque -- + + #[test] + fn test_scan_equivalence_small() { + for input in [b"" as &[u8], b"x", b"ab", b"abc", b"abcdefgh", b"abcdefghi"] { + assert_eq!( + collect_to_vec(input, collect_sparse_grams_deque), + collect_to_vec(input, collect_sparse_grams_scan), + "mismatch on {:?}", + std::str::from_utf8(input).unwrap_or("?") + ); + } + } + + #[test] + fn test_scan_equivalence_hello_world() { + let input = b"hello world"; + assert_eq!( + collect_to_vec(input, collect_sparse_grams_deque), + collect_to_vec(input, collect_sparse_grams_scan), + ); + } + + #[test] + fn test_scan_equivalence_large() { + let input: Vec = (0..1000).map(|i| (i % 256) as u8).collect(); + assert_eq!( + 
collect_to_vec(&input, collect_sparse_grams_deque), + collect_to_vec(&input, collect_sparse_grams_scan), + ); + } + + #[test] + fn test_scan_equivalence_source_code() { + let input = include_bytes!("lib.rs"); + assert_eq!( + collect_to_vec(input, collect_sparse_grams_deque), + collect_to_vec(input, collect_sparse_grams_scan), + ); + } + + // -- Brute-force equivalence -- + + fn assert_matches_brute_force(input: &[u8]) { + let grams = collect_sparse_grams(input); + let actual: HashSet = grams.into_iter().collect(); + let expected = brute_force_sparse_grams(input); + let only_actual: Vec<_> = actual.difference(&expected).collect(); + let only_expected: Vec<_> = expected.difference(&actual).collect(); + if !only_actual.is_empty() || !only_expected.is_empty() { + panic!( + "mismatch on input len={}\n only in algorithm: {:?}\n only in brute force: {:?}", + input.len(), + only_actual, + only_expected + ); + } + } + + #[test] + fn test_brute_force_small() { + for input in [ + b"" as &[u8], + b"x", + b"ab", + b"abc", + b"abcd", + b"abcdefgh", + b"abcdefghi", + ] { + assert_matches_brute_force(input); + } + } + + #[test] + fn test_brute_force_hello_world() { + assert_matches_brute_force(b"hello world"); + } + + #[test] + fn test_brute_force_repeated() { + assert_matches_brute_force(b"aaaaaaaaaa"); + } + + #[test] + fn test_brute_force_code_snippet() { + assert_matches_brute_force(b"self.reset_states(the_quick_brown_fox_jumps"); + } + + #[test] + fn test_brute_force_diverse() { + let input: Vec = (0..200).map(|i| (i % 256) as u8).collect(); + assert_matches_brute_force(&input); + } + + #[test] + fn test_brute_force_source_code() { + let input = include_bytes!("lib.rs"); + assert_matches_brute_force(input); + } +} diff --git a/crates/sparse-ngrams/src/lib.rs b/crates/sparse-ngrams/src/lib.rs new file mode 100644 index 0000000..aa45abc --- /dev/null +++ b/crates/sparse-ngrams/src/lib.rs @@ -0,0 +1,48 @@ +//! Sparse n-gram extraction from byte slices. +//! +//! 
Sparse grams are a way of selecting variable-length n-grams (longer than 2 bytes) without +//! extracting all possible n-grams. The algorithm is deterministic: the same extraction logic +//! works for every substring, so that substring searches are supported. +//! +//! # How it works +//! +//! Each consecutive byte pair (bigram) is assigned a priority based on how frequently it occurs +//! in a large code corpus. A monotone deque tracks potential n-gram boundaries: an n-gram +//! boundary occurs wherever a bigram has lower priority than all bigrams between it and the +//! previous boundary. +//! +//! For a document of N bytes, this produces at most 3(N-1) n-grams: all bigrams plus algorithmically +//! selected longer n-grams (up to [`MAX_SPARSE_GRAM_SIZE`] bytes). +//! +//! # Example +//! +//! ``` +//! use sparse_ngrams::{NGram, collect_sparse_grams, MAX_SPARSE_GRAM_SIZE}; +//! +//! let input = b"hello world"; +//! let grams = collect_sparse_grams(input); +//! assert!(grams.len() > input.len() - 1); +//! for gram in &grams { +//! assert!(gram.len() >= 2); +//! assert!(gram.len() <= MAX_SPARSE_GRAM_SIZE); +//! } +//! ``` + +mod deque; +mod extract; +mod ngram; +mod table; + +pub use ngram::NGram; + +/// Maximum number of high-frequency bigrams used to build the priority table. +/// We reserve u16::MAX (65535), since some algorithms need a max value. +/// We also reserve 0 for all non-frequent bigrams. +pub const NUM_FREQUENT_BIGRAMS: usize = 65534; + +/// Maximum length (in bytes) of a sparse n-gram. +pub const MAX_SPARSE_GRAM_SIZE: usize = 8; + +pub use extract::{ + collect_sparse_grams, collect_sparse_grams_deque, collect_sparse_grams_scan, max_sparse_grams, +}; diff --git a/crates/sparse-ngrams/src/ngram.rs b/crates/sparse-ngrams/src/ngram.rs new file mode 100644 index 0000000..4ff966b --- /dev/null +++ b/crates/sparse-ngrams/src/ngram.rs @@ -0,0 +1,140 @@ +//! Compact n-gram representation using a polynomial rolling hash. +//! +//!
An [`NGram`] packs both a hash and the byte length into a single `u32`: +//! the upper 24 bits hold the rolling hash and the lower 8 bits hold the length. +//! This makes it suitable as a cheap, fixed-size key for hash maps and sets. + +use std::fmt; + +use crate::MAX_SPARSE_GRAM_SIZE; + +/// Prime for the polynomial rolling hash. +pub(crate) const POLY_HASH_PRIME: u32 = 2_654_435_761; + +/// Precomputed powers of [`POLY_HASH_PRIME`] for rolling-hash range queries. +/// `POLY_POWERS[i] = POLY_HASH_PRIME.pow(i)` (wrapping `u32`). +pub(crate) const POLY_POWERS: [u32; MAX_SPARSE_GRAM_SIZE + 1] = { + let mut p = [0u32; MAX_SPARSE_GRAM_SIZE + 1]; + p[0] = 1; + let mut i = 1; + while i < p.len() { + p[i] = (p[i - 1] as u64 * POLY_HASH_PRIME as u64) as u32; + i += 1; + } + p +}; + +/// A compact n-gram identifier: upper 24 bits are a polynomial rolling hash, +/// lower 8 bits are the byte length of the n-gram. +/// +/// Note: We could also store ngrams up to length 8 in a u64. However, this +/// would explode the number of ngram keys in a search dictionary. For that reason, +/// we compress ngrams into a u32 which puts a more reasonable upper bound on +/// the number of dictionary keys (~100 million). +/// +/// Note: By storing the length explicitly in the lower 8 bits, we ensure that +/// only ngrams of the same length collide. This is important because there +/// are exponentially more long ngrams than short ngrams. At the same time, +/// longer ngrams occur less frequently. So, colliding long ngrams won't increase +/// the false positive rate too much. +/// +/// # Construction +/// +/// Use [`NGram::from_bytes`] for one-off hashing, or the rolling-hash helpers +/// inside the extraction loop for amortised O(1) computation per n-gram. +#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[repr(transparent)] +pub struct NGram(pub(crate) u32); + +impl NGram { + /// Build an `NGram` by hashing the given byte slice from scratch.
+ pub fn from_bytes(src: &[u8]) -> Self { + let mut hash = 0u32; + for &byte in src { + hash = hash.wrapping_mul(POLY_HASH_PRIME).wrapping_add(byte as u32); + } + Self((hash << 8) | src.len() as u32) + } + + /// Build an `NGram` from a precomputed rolling hash and a length. + #[inline] + pub(crate) fn from_rolling_hash(hash: u32, len: usize) -> Self { + Self((hash << 8) | len as u32) + } + + /// The byte length of the n-gram (stored in the lower 8 bits). + #[inline] + pub fn len(&self) -> usize { + (self.0 & 0xff) as usize + } + + /// Whether this represents an empty gram (should never happen in practice). + #[inline] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// The raw packed `u32` (hash ≪ 8 | len). + #[inline] + pub fn as_u32(&self) -> u32 { + self.0 + } +} + +impl fmt::Debug for NGram { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "NGram({:#x}, len={})", self.0 >> 8, self.len()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_from_bytes_roundtrip() { + let ngram = NGram::from_bytes(b"hello"); + assert_eq!(ngram.len(), 5); + } + + #[test] + fn test_equal_content_equal_ngram() { + assert_eq!(NGram::from_bytes(b"abc"), NGram::from_bytes(b"abc")); + } + + #[test] + fn test_different_content_likely_different() { + assert_ne!(NGram::from_bytes(b"abc"), NGram::from_bytes(b"abd")); + } + + #[test] + fn test_same_hash_different_length() { + // Even if hashes collide, different lengths produce different NGrams. + let a = NGram::from_bytes(b"ab"); + let b = NGram::from_bytes(b"abc"); + assert_ne!(a, b); + } + + #[test] + fn test_rolling_hash_matches_from_bytes() { + let content = b"hello world"; + // Build prefix hashes the same way the extraction loop does. 
+ let mut prefix_hashes = [0u32; MAX_SPARSE_GRAM_SIZE]; + if !content.is_empty() { + prefix_hashes[1] = content[0] as u32; + } + for idx in 1..content.len() { + let end_hash = prefix_hashes[idx & (MAX_SPARSE_GRAM_SIZE - 1)] + .wrapping_mul(POLY_HASH_PRIME) + .wrapping_add(content[idx] as u32); + // Check the bigram content[idx-1..idx+1] + let rolling_hash = end_hash.wrapping_sub( + prefix_hashes[(idx - 1) & (MAX_SPARSE_GRAM_SIZE - 1)].wrapping_mul(POLY_POWERS[2]), + ); + let rolling = NGram::from_rolling_hash(rolling_hash, 2); + let direct = NGram::from_bytes(&content[idx - 1..idx + 1]); + assert_eq!(rolling, direct, "mismatch at idx={idx}"); + prefix_hashes[(idx + 1) & (MAX_SPARSE_GRAM_SIZE - 1)] = end_hash; + } + } +} diff --git a/crates/sparse-ngrams/src/table.rs b/crates/sparse-ngrams/src/table.rs new file mode 100644 index 0000000..96c77f6 --- /dev/null +++ b/crates/sparse-ngrams/src/table.rs @@ -0,0 +1,41 @@ +//! Bigram priority table. +//! +//! Assigns a frequency-based priority to each byte pair, used by the sparse n-gram +//! extraction algorithm to decide where n-gram boundaries fall. + +use std::sync::OnceLock; + +use crate::NUM_FREQUENT_BIGRAMS; + +/// The bigrams in this string are sorted by how frequently they occur in code (descending). +/// Bigrams are separated by null bytes. +/// Currently contains only the top 5845 bigrams (ascii, case-insensitive). +static BIGRAMS_STR: &str = include_str!("bigrams.bin"); + +/// Flat 256×256 lookup table indexed by `a as usize * 256 + b`. +/// Entries default to 0 for bigrams not in the frequency table. +static BIGRAM_TABLE: OnceLock> = OnceLock::new(); + +/// Returns the bigram priority table. The first call initializes it (thread-safe). 
+pub(crate) fn get_bigram_table() -> &'static [u16; 256 * 256] { + BIGRAM_TABLE.get_or_init(|| { + let mut table = Box::new([0u16; 256 * 256]); + for (idx, s) in BIGRAMS_STR + .split('\0') + .take(NUM_FREQUENT_BIGRAMS) + .enumerate() + { + let mut chars = s.chars(); + let Some((a, b)) = chars.next().zip(chars.next()) else { + continue; + }; + let a = a as u8; + let b = b as u8; + assert_eq!(table[a as usize * 256 + b as usize], 0); + // Higher-frequency bigrams get HIGHER values so they are more often + // encompassed by longer grams. + table[a as usize * 256 + b as usize] = (NUM_FREQUENT_BIGRAMS - idx) as u16; + } + table + }) +} diff --git a/crates/string-offsets/js/package-lock.json b/crates/string-offsets/js/package-lock.json index 9a381ab..fdc0153 100644 --- a/crates/string-offsets/js/package-lock.json +++ b/crates/string-offsets/js/package-lock.json @@ -10,8 +10,7 @@ "license": "MIT", "devDependencies": { "@types/jest": "^29.5.14", - "jest": "^29.0.0", - "wasm-pack": "^0.14.0" + "jest": "^29.0.0" } }, "node_modules/@babel/code-frame": { @@ -30,9 +29,9 @@ } }, "node_modules/@babel/compat-data": { - "version": "7.29.0", - "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.29.0.tgz", - "integrity": "sha512-T1NCJqT/j9+cn8fvkt7jtwbLBfLC/1y1c7NtCeXFRgzGTsafi68MRv8yzkYSapBnFA6L3U2VSc02ciDzoAJhJg==", + "version": "7.29.3", + "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.29.3.tgz", + "integrity": "sha512-LIVqM46zQWZhj17qA8wb4nW/ixr2y1Nw+r1etiAWgRM6U1IqP+LNhL1yg440jYZR72jCWcWbLWzIosH+uP1fqg==", "dev": true, "license": "MIT", "engines": { @@ -187,23 +186,23 @@ } }, "node_modules/@babel/helpers": { - "version": "7.28.6", - "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.28.6.tgz", - "integrity": "sha512-xOBvwq86HHdB7WUDTfKfT/Vuxh7gElQ+Sfti2Cy6yIWNW05P8iUslOVcZ4/sKbE+/jQaukQAdz/gf3724kYdqw==", + "version": "7.29.2", + "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.29.2.tgz", + 
"integrity": "sha512-HoGuUs4sCZNezVEKdVcwqmZN8GoHirLUcLaYVNBK2J0DadGtdcqgr3BCbvH8+XUo4NGjNl3VOtSjEKNzqfFgKw==", "dev": true, "license": "MIT", "dependencies": { "@babel/template": "^7.28.6", - "@babel/types": "^7.28.6" + "@babel/types": "^7.29.0" }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/parser": { - "version": "7.29.0", - "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.29.0.tgz", - "integrity": "sha512-IyDgFV5GeDUVX4YdF/3CPULtVGSXXMLh1xVIgdCgxApktqnQV0r7/8Nqthg+8YLGaAtdyIlo2qIdZrbCv4+7ww==", + "version": "7.29.3", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.29.3.tgz", + "integrity": "sha512-b3ctpQwp+PROvU/cttc4OYl4MzfJUWy6FZg+PMXfzmt/+39iHVF0sDfqay8TQM3JA2EUOyKcFZt75jWriQijsA==", "dev": true, "license": "MIT", "dependencies": { @@ -510,19 +509,6 @@ "dev": true, "license": "MIT" }, - "node_modules/@isaacs/fs-minipass": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/@isaacs/fs-minipass/-/fs-minipass-4.0.1.tgz", - "integrity": "sha512-wgm9Ehl2jpeqP3zw/7mo3kRHFp5MEDhqAdwy1fTGkHAwnkGOVsgpvQhL8B5n1qlb01jV3n/bI0ZfZp5lWA1k4w==", - "dev": true, - "license": "ISC", - "dependencies": { - "minipass": "^7.0.4" - }, - "engines": { - "node": ">=18.0.0" - } - }, "node_modules/@istanbuljs/load-nyc-config": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/@istanbuljs/load-nyc-config/-/load-nyc-config-1.1.0.tgz", @@ -541,9 +527,9 @@ } }, "node_modules/@istanbuljs/schema": { - "version": "0.1.3", - "resolved": "https://registry.npmjs.org/@istanbuljs/schema/-/schema-0.1.3.tgz", - "integrity": "sha512-ZXRY4jNvVgSVQ8DL3LTcakaAtXwTVUxE81hslsyD2AtoXW/wVob10HkOJ1X/pAlcI7D+2YoZKg5do8G/w6RYgA==", + "version": "0.1.6", + "resolved": "https://registry.npmjs.org/@istanbuljs/schema/-/schema-0.1.6.tgz", + "integrity": "sha512-+Sg6GCR/wy1oSmQDFq4LQDAhm3ETKnorxN+y5nbLULOR3P0c14f2Wurzj3/xqPXtasLFfHd5iRFQ7AJt4KH2cw==", "dev": true, "license": "MIT", "engines": { @@ -1013,13 +999,13 @@ } }, 
"node_modules/@types/node": { - "version": "25.4.0", - "resolved": "https://registry.npmjs.org/@types/node/-/node-25.4.0.tgz", - "integrity": "sha512-9wLpoeWuBlcbBpOY3XmzSTG3oscB6xjBEEtn+pYXTfhyXhIxC5FsBer2KTopBlvKEiW9l13po9fq+SJY/5lkhw==", + "version": "25.7.0", + "resolved": "https://registry.npmjs.org/@types/node/-/node-25.7.0.tgz", + "integrity": "sha512-z+pdZyxE+RTQE9AcboAZCb4otwcrvgHD+GlBpPgn0emDVt0ohrTMhAwlr2Wd9nZ+nihhYFxO2pThz3C5qSu2Eg==", "dev": true, "license": "MIT", "dependencies": { - "undici-types": "~7.18.0" + "undici-types": "~7.21.0" } }, "node_modules/@types/stack-utils": { @@ -1112,25 +1098,6 @@ "sprintf-js": "~1.0.2" } }, - "node_modules/asynckit": { - "version": "0.4.0", - "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", - "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", - "dev": true, - "license": "MIT" - }, - "node_modules/axios": { - "version": "1.15.0", - "resolved": "https://registry.npmjs.org/axios/-/axios-1.15.0.tgz", - "integrity": "sha512-wWyJDlAatxk30ZJer+GeCWS209sA42X+N5jU2jy6oHTp7ufw8uzUTVFBX9+wTfAlhiJXGS0Bq7X6efruWjuK9Q==", - "dev": true, - "license": "MIT", - "dependencies": { - "follow-redirects": "^1.15.11", - "form-data": "^4.0.5", - "proxy-from-env": "^2.1.0" - } - }, "node_modules/babel-jest": { "version": "29.7.0", "resolved": "https://registry.npmjs.org/babel-jest/-/babel-jest-29.7.0.tgz", @@ -1255,9 +1222,9 @@ "license": "MIT" }, "node_modules/baseline-browser-mapping": { - "version": "2.10.7", - "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.10.7.tgz", - "integrity": "sha512-1ghYO3HnxGec0TCGBXiDLVns4eCSx4zJpxnHrlqFQajmhfKMQBzUGDdkMK7fUW7PTHTeLf+j87aTuKuuwWzMGw==", + "version": "2.10.29", + "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.10.29.tgz", + "integrity": 
"sha512-Asa2krT+XTPZINCS+2QcyS8WTkObE77RwkydwF7h6DmnKqbvlalz93m/dnphUyCa6SWSP51VgtEUf2FN+gelFQ==", "dev": true, "license": "Apache-2.0", "bin": { @@ -1267,26 +1234,10 @@ "node": ">=6.0.0" } }, - "node_modules/binary-install": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/binary-install/-/binary-install-1.1.2.tgz", - "integrity": "sha512-ZS2cqFHPZOy4wLxvzqfQvDjCOifn+7uCPqNmYRIBM/03+yllON+4fNnsD0VJdW0p97y+E+dTRNPStWNqMBq+9g==", - "deprecated": "Package no longer supported. Contact Support at https://www.npmjs.com/support for more info.", - "dev": true, - "license": "MIT", - "dependencies": { - "axios": "^0.26.1", - "rimraf": "^3.0.2", - "tar": "^6.1.11" - }, - "engines": { - "node": ">=10" - } - }, "node_modules/brace-expansion": { - "version": "1.1.12", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz", - "integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==", + "version": "1.1.14", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.14.tgz", + "integrity": "sha512-MWPGfDxnyzKU7rNOW9SP/c50vi3xrmrua/+6hfPbCS2ABNWfx24vPidzvC7krjU/RTo235sV776ymlsMtGKj8g==", "dev": true, "license": "MIT", "dependencies": { @@ -1308,9 +1259,9 @@ } }, "node_modules/browserslist": { - "version": "4.28.1", - "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.28.1.tgz", - "integrity": "sha512-ZC5Bd0LgJXgwGqUknZY/vkUQ04r8NXnJZ3yYi4vDmSiZmC/pdSN0NbNRPxZpbtO4uAfDUAFffO8IZoM3Gj8IkA==", + "version": "4.28.2", + "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.28.2.tgz", + "integrity": "sha512-48xSriZYYg+8qXna9kwqjIVzuQxi+KYWp2+5nCYnYKPTr0LvD89Jqk2Or5ogxz0NUMfIjhh2lIUX/LyX9B4oIg==", "dev": true, "funding": [ { @@ -1328,11 +1279,11 @@ ], "license": "MIT", "dependencies": { - "baseline-browser-mapping": "^2.9.0", - "caniuse-lite": "^1.0.30001759", - "electron-to-chromium": "^1.5.263", - "node-releases": "^2.0.27", - 
"update-browserslist-db": "^1.2.0" + "baseline-browser-mapping": "^2.10.12", + "caniuse-lite": "^1.0.30001782", + "electron-to-chromium": "^1.5.328", + "node-releases": "^2.0.36", + "update-browserslist-db": "^1.2.3" }, "bin": { "browserslist": "cli.js" @@ -1358,20 +1309,6 @@ "dev": true, "license": "MIT" }, - "node_modules/call-bind-apply-helpers": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", - "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "es-errors": "^1.3.0", - "function-bind": "^1.1.2" - }, - "engines": { - "node": ">= 0.4" - } - }, "node_modules/callsites": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", @@ -1393,9 +1330,9 @@ } }, "node_modules/caniuse-lite": { - "version": "1.0.30001778", - "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001778.tgz", - "integrity": "sha512-PN7uxFL+ExFJO61aVmP1aIEG4i9whQd4eoSCebav62UwDyp5OHh06zN4jqKSMePVgxHifCw1QJxdRkA1Pisekg==", + "version": "1.0.30001792", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001792.tgz", + "integrity": "sha512-hVLMUZFgR4JJ6ACt1uEESvQN1/dBVqPAKY0hgrV70eN3391K6juAfTjKZLKvOMsx8PxA7gsY1/tLMMTcfFLLpw==", "dev": true, "funding": [ { @@ -1440,16 +1377,6 @@ "node": ">=10" } }, - "node_modules/chownr": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/chownr/-/chownr-3.0.0.tgz", - "integrity": "sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==", - "dev": true, - "license": "BlueOak-1.0.0", - "engines": { - "node": ">=18" - } - }, "node_modules/ci-info": { "version": "3.9.0", "resolved": "https://registry.npmjs.org/ci-info/-/ci-info-3.9.0.tgz", @@ -1526,19 +1453,6 @@ "dev": true, "license": "MIT" }, - "node_modules/combined-stream": { - 
"version": "1.0.8", - "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", - "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", - "dev": true, - "license": "MIT", - "dependencies": { - "delayed-stream": "~1.0.0" - }, - "engines": { - "node": ">= 0.8" - } - }, "node_modules/concat-map": { "version": "0.0.1", "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", @@ -1633,16 +1547,6 @@ "node": ">=0.10.0" } }, - "node_modules/delayed-stream": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", - "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.4.0" - } - }, "node_modules/detect-newline": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/detect-newline/-/detect-newline-3.1.0.tgz", @@ -1663,25 +1567,10 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, - "node_modules/dunder-proto": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", - "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", - "dev": true, - "license": "MIT", - "dependencies": { - "call-bind-apply-helpers": "^1.0.1", - "es-errors": "^1.3.0", - "gopd": "^1.2.0" - }, - "engines": { - "node": ">= 0.4" - } - }, "node_modules/electron-to-chromium": { - "version": "1.5.313", - "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.313.tgz", - "integrity": "sha512-QBMrTWEf00GXZmJyx2lbYD45jpI3TUFnNIzJ5BBc8piGUDwMPa1GV6HJWTZVvY/eiN3fSopl7NRbgGp9sZ9LTA==", + "version": "1.5.353", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.353.tgz", + "integrity": 
"sha512-kOrWphBi8TOZyiJZqsgqIle0lw+tzmnQK83pV9dZUd01Nm2POECSyFQMAuarzZdYqQW7FH9RaYOuaRo3h+bQ3w==", "dev": true, "license": "ISC" }, @@ -1715,16 +1604,6 @@ "is-arrayish": "^0.2.1" } }, - "node_modules/es-define-property": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", - "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.4" - } - }, "node_modules/es-errors": { "version": "1.3.0", "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", @@ -1735,35 +1614,6 @@ "node": ">= 0.4" } }, - "node_modules/es-object-atoms": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", - "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", - "dev": true, - "license": "MIT", - "dependencies": { - "es-errors": "^1.3.0" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/es-set-tostringtag": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", - "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", - "dev": true, - "license": "MIT", - "dependencies": { - "es-errors": "^1.3.0", - "get-intrinsic": "^1.2.6", - "has-tostringtag": "^1.0.2", - "hasown": "^2.0.2" - }, - "engines": { - "node": ">= 0.4" - } - }, "node_modules/escalade": { "version": "3.2.0", "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", @@ -1892,44 +1742,6 @@ "node": ">=8" } }, - "node_modules/follow-redirects": { - "version": "1.16.0", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.16.0.tgz", - "integrity": "sha512-y5rN/uOsadFT/JfYwhxRS5R7Qce+g3zG97+JrtFZlC9klX/W5hD7iiLzScI4nZqUS7DNUdhPgw4xI8W2LuXlUw==", - "dev": 
true, - "funding": [ - { - "type": "individual", - "url": "https://github.com/sponsors/RubenVerborgh" - } - ], - "license": "MIT", - "engines": { - "node": ">=4.0" - }, - "peerDependenciesMeta": { - "debug": { - "optional": true - } - } - }, - "node_modules/form-data": { - "version": "4.0.5", - "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.5.tgz", - "integrity": "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w==", - "dev": true, - "license": "MIT", - "dependencies": { - "asynckit": "^0.4.0", - "combined-stream": "^1.0.8", - "es-set-tostringtag": "^2.1.0", - "hasown": "^2.0.2", - "mime-types": "^2.1.12" - }, - "engines": { - "node": ">= 6" - } - }, "node_modules/fs.realpath": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", @@ -1982,31 +1794,6 @@ "node": "6.* || 8.* || >= 10.*" } }, - "node_modules/get-intrinsic": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", - "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "call-bind-apply-helpers": "^1.0.2", - "es-define-property": "^1.0.1", - "es-errors": "^1.3.0", - "es-object-atoms": "^1.1.1", - "function-bind": "^1.1.2", - "get-proto": "^1.0.1", - "gopd": "^1.2.0", - "has-symbols": "^1.1.0", - "hasown": "^2.0.2", - "math-intrinsics": "^1.1.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/get-package-type": { "version": "0.1.0", "resolved": "https://registry.npmjs.org/get-package-type/-/get-package-type-0.1.0.tgz", @@ -2017,20 +1804,6 @@ "node": ">=8.0.0" } }, - "node_modules/get-proto": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", - "integrity": 
"sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", - "dev": true, - "license": "MIT", - "dependencies": { - "dunder-proto": "^1.0.1", - "es-object-atoms": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - } - }, "node_modules/get-stream": { "version": "6.0.1", "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-6.0.1.tgz", @@ -2066,19 +1839,6 @@ "url": "https://github.com/sponsors/isaacs" } }, - "node_modules/gopd": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", - "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/graceful-fs": { "version": "4.2.11", "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz", @@ -2096,39 +1856,10 @@ "node": ">=8" } }, - "node_modules/has-symbols": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", - "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/has-tostringtag": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", - "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", - "dev": true, - "license": "MIT", - "dependencies": { - "has-symbols": "^1.0.3" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/hasown": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", - "integrity": 
"sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.3.tgz", + "integrity": "sha512-ej4AhfhfL2Q2zpMmLo7U1Uv9+PyhIZpgQLGT1F9miIGmiCJIoCgSmczFdrc97mWT4kVY72KA+WnnhJ5pghSvSg==", "dev": true, "license": "MIT", "dependencies": { @@ -2212,13 +1943,13 @@ "license": "MIT" }, "node_modules/is-core-module": { - "version": "2.16.1", - "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.16.1.tgz", - "integrity": "sha512-UfoeMA6fIJ8wTYFEUjelnaGI67v6+N7qXJEvQuIGa99l4xsCruSYOVSQ0uPANn4dAzm8lkYPaKLrrijLq7x23w==", + "version": "2.16.2", + "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.16.2.tgz", + "integrity": "sha512-evOr8xfXKxE6qSR0hSXL2r3sd7ALj8+7jQEUvPYcm5sgZFdJ+AYzT6yNmJenvIYQBgIGwfwz08sL8zoL7yq2BA==", "dev": true, "license": "MIT", "dependencies": { - "hasown": "^2.0.2" + "hasown": "^2.0.3" }, "engines": { "node": ">= 0.4" @@ -2305,9 +2036,9 @@ } }, "node_modules/istanbul-lib-instrument/node_modules/semver": { - "version": "7.7.4", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.4.tgz", - "integrity": "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA==", + "version": "7.8.0", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.8.0.tgz", + "integrity": "sha512-AcM7dV/5ul4EekoQ29Agm5vri8JNqRyj39o0qpX6vDF2GZrtutZl5RwgD1XnZjiTAfncsJhMI48QQH3sN87YNA==", "dev": true, "license": "ISC", "bin": { @@ -2844,9 +2575,9 @@ } }, "node_modules/jest-snapshot/node_modules/semver": { - "version": "7.7.4", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.4.tgz", - "integrity": "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA==", + "version": "7.8.0", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.8.0.tgz", + "integrity": 
"sha512-AcM7dV/5ul4EekoQ29Agm5vri8JNqRyj39o0qpX6vDF2GZrtutZl5RwgD1XnZjiTAfncsJhMI48QQH3sN87YNA==", "dev": true, "license": "ISC", "bin": { @@ -3078,9 +2809,9 @@ } }, "node_modules/make-dir/node_modules/semver": { - "version": "7.7.4", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.4.tgz", - "integrity": "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA==", + "version": "7.8.0", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.8.0.tgz", + "integrity": "sha512-AcM7dV/5ul4EekoQ29Agm5vri8JNqRyj39o0qpX6vDF2GZrtutZl5RwgD1XnZjiTAfncsJhMI48QQH3sN87YNA==", "dev": true, "license": "ISC", "bin": { @@ -3100,16 +2831,6 @@ "tmpl": "1.0.5" } }, - "node_modules/math-intrinsics": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", - "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.4" - } - }, "node_modules/merge-stream": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz", @@ -3131,29 +2852,6 @@ "node": ">=8.6" } }, - "node_modules/mime-db": { - "version": "1.52.0", - "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", - "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/mime-types": { - "version": "2.1.35", - "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", - "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", - "dev": true, - "license": "MIT", - "dependencies": { - "mime-db": "1.52.0" - }, - "engines": { - "node": ">= 0.6" - } - }, "node_modules/mimic-fn": { "version": "2.1.0", "resolved": 
"https://registry.npmjs.org/mimic-fn/-/mimic-fn-2.1.0.tgz", @@ -3177,29 +2875,6 @@ "node": "*" } }, - "node_modules/minipass": { - "version": "7.1.3", - "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.3.tgz", - "integrity": "sha512-tEBHqDnIoM/1rXME1zgka9g6Q2lcoCkxHLuc7ODJ5BxbP5d4c2Z5cGgtXAku59200Cx7diuHTOYfSBD8n6mm8A==", - "dev": true, - "license": "BlueOak-1.0.0", - "engines": { - "node": ">=16 || 14 >=14.17" - } - }, - "node_modules/minizlib": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/minizlib/-/minizlib-3.1.0.tgz", - "integrity": "sha512-KZxYo1BUkWD2TVFLr0MQoM8vUUigWD3LlD83a/75BqC+4qE0Hb1Vo5v1FgcfaNXvfXzr+5EhQ6ing/CaBijTlw==", - "dev": true, - "license": "MIT", - "dependencies": { - "minipass": "^7.1.2" - }, - "engines": { - "node": ">= 18" - } - }, "node_modules/ms": { "version": "2.1.3", "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", @@ -3222,9 +2897,9 @@ "license": "MIT" }, "node_modules/node-releases": { - "version": "2.0.36", - "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.36.tgz", - "integrity": "sha512-TdC8FSgHz8Mwtw9g5L4gR/Sh9XhSP/0DEkQxfEFXOpiul5IiHgHan2VhYYb6agDSfp4KuvltmGApc8HMgUrIkA==", + "version": "2.0.44", + "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.44.tgz", + "integrity": "sha512-5WUyunoPMsvvEhS8AxHtRzP+oA8UCkJ7YRxatWKjngndhDGLiqEVAQKWjFAiAiuL8zMRGzGSJxFnLetoa43qGQ==", "dev": true, "license": "MIT" }, @@ -3396,9 +3071,9 @@ "license": "ISC" }, "node_modules/picomatch": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", - "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.2.tgz", + "integrity": "sha512-V7+vQEJ06Z+c5tSye8S+nHUfI51xoXIXjHQ99cQtKUkQqqO1kO/KCJUfZXuB47h/YBlDhah2H3hdUGXn8ie0oA==", "dev": true, "license": "MIT", "engines": { @@ 
-3473,16 +3148,6 @@ "node": ">= 6" } }, - "node_modules/proxy-from-env": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-2.1.0.tgz", - "integrity": "sha512-cJ+oHTW1VAEa8cJslgmUZrc+sjRKgAKl3Zyse6+PV38hZe/V6Z14TbCuXcan9F9ghlz4QrFr2c92TNF82UkYHA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10" - } - }, "node_modules/pure-rand": { "version": "6.1.0", "resolved": "https://registry.npmjs.org/pure-rand/-/pure-rand-6.1.0.tgz", @@ -3518,12 +3183,13 @@ } }, "node_modules/resolve": { - "version": "1.22.11", - "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.11.tgz", - "integrity": "sha512-RfqAvLnMl313r7c9oclB1HhUEAezcpLjz95wFH4LVuhk9JF/r22qmVP9AMmOU4vMX7Q8pN8jwNg/CSpdFnMjTQ==", + "version": "1.22.12", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.12.tgz", + "integrity": "sha512-TyeJ1zif53BPfHootBGwPRYT1RUt6oGWsaQr8UyZW/eAm9bKoijtvruSDEmZHm92CwS9nj7/fWttqPCgzep8CA==", "dev": true, "license": "MIT", "dependencies": { + "es-errors": "^1.3.0", "is-core-module": "^2.16.1", "path-parse": "^1.0.7", "supports-preserve-symlinks-flag": "^1.0.0" @@ -3571,23 +3237,6 @@ "node": ">=10" } }, - "node_modules/rimraf": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz", - "integrity": "sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==", - "deprecated": "Rimraf versions prior to v4 are no longer supported", - "dev": true, - "license": "ISC", - "dependencies": { - "glob": "^7.1.3" - }, - "bin": { - "rimraf": "bin.js" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, "node_modules/semver": { "version": "6.3.1", "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", @@ -3787,33 +3436,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/tar": { - "version": "7.5.11", - "resolved": "https://registry.npmjs.org/tar/-/tar-7.5.11.tgz", - "integrity": 
"sha512-ChjMH33/KetonMTAtpYdgUFr0tbz69Fp2v7zWxQfYZX4g5ZN2nOBXm1R2xyA+lMIKrLKIoKAwFj93jE/avX9cQ==", - "dev": true, - "license": "BlueOak-1.0.0", - "dependencies": { - "@isaacs/fs-minipass": "^4.0.0", - "chownr": "^3.0.0", - "minipass": "^7.1.2", - "minizlib": "^3.1.0", - "yallist": "^5.0.0" - }, - "engines": { - "node": ">=18" - } - }, - "node_modules/tar/node_modules/yallist": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/yallist/-/yallist-5.0.0.tgz", - "integrity": "sha512-YgvUTfwqyc7UXVMrB+SImsVYSmTS8X/tSrtdNZMImM+n7+QTriRXyXim0mBrTXNeqzVF0KWGgHPeiyViFFrNDw==", - "dev": true, - "license": "BlueOak-1.0.0", - "engines": { - "node": ">=18" - } - }, "node_modules/test-exclude": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/test-exclude/-/test-exclude-6.0.0.tgz", @@ -3873,9 +3495,9 @@ } }, "node_modules/undici-types": { - "version": "7.18.2", - "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.18.2.tgz", - "integrity": "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w==", + "version": "7.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.21.0.tgz", + "integrity": "sha512-w9IMgQrz4O0YN1LtB7K5P63vhlIOvC7opSmouCJ+ZywlPAlO9gIkJ+otk6LvGpAs2wg4econaCz3TvQ9xPoyuQ==", "dev": true, "license": "MIT" }, @@ -3935,20 +3557,6 @@ "makeerror": "1.0.12" } }, - "node_modules/wasm-pack": { - "version": "0.14.0", - "resolved": "https://registry.npmjs.org/wasm-pack/-/wasm-pack-0.14.0.tgz", - "integrity": "sha512-7uKj+483b6ETTnuWHK3zKNB3Ca3M159tPZ5shyXxI4j7i9Lk82rL2ck/L6E9O5VMWk9JgowdtTBOSfWmGBRFtw==", - "dev": true, - "hasInstallScript": true, - "license": "MIT OR Apache-2.0", - "dependencies": { - "binary-install": "^1.1.2" - }, - "bin": { - "wasm-pack": "run.js" - } - }, "node_modules/which": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", diff --git a/crates/string-offsets/js/package.json 
b/crates/string-offsets/js/package.json index e41c2a2..26b7fa6 100644 --- a/crates/string-offsets/js/package.json +++ b/crates/string-offsets/js/package.json @@ -28,13 +28,6 @@ }, "devDependencies": { "@types/jest": "^29.5.14", - "jest": "^29.0.0", - "wasm-pack": "^0.14.0" - }, - "overrides": { - "binary-install": { - "tar": "^7.4.3", - "axios": "^1.8.0" - } + "jest": "^29.0.0" } }