-
Notifications
You must be signed in to change notification settings - Fork 16
Add sparse gram extraction #113
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4426de9
9bb6a3c
4c70132
07bf579
929ea0a
6c6f387
85cfdf3
1cd9cbb
710102b
2521d57
8d62ad4
e36a92a
04b8cff
f57f67a
4a56922
5e41049
79f028e
e51ca63
16973f1
ba02311
fa563f7
0cbd36f
50307a8
6dd118c
e05a326
570f681
696c48a
674c816
a27a5b9
95bb71c
647dee1
3272bb1
295816d
feceb04
2521d02
d6bd09f
a540549
a0b80ea
48bfbbb
03cb4a7
cc73c69
8505846
6f750ac
2b57af8
703db7c
f7c1877
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| [package] | ||
| name = "sparse-ngrams" | ||
| version = "0.1.0" | ||
| edition = "2021" | ||
| description = "Fast sparse n-gram extraction from byte slices." | ||
| repository = "https://github.com/github/rust-gems" | ||
| license = "MIT" | ||
| keywords = ["ngram", "algorithm", "search", "index"] | ||
| categories = ["algorithms", "data-structures", "text-processing"] | ||
|
|
||
| [lib] | ||
| bench = false | ||
|
|
||
| [[bench]] | ||
| name = "performance" | ||
| path = "benchmarks/performance.rs" | ||
| harness = false | ||
|
|
||
| [dev-dependencies] | ||
| criterion = "0.7" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,88 @@ | ||
| # sparse-ngrams | ||
|
|
||
| Fast sparse n-gram extraction from byte slices. | ||
|
|
||
| Sparse grams select variable-length n-grams (2–8 bytes) without extracting all possible substrings. The algorithm is deterministic: the same extraction logic applies to every substring, making it suitable for substring search indexes. | ||
|
|
||
| For background, see: | ||
| - [The technology behind GitHub's new code search](https://github.blog/engineering/architecture-optimization/the-technology-behind-githubs-new-code-search/#fn-69904-bignote) | ||
| - [Sparse n-grams: smarter trigram selection](https://cursor.com/blog/fast-regex-search#sparse-n-grams-smarter-trigram-selection) | ||
|
|
||
| ## Caveats | ||
|
|
||
| The integrated bigram table contains only lowercase ASCII bigrams. Callers should lowercase and normalize input before extraction (e.g. fold uppercase to lowercase, map non-ASCII bytes to a single sentinel value). This makes the implementation suitable for case-insensitive search indexes. | ||
|
|
||
| ## How it works | ||
|
|
||
| Each consecutive byte pair (bigram) is assigned a frequency-based priority from a precomputed table. An n-gram boundary occurs wherever a bigram has lower priority than all bigrams between it and the previous boundary. This is computed efficiently using a monotone deque or a scan-based approach. | ||
|
|
||
| For a document of N bytes, this produces at most 3(N−1) n-grams: N−1 bigrams, plus up to 2(N−1) algorithmically selected longer n-grams (up to 8 bytes). | ||
|
|
||
| ### Selection criterion | ||
|
|
||
| A substring of length 3–8 is emitted as a sparse n-gram if and only if every interior bigram priority is strictly greater than the maximum of the left and right boundary bigram priorities. | ||
|
Comment on lines
+21
to
+23
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This section seems redundant. There was a github blog post about this that I think is worth linking. There's also the comment in
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I kept it as a short introduction. added two links (to our own blog post and to cursors reimplementation) |
||
|
|
||
| ## Usage | ||
|
|
||
| ```rust | ||
| use sparse_ngrams::{collect_sparse_grams, NGram, MAX_SPARSE_GRAM_SIZE}; | ||
|
|
||
| let input = b"hello world"; | ||
| let grams = collect_sparse_grams(input); | ||
| for gram in &grams { | ||
| assert!(gram.len() >= 2); | ||
| assert!(gram.len() <= MAX_SPARSE_GRAM_SIZE as usize); | ||
| } | ||
| ``` | ||
|
|
||
| ## Performance | ||
|
|
||
| Benchmarks on an Apple M1 (15 KB input, `lib.rs` source file): | ||
|
|
||
| | Variant | Throughput | | ||
| |---------|-----------| | ||
| | `deque` | ~3.5 GB/s | | ||
| | `scan` | ~4.9 GB/s | | ||
|
|
||
| The `scan` variant is ~40% faster than the `deque` variant: it replaces the monotone deque with a fixed-size circular buffer and a suffix-minimum scan. | ||
|
|
||
| ## Bigram table size | ||
|
|
||
| The priority table maps byte pairs to frequency-based priorities. Increasing the table size (number of ranked bigrams) produces more distinct longer n-grams, but saturates quickly: | ||
|
|
||
|  | ||
|
|
||
| | Table size | Unique n-grams | % of max | | ||
| |-----------|-----------------|----------| | ||
| | 100 | 5.8M | 77.0% | | ||
| | 200 | 6.4M | 84.4% | | ||
| | 400 | 6.8M | 90.2% | | ||
| | 800 | 7.3M | 96.0% | | ||
| | 1,600 | 7.5M | 99.2% | | ||
| | 3,200 | 7.6M | 99.9% | | ||
| | 5,845 | 7.6M | 100% | | ||
|
|
||
| The current bigram table contains the 5,845 most frequent bigrams from a large code corpus. | ||
| The table saturates quickly — the first ~1,600 bigrams already capture 99% of the unique n-grams. | ||
|
|
||
| ## Maximum n-gram length | ||
|
|
||
| Increasing the maximum n-gram length produces more unique longer grams, with diminishing returns: | ||
|
|
||
|  | ||
|
|
||
| | Max length | Unique n-grams | vs. len=8 | | ||
| |-----------|---------------|-----------| | ||
| | 2 | 1.2M | 16% | | ||
| | 3 | 4.1M | 54% | | ||
| | 4 | 5.3M | 70% | | ||
| | 6 | 6.8M | 89% | | ||
| | 8 | 7.6M | 100% | | ||
| | 12 | 8.5M | 113% | | ||
| | 16 | 9.1M | 120% | | ||
| | 24 | 9.7M | 128% | | ||
| | 32 | 10.1M | 133% | | ||
| | 48 | 10.4M | 137% | | ||
| | 64 | 10.5M | 139% | | ||
|
|
||
| The default of 8 captures most of the discriminative power. Going to 16 adds ~20% more unique grams but doubles the scan window; going to 64 adds only ~39% total. | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; | ||
| use sparse_ngrams::{ | ||
| collect_sparse_grams_deque, collect_sparse_grams_scan, max_sparse_grams, NGram, | ||
| }; | ||
|
|
||
| fn bench_collect(c: &mut Criterion) { | ||
| let inputs: Vec<(&str, Vec<u8>)> = vec![ | ||
| ("small_11B", b"hello world".to_vec()), | ||
| ( | ||
| "medium_900B", | ||
| "the quick brown fox jumps over the lazy dog. " | ||
| .repeat(20) | ||
| .into_bytes(), | ||
| ), | ||
| ( | ||
| "large_15KB", | ||
| include_str!("../src/lib.rs").as_bytes().to_vec(), | ||
| ), | ||
| ]; | ||
|
|
||
| let mut group = c.benchmark_group("collect"); | ||
| for (name, input) in &inputs { | ||
| let mut buf = vec![NGram::from_bytes(b"xx"); max_sparse_grams(input.len())]; | ||
| group.throughput(Throughput::Bytes(input.len() as u64)); | ||
|
|
||
| group.bench_with_input(BenchmarkId::new("deque", name), input, |b, input| { | ||
| b.iter(|| collect_sparse_grams_deque(black_box(input), &mut buf)) | ||
| }); | ||
| group.bench_with_input(BenchmarkId::new("scan", name), input, |b, input| { | ||
| b.iter(|| collect_sparse_grams_scan(black_box(input), &mut buf)) | ||
| }); | ||
| } | ||
| group.finish(); | ||
| } | ||
|
|
||
| criterion_group!(benches, bench_collect); | ||
| criterion_main!(benches); |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,71 @@ | ||
| //! Stack-allocated circular buffer (monotone deque). | ||
|
|
||
| use std::mem::MaybeUninit; | ||
|
|
||
/// Deque element representing two neighboring bytes (one bigram) in the input.
#[derive(Debug, Clone, Copy)]
pub(crate) struct PosStateBytes {
    /// Absolute index position between the two bigram characters.
    /// I.e. 1 references the very first bigram.
    pub index: u32,
    /// Value carried along with the position.
    /// NOTE(review): presumably the bigram's priority used for ordering —
    /// confirm against the code that fills the deque.
    pub value: u16,
}

/// Stack-allocated circular buffer holding up to `CAP` elements.
/// Replaces `VecDeque<PosStateBytes>` — avoids heap allocation and fits in a
/// single cache line for small CAP values.
///
/// `start` and `len` are stored as `u8` to keep the struct compact, so `CAP`
/// must not exceed 255. This is checked at compile time when [`FixedDeque::new`]
/// is instantiated.
pub(crate) struct FixedDeque<const CAP: usize> {
    /// Slot storage. Only the `len` slots beginning at `start` (wrapping
    /// modulo `CAP`) are guaranteed to be initialized.
    data: [MaybeUninit<PosStateBytes>; CAP],
    /// Slot index of the front element; always `< CAP` once `CAP > 0`.
    start: u8,
    /// Number of live elements; never exceeds `CAP`.
    len: u8,
}

impl<const CAP: usize> FixedDeque<CAP> {
    /// Compile-time guard: `start` and `len` are `u8`, so larger capacities
    /// could not be represented and index math would silently truncate.
    const CAP_FITS_IN_U8: () = assert!(
        CAP <= u8::MAX as usize,
        "FixedDeque: CAP must be at most 255 because `start` and `len` are stored as u8"
    );

    /// Creates an empty deque. No slot is initialized until `push_back`.
    pub fn new() -> Self {
        // Referencing the constant forces the capacity check for this `CAP`.
        let () = Self::CAP_FITS_IN_U8;
        Self {
            data: [MaybeUninit::uninit(); CAP],
            start: 0,
            len: 0,
        }
    }

    /// Maps a logical offset from the front to a physical slot index.
    ///
    /// The arithmetic is done in `usize`: adding the `u8` fields directly
    /// overflows as soon as `start + offset` exceeds 255.
    #[inline]
    fn slot(&self, offset: usize) -> usize {
        (self.start as usize + offset) % CAP
    }

    /// Returns the front (oldest) element, or `None` if the deque is empty.
    #[inline]
    pub fn front(&self) -> Option<&PosStateBytes> {
        if self.len == 0 {
            None
        } else {
            // SAFETY: `len > 0`, so the slot at `start` was written by
            // `push_back` and has not been popped since.
            Some(unsafe { self.data[self.start as usize].assume_init_ref() })
        }
    }

    /// Returns the back (most recently pushed) element, or `None` if empty.
    #[inline]
    pub fn back(&self) -> Option<&PosStateBytes> {
        if self.len == 0 {
            None
        } else {
            let idx = self.slot(self.len as usize - 1);
            // SAFETY: `idx` is the last of the `len` live slots, all of which
            // were written by `push_back`.
            Some(unsafe { self.data[idx].assume_init_ref() })
        }
    }

    /// Removes the front element. The deque must not be empty
    /// (checked with `debug_assert!` only).
    #[inline]
    pub fn pop_front(&mut self) {
        debug_assert!(self.len > 0);
        // The new start is `< CAP <= 255`, so narrowing back to `u8` is lossless.
        self.start = self.slot(1) as u8;
        self.len -= 1;
    }

    /// Removes the back element. The deque must not be empty
    /// (checked with `debug_assert!` only).
    #[inline]
    pub fn pop_back(&mut self) {
        debug_assert!(self.len > 0);
        self.len -= 1;
    }

    /// Appends `val` at the back. The deque must not be full
    /// (checked with `debug_assert!` only).
    #[inline]
    pub fn push_back(&mut self, val: PosStateBytes) {
        debug_assert!((self.len as usize) < CAP);
        let idx = self.slot(self.len as usize);
        self.data[idx] = MaybeUninit::new(val);
        self.len += 1;
    }
}
Uh oh!
There was an error while loading. Please reload this page.