diff --git a/Cargo.toml b/Cargo.toml index 312f46d..7547f1b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "crates/*", "crates/bpe/benchmarks", "crates/bpe/tests", + "crates/hash-sorted-map/benchmarks", ] resolver = "2" diff --git a/crates/hash-sorted-map/Cargo.toml b/crates/hash-sorted-map/Cargo.toml index 84ffa02..6eac82f 100644 --- a/crates/hash-sorted-map/Cargo.toml +++ b/crates/hash-sorted-map/Cargo.toml @@ -8,3 +8,5 @@ repository = "https://github.com/github/rust-gems" license = "MIT" keywords = ["hashmap", "sorted", "merge", "simd"] categories = ["algorithms", "data-structures"] + +[dependencies] diff --git a/crates/hash-sorted-map/OPTIMIZATIONS.md b/crates/hash-sorted-map/OPTIMIZATIONS.md index 0b04520..9019582 100644 --- a/crates/hash-sorted-map/OPTIMIZATIONS.md +++ b/crates/hash-sorted-map/OPTIMIZATIONS.md @@ -4,8 +4,8 @@ `HashSortedMap` is a Swiss-table-inspired hash map that uses **overflow chaining** (instead of open addressing), **SIMD group scanning** (NEON/SSE2), -a **slot-hint fast path**, and an **optimized growth strategy**. It is generic -over key type, value type, and hash builder. +and an **optimized growth strategy**. It is generic over key type, value type, +and hash builder. This document analyzes the design trade-offs versus [hashbrown](https://github.com/rust-lang/hashbrown) and records the @@ -38,7 +38,6 @@ experimental results that guided the current design. │ • Overflow chaining (linked groups) │ │ • 8-byte groups with NEON/SSE2/scalar SIMD scan │ │ • EMPTY / FULL tag states only (insertion-only, no deletion) │ -│ • Slot-hint fast path │ └──────────────────────────────────────────────────────────────────┘ ``` @@ -106,17 +105,33 @@ the overflow path. SIMD version** by pessimizing NEON code generation. Removed from the SIMD implementation, kept in the scalar version. -### 7. Slot Hint Fast Path (Unique to HashSortedMap) +### 7. 
Slot Hint Fast Path ❌ Removed -HashSortedMap checks a preferred slot before scanning the group: +Originally, HashSortedMap checked a preferred slot before scanning the group: ```rust let hint = slot_hint(hash); // 3 bits from hash → slot index if ctrl[hint] == EMPTY { /* direct insert */ } if ctrl[hint] == tag && keys[hint] == key { /* direct hit */ } ``` -hashbrown does **not** have this optimization — it always does a full SIMD -group scan. The reason why the performance is different is probably due to the different overflow strategies and the different load factors. +**Experimental finding**: This scalar check **hurts performance** on random +workloads. The branch predictor cannot help because random keys map to random +slots, making the hint check a 50/50 branch that pollutes the branch +predictor. SIMD-only scanning (match_tag + match_empty) is uniformly fast +regardless of key distribution. + +**Structural benefit of removal**: Without the slot hint, inserts always +append to the first empty slot. This guarantees that occupied slots are +**packed contiguously from the beginning** of each group (no gaps). This +invariant enables: +- `count_occupied()`: a single `leading_zeros()` on the ctrl word replaces + bitmask scanning to find the next free slot or count entries +- Simpler `insert_for_grow()`: just write at position `count_occupied()` +- Simpler iteration: occupied slots are always `0..count_occupied()` +- Simpler `sort_by_hash()`: no need to compact gaps before sorting + +**Current state**: Slot hint is fully removed. All paths use SIMD group +scanning for lookups and `count_occupied()` for finding the insertion point. ### 8. Overflow Reserve Sizing ✅ Validated @@ -159,13 +174,93 @@ entropy in both halves. 
Also changed trigram generation to use ## Summary of Impact -| Change | Effect on insert time | -|----------------------------|------------------------------| -| Capacity sizing fix | **−50%** (biggest win) | -| Optimized growth path | **−10%** on growth scenarios | -| SIMD group scanning | **−5%** | -| Branch hints (scalar only) | **−2–6%** | -| IdentityHasher fix | Enabled fair comparison | +| Change | Effect | +|---------------------------------|-------------------------------------| +| Capacity sizing fix | **−50%** insert time (biggest win) | +| Optimized growth path | **2× faster** growth than hashbrown in the earlier aarch64 (Apple M-series) run; ~3% slower in the x86_64 snapshot below | +| SIMD group scanning | **−5%** insert time | +| Slot hint removal | **−25%** merge latency, contiguous packing | +| Branch hints (scalar only) | **−2–6%** | +| IdentityHasher fix | Enabled fair comparison | -The current HashSortedMap **matches hashbrown+FxHash** on pre-sized inserts, -**beats all hashbrown variants** on overwrites, and has **2× faster growth**. +--- + +## Benchmark Results (local x86_64 snapshot) + +Hardware used for the current local snapshot: + +- CPU: Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +- Architecture: x86_64 +- Topology: 1 socket, 1 core, 2 threads +- CPU frequency range: 800 MHz to 2800 MHz +- Memory: 7.8 GiB RAM + +### Insert (1000 trigrams, pre-sized) + +| Implementation | Time (µs) | vs hashbrown | +|----------------------|-----------|--------------| +| FoldHashMap | 13.88 | −5% | +| FxHashMap | 14.60 | ~0% | +| hashbrown+Identity | 14.44 | baseline | +| hashbrown::HashMap | 14.55 | +1% | +| std::HashMap+FNV | 15.55 | +8% | +| AHashMap | 15.59 | +8% | +| **HashSortedMap** | **9.40** | **−35%** | +| std::HashMap | 25.26 | +75% | + +### Reinsert (1000 trigrams, all keys exist) + +| Implementation | Time (µs) | +|----------------------|-----------| +| **HashSortedMap** | **6.59** | +| hashbrown+Identity | 6.95 | + +### Growth (128 → 1000 trigrams, 3 resize rounds) + +| Implementation | Time (µs) | 
+|----------------------|-----------| +| hashbrown+Identity | 26.66 | +| **HashSortedMap** | **27.50** | + +### Count (4000 trigrams, mixed insert/update) + +| Implementation | Time (µs) | +|----------------------------------|-----------| +| hashbrown+Identity entry() | 15.49 | +| **HashSortedMap get_or_default** | **15.88** | +| **HashSortedMap entry().or_default()** | **16.15** | + +### Iteration (1000 trigrams) + +| Implementation | Time (µs) | +|-------------------------------|-----------| +| **HashSortedMap iter()** | **3.02** | +| hashbrown+Identity iter() | 3.04 | +| **HashSortedMap into_iter()** | **3.03** | +| hashbrown+Identity into_iter()| 3.56 | + +### Sort (100K trigrams) + +| Implementation | Time (ms) | +|-----------------------------|-----------| +| **HashSortedMap sort_by_hash** | **1.66** | +| Vec::sort_unstable | 2.20 | + +### Merge (100 maps × 100K keys each → sorted output) + +| Implementation | Time (ms) | vs HSM merge+sort | +|-----------------------------------|-----------|--------------------| +| hashbrown merge presized | 160.79 | +6% | +| **HashSortedMap merge presized** | **117.01**| **−23%** | +| **HashSortedMap merge (no sort)** | **141.57**| **−7%** | +| hashbrown merge | 163.59 | +7% | +| **HashSortedMap merge + sort** | **152.34**| **baseline** | +| hashbrown merge + Vec sort | 193.37 | +27% | +| k-way merge sorted vecs | 445 | +192% | + +**Key takeaways:** +- Pre-sized insert is **~35% faster** than hashbrown+Identity +- Reinsert and iter paths are now close to parity with hashbrown+Identity +- Growth path is currently **~3% slower** than hashbrown+Identity +- sort_by_hash is **~24% faster** than Vec::sort_unstable +- merge + sort is **~21% faster** than hashbrown merge + Vec sort diff --git a/crates/hash-sorted-map/README.md b/crates/hash-sorted-map/README.md index ebd5ef6..bbf6e3b 100644 --- a/crates/hash-sorted-map/README.md +++ b/crates/hash-sorted-map/README.md @@ -29,8 +29,8 @@ keys, which means: - **Overflow chaining** 
instead of open addressing — groups that fill up link to overflow groups rather than probing into neighbours. -- **Slot hint** — a preferred slot index derived from the hash, checked before - scanning the group. Gives a direct hit on most inserts at low load. +- **Contiguous packing** — occupied slots are always packed from position 0 + with no gaps, enabling a single `leading_zeros()` to find the next free slot. - **SIMD group scanning** — uses NEON on aarch64, SSE2 on x86\_64, and a scalar fallback elsewhere to scan 8–16 control bytes in parallel. - **AoS group layout** — each group stores its control bytes, keys, and values @@ -42,45 +42,32 @@ keys, which means: ## Benchmark results -All benchmarks insert 1000 random trigram hashes (scrambled with -`folded_multiply`) into maps with various configurations. Measured on Apple -M-series (aarch64). - -### Insert 1000 trigrams — pre-sized, no growth - -| Rank | Map | Time (µs) | vs best | -|------|-----|-----------|---------| -| 🥇 | FoldHashMap | 2.44 | — | -| 🥈 | FxHashMap | 2.61 | +7% | -| 🥉 | hashbrown::HashMap | 2.67 | +9% | -| 4 | **HashSortedMap** | **2.71** | +11% | -| 5 | hashbrown+Identity | 2.74 | +12% | -| 6 | std::HashMap+FNV | 3.27 | +34% | -| 7 | AHashMap | 3.22 | +32% | -| 8 | std::HashMap | 8.49 | +248% | - -### Re-insert same keys (all overwrites) - -| Map | Time (µs) | -|-----|-----------| -| **HashSortedMap** | **2.36** ✅ | -| hashbrown+Identity | 2.58 | - -### Growth from small (`with_capacity(128)`, 3 resize rounds) - -| Map | Time (µs) | Growth penalty | -|-----|-----------|----------------| -| **HashSortedMap** | **4.85** | +2.14 | -| hashbrown+Identity | 9.77 | +7.03 | - -### Key takeaways - -- **HashSortedMap matches the fastest hashbrown configurations** on pre-sized - first-time inserts and is **the fastest for overwrites**. -- **Growth is ~2× faster** than hashbrown thanks to the optimized - `insert_for_grow` path that skips duplicate checking and uses raw copies. 
-- The remaining gap to FoldHashMap (~11%) comes from foldhash's extremely - efficient hash function that pipelines well with hashbrown's SIMD scan. +Latest local Criterion snapshot from this repository's +`target/criterion` outputs (lower is better): + +Hardware used for this snapshot: + +- CPU: Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +- Architecture: x86_64 +- Topology: 1 socket, 1 core, 2 threads +- CPU frequency range: 800 MHz to 2800 MHz +- Memory: 7.8 GiB RAM + +| Scenario | HashSortedMap | Comparison | Result | +| :------------------------------------------- | ------------: | :------------------------------------- | :---------- | +| Insert 1000 trigrams (pre-sized) | 9.40 µs | hashbrown::HashMap: 14.55 µs | ~35% faster | +| Grow from capacity 128 | 27.50 µs | hashbrown+Identity: 26.66 µs | ~3% slower | +| Count 4000 trigrams (`entry().or_default()`) | 16.15 µs | hashbrown+Identity `entry()`: 15.49 µs | ~4% slower | +| Iterate 1000 trigrams (`iter()`) | 3.02 µs | hashbrown+Identity `iter()`: 3.04 µs | ~1% faster | +| Sort 100000 trigrams by hash | 1.66 ms | `Vec::sort_unstable`: 2.20 ms | ~24% faster | +| Merge 100 sorted maps + final sort | 152.34 ms | hashbrown merge + vec sort: 193.37 ms | ~21% faster | + +Key takeaways: + +- Pre-sized inserts, sorting, and merge+sort remain the strongest paths. +- Iteration is now roughly on par with `hashbrown+Identity`. +- Growth and count/update workloads are currently slightly slower than + `hashbrown+Identity` in this run. 
## Running diff --git a/crates/hash-sorted-map/benchmarks/Cargo.toml b/crates/hash-sorted-map/benchmarks/Cargo.toml index 9ee37dc..91019a4 100644 --- a/crates/hash-sorted-map/benchmarks/Cargo.toml +++ b/crates/hash-sorted-map/benchmarks/Cargo.toml @@ -21,3 +21,4 @@ ahash = "0.8" hashbrown = "0.15" foldhash = "0.1" fnv = "1" +itertools = "0.14" diff --git a/crates/hash-sorted-map/benchmarks/performance.rs b/crates/hash-sorted-map/benchmarks/performance.rs index 5a04801..07cfc2f 100644 --- a/crates/hash-sorted-map/benchmarks/performance.rs +++ b/crates/hash-sorted-map/benchmarks/performance.rs @@ -1,6 +1,9 @@ +use std::hash::BuildHasher; + use criterion::{criterion_group, criterion_main, BatchSize, Criterion}; use hash_sorted_map::HashSortedMap; -use hash_sorted_map_benchmarks::{random_trigram_hashes, IdentityBuildHasher}; +use hash_sorted_map_benchmarks::{folded_multiply, random_trigram_hashes, IdentityBuildHasher}; +use rand::RngExt; fn trigrams() -> Vec { random_trigram_hashes(1000) @@ -291,11 +294,311 @@ fn bench_count(c: &mut Criterion) { group.finish(); } +fn bench_iter(c: &mut Criterion) { + let trigrams = trigrams(); + + let mut group = c.benchmark_group("iter_1000_trigrams"); + + group.bench_function("hashbrown+Identity iter()", |b| { + b.iter_batched( + || { + let mut map = + hashbrown::HashMap::::with_capacity_and_hasher( + trigrams.len(), + Default::default(), + ); + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + |map| { + let mut sum = 0usize; + for (&k, &v) in &map { + sum = sum.wrapping_add(v).wrapping_add(k as usize); + } + sum + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("HashSortedMap iter()", |b| { + b.iter_batched( + || { + let mut map = HashSortedMap::with_capacity_and_hasher( + trigrams.len(), + IdentityBuildHasher::default(), + ); + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + |map| { + let mut sum = 0usize; + for (&k, &v) in &map { + sum = 
sum.wrapping_add(v).wrapping_add(k as usize); + } + sum + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("hashbrown+Identity into_iter()", |b| { + b.iter_batched( + || { + let mut map = + hashbrown::HashMap::::with_capacity_and_hasher( + trigrams.len(), + Default::default(), + ); + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + |map| { + let mut sum = 0usize; + for (k, v) in map { + sum = sum.wrapping_add(v).wrapping_add(k as usize); + } + sum + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("HashSortedMap into_iter()", |b| { + b.iter_batched( + || { + let mut map = HashSortedMap::with_capacity_and_hasher( + trigrams.len(), + IdentityBuildHasher::default(), + ); + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + |map| { + let mut sum = 0usize; + for (k, v) in map { + sum = sum.wrapping_add(v).wrapping_add(k as usize); + } + sum + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +fn bench_sort(c: &mut Criterion) { + let keys = random_trigram_hashes(100_000); + let hasher = IdentityBuildHasher::default(); + let mut group = c.benchmark_group("sort_100000_trigrams"); + + group.bench_function("Vec::sort_unstable", |b| { + b.iter(|| { + let mut vec: Vec<_> = keys.iter().enumerate().map(|(i, &key)| (key, i)).collect(); + vec.sort_unstable_by(|a, b| { + let ha = hasher.hash_one(a.0); + let hb = hasher.hash_one(b.0); + (ha, a.0).cmp(&(hb, b.0)) + }); + vec + }); + }); + + group.bench_function("HashSortedMap sort_by_hash", |b| { + b.iter(|| { + let mut map = + HashSortedMap::with_capacity_and_hasher(keys.len(), IdentityBuildHasher::default()); + for (i, &key) in keys.iter().enumerate() { + map.insert(key, i); + } + map.sort_by_hash(); + map + }); + }); + + group.finish(); +} + +fn bench_merge_sort(c: &mut Criterion) { + const NUM_MAPS: usize = 100; + const KEYS_PER_MAP: usize = 100_000; + + // Pre-generate 100 key vectors with random u32 values 
scrambled via folded_multiply. + let maps_data: Vec> = (0..NUM_MAPS) + .map(|_| { + let mut rng = rand::rng(); + (0..KEYS_PER_MAP) + .map(|_| { + folded_multiply(rng.random_range(0..1_000_000u32) as u64, 0x243f6a8885a308d3) + as u32 + }) + .collect() + }) + .collect(); + + // Pre-build sorted containers from the input data. + let hash_maps: Vec<_> = maps_data + .into_iter() + .map(|keys| { + let mut map = HashSortedMap::with_hasher(IdentityBuildHasher::default()); + for key in keys { + *map.entry(key).or_default() += 1u32; + } + map + }) + .collect(); + + let hasher = IdentityBuildHasher::default(); + let mut group = c.benchmark_group("merge_100_maps_sorted"); + group.sample_size(10); + + // ── 1. HashSortedMap: merge sorted containers, then sort_by_hash ─ + group.bench_function("HashSortedMap merge + sort_by_hash", |b| { + b.iter(|| { + let mut map: HashSortedMap = + HashSortedMap::with_hasher(IdentityBuildHasher::default()); + for container in &hash_maps { + for (&key, &value) in container { + *map.entry(key).or_default() += value; + } + } + map.sort_by_hash(); + map + }); + }); + + // ── 2. K-way merge over pre-sorted containers ──────────────────── + group.bench_function("k-way merge sorted containers", |b| { + use itertools::Itertools; + + b.iter(|| { + // Phase 1: build per-container sorted (hash, key, count) vectors. + let sorted_vecs: Vec> = hash_maps + .iter() + .map(|container| { + let mut vec: Vec<(u64, u32, u32)> = container + .iter() + .map(|(&k, &v)| (hasher.hash_one(k), k, v)) + .collect(); + vec.sort_unstable_by_key(|&(h, _, _)| h); + vec + }) + .collect(); + + // Phase 2: k-merge + group_by to aggregate counts. + let result: Vec<(u32, u32)> = sorted_vecs + .into_iter() + .map(|v| v.into_iter()) + .kmerge_by(|a, b| (a.0, a.1) <= (b.0, b.1)) + .chunk_by(|&(_, key, _)| key) + .into_iter() + .map(|(key, group)| (key, group.map(|(_, _, c)| c).sum())) + .collect(); + result + }); + }); + + // ── 3. 
hashbrown HashMap merge, then sort into Vec ────────────── + group.bench_function("hashbrown merge + Vec sort", |b| { + b.iter(|| { + let mut map = hashbrown::HashMap::::with_hasher( + IdentityBuildHasher::default(), + ); + for container in &hash_maps { + for (&key, &value) in container { + *map.entry(key).or_default() += value; + } + } + let mut vec: Vec<(u32, u32)> = map.into_iter().collect(); + vec.sort_unstable_by(|a, b| { + let ha = hasher.hash_one(a.0); + let hb = hasher.hash_one(b.0); + (ha, a.0).cmp(&(hb, b.0)) + }); + vec + }); + }); + + // ── 4. hashbrown HashMap merge only (no sort) ─────────────────── + group.bench_function("hashbrown merge", |b| { + b.iter(|| { + let mut map = hashbrown::HashMap::::with_hasher( + IdentityBuildHasher::default(), + ); + for container in &hash_maps { + for (&key, &value) in container { + *map.entry(key).or_default() += value; + } + } + map + }); + }); + + // ── 5. HashSortedMap merge only (no sort) ─────────────────────── + group.bench_function("HashSortedMap merge", |b| { + b.iter(|| { + let mut map: HashSortedMap = + HashSortedMap::with_hasher(IdentityBuildHasher::default()); + for container in &hash_maps { + for (&key, &value) in container { + *map.entry(key).or_default() += value; + } + } + map + }); + }); + + // ── 6. hashbrown presized merge only ──────────────────────────── + group.bench_function("hashbrown merge presized", |b| { + b.iter(|| { + let mut map = + hashbrown::HashMap::::with_capacity_and_hasher( + 1_000_000, + IdentityBuildHasher::default(), + ); + for container in &hash_maps { + for (&key, &value) in container { + *map.entry(key).or_default() += value; + } + } + map + }); + }); + + // ── 7. 
HashSortedMap presized merge only ───────────────────────── + group.bench_function("HashSortedMap merge presized", |b| { + b.iter(|| { + let mut map: HashSortedMap = + HashSortedMap::with_capacity_and_hasher(1_000_000, IdentityBuildHasher::default()); + for container in &hash_maps { + for (&key, &value) in container { + *map.entry(key).or_default() += value; + } + } + map + }); + }); + + group.finish(); +} + criterion_group!( benches, bench_insert, bench_reinsert, bench_grow, - bench_count + bench_count, + bench_iter, + bench_sort, + bench_merge_sort ); criterion_main!(benches); diff --git a/crates/hash-sorted-map/src/group.rs b/crates/hash-sorted-map/src/group.rs new file mode 100644 index 0000000..c1ba315 --- /dev/null +++ b/crates/hash-sorted-map/src/group.rs @@ -0,0 +1,23 @@ +use core::mem::MaybeUninit; + +use super::group_ops::{CTRL_EMPTY, GROUP_SIZE}; + +pub(crate) const NO_OVERFLOW: u32 = u32::MAX; + +pub(crate) struct Group { + pub(crate) ctrl: [u8; GROUP_SIZE], + pub(crate) keys: [MaybeUninit; GROUP_SIZE], + pub(crate) values: [MaybeUninit; GROUP_SIZE], + pub(crate) overflow: u32, +} + +impl Group { + pub(crate) fn new() -> Self { + Self { + ctrl: [CTRL_EMPTY; GROUP_SIZE], + keys: [const { MaybeUninit::uninit() }; GROUP_SIZE], + values: [const { MaybeUninit::uninit() }; GROUP_SIZE], + overflow: NO_OVERFLOW, + } + } +} diff --git a/crates/hash-sorted-map/src/group_ops.rs b/crates/hash-sorted-map/src/group_ops.rs index a1b92ec..cc14813 100644 --- a/crates/hash-sorted-map/src/group_ops.rs +++ b/crates/hash-sorted-map/src/group_ops.rs @@ -38,31 +38,11 @@ mod arch { } } - #[inline(always)] - pub fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> Mask { - match_tag(ctrl, super::CTRL_EMPTY) - } - - /// Mask of slots whose ctrl byte has the high bit set (occupied). - /// Uses SSE2 `_mm_movemask_epi8` which extracts the top bit of each byte. 
- #[inline(always)] - pub fn match_full(ctrl: &[u8; GROUP_SIZE]) -> Mask { - unsafe { - let group = x86::_mm_loadu_si128(ctrl.as_ptr() as *const x86::__m128i); - x86::_mm_movemask_epi8(group) as u32 - } - } - #[inline(always)] pub fn lowest(mask: Mask) -> usize { mask.trailing_zeros() as usize } - #[inline(always)] - pub fn clear_slot(mask: Mask, slot: usize) -> Mask { - mask & !(1u32 << slot) - } - #[inline(always)] pub fn next_match(mask: &mut Mask) -> Option { if *mask == 0 { @@ -72,6 +52,13 @@ mod arch { *mask &= *mask - 1; Some(i) } + + /// Number of occupied slots: the non-zero ctrl bytes packed contiguously from slot 0 (the low-order bytes of the word). + #[inline(always)] + pub fn count_occupied(ctrl: &[u8; GROUP_SIZE]) -> usize { + let word = u128::from_ne_bytes(*ctrl); + GROUP_SIZE - (word.leading_zeros() / 8) as usize + } } #[cfg(target_arch = "aarch64")] @@ -89,34 +76,11 @@ mod arch { } } - #[inline(always)] - pub fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> Mask { - unsafe { - let group = neon::vld1_u8(ctrl.as_ptr()); - let cmp = neon::vceq_u8(group, neon::vdup_n_u8(0)); - neon::vget_lane_u64(neon::vreinterpret_u64_u8(cmp), 0) & 0x8080808080808080 - } - } - - /// Mask of slots whose ctrl byte has the high bit set (occupied). - #[inline(always)] - pub fn match_full(ctrl: &[u8; GROUP_SIZE]) -> Mask { - unsafe { - let group = neon::vld1_u8(ctrl.as_ptr()); - neon::vget_lane_u64(neon::vreinterpret_u64_u8(group), 0) & 0x8080808080808080 - } - } - #[inline(always)] pub fn lowest(mask: Mask) -> usize { (mask.trailing_zeros() >> 3) as usize } - #[inline(always)] - pub fn clear_slot(mask: Mask, slot: usize) -> Mask { - mask & !(0x80u64 << (slot * 8)) - } - #[inline(always)] pub fn next_match(mask: &mut Mask) -> Option { if *mask == 0 { @@ -126,6 +90,13 @@ mod arch { *mask &= *mask - 1; Some(i) } + + /// Number of occupied slots: the non-zero ctrl bytes packed contiguously from slot 0 (the low-order bytes of the word on little-endian targets; NOTE(review): confirm big-endian aarch64 is unsupported). 
+ #[inline(always)] + pub fn count_occupied(ctrl: &[u8; GROUP_SIZE]) -> usize { + let word = u64::from_ne_bytes(*ctrl); + GROUP_SIZE - (word.leading_zeros() / 8) as usize + } } #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] @@ -140,29 +111,11 @@ mod arch { (xor.wrapping_sub(0x0101010101010101)) & !xor & 0x8080808080808080 } - #[inline(always)] - pub fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> Mask { - let word = u64::from_ne_bytes(*ctrl); - !word & 0x8080808080808080 - } - - /// Mask of slots whose ctrl byte has the high bit set (occupied). - #[inline(always)] - pub fn match_full(ctrl: &[u8; GROUP_SIZE]) -> Mask { - let word = u64::from_ne_bytes(*ctrl); - word & 0x8080808080808080 - } - #[inline(always)] pub fn lowest(mask: Mask) -> usize { (mask.trailing_zeros() >> 3) as usize } - #[inline(always)] - pub fn clear_slot(mask: Mask, slot: usize) -> Mask { - mask & !(0x80u64 << (slot * 8)) - } - #[inline(always)] pub fn next_match(mask: &mut Mask) -> Option { if *mask == 0 { @@ -172,6 +125,13 @@ mod arch { *mask &= *mask - 1; Some(i) } + + /// Number of occupied slots: the non-zero ctrl bytes packed contiguously from slot 0. NOTE(review): `from_ne_bytes` only puts slot 0 in the low-order byte on little-endian targets — confirm big-endian is unsupported or use `from_le_bytes`. 
+ #[inline(always)] + pub fn count_occupied(ctrl: &[u8; GROUP_SIZE]) -> usize { + let word = u64::from_ne_bytes(*ctrl); + GROUP_SIZE - (word.leading_zeros() / 8) as usize + } } pub use arch::*; diff --git a/crates/hash-sorted-map/src/hash_sorted_map.rs b/crates/hash-sorted-map/src/hash_sorted_map.rs index 26a4ecd..0cc37b4 100644 --- a/crates/hash-sorted-map/src/hash_sorted_map.rs +++ b/crates/hash-sorted-map/src/hash_sorted_map.rs @@ -4,9 +4,10 @@ use std::collections::hash_map::RandomState; use std::hash::{BuildHasher, Hash}; use std::marker::PhantomData; +use super::group::Group; use super::group_ops::{self, CTRL_EMPTY, GROUP_SIZE}; -const NO_OVERFLOW: u32 = u32::MAX; +pub(crate) use super::group::NO_OVERFLOW; // ── Helpers ───────────────────────────────────────────────────────────────── @@ -15,38 +16,19 @@ fn tag(hash: u64) -> u8 { (hash as u8) | 0x80 } -#[inline] -fn slot_hint(hash: u64) -> usize { - ((hash >> 7) & (GROUP_SIZE as u64 - 1)) as usize -} - -struct Group { - ctrl: [u8; GROUP_SIZE], - keys: [MaybeUninit; GROUP_SIZE], - values: [MaybeUninit; GROUP_SIZE], - overflow: u32, -} - -impl Group { - fn new() -> Self { - Self { - ctrl: [CTRL_EMPTY; GROUP_SIZE], - keys: [const { MaybeUninit::uninit() }; GROUP_SIZE], - values: [const { MaybeUninit::uninit() }; GROUP_SIZE], - overflow: NO_OVERFLOW, - } - } -} +// ──────────────────────────────────────────────────────────────────────── +// HashSortedMap +// ──────────────────────────────────────────────────────────────────────── /// Insertion-only hash map with SIMD group scanning. /// /// Uses NEON on aarch64, SSE2 on x86_64, scalar fallback elsewhere. /// Generic over key type `K`, value type `V`, and hash builder `S`. 
pub struct HashSortedMap { - groups: Box<[Group]>, - num_groups: u32, - n_bits: u32, - len: usize, + pub(crate) groups: Box<[Group]>, + pub(crate) num_groups: u32, + pub(crate) n_bits: u32, + pub(crate) len: usize, hash_builder: S, } @@ -75,19 +57,24 @@ impl HashSortedMap { let adjusted = (capacity as f64 / group_ops::MAX_FILL).ceil() as usize; let min_groups = (adjusted.div_ceil(GROUP_SIZE)).max(1).next_power_of_two(); let n_bits = min_groups.trailing_zeros().max(1); - let (groups, num_primary) = Self::alloc_groups(n_bits); + let (groups, num_groups) = Self::alloc_groups(n_bits); Self { groups, - num_groups: num_primary, + num_groups, n_bits, len: 0, hash_builder, } } - /// Allocate a fully default-initialized boxed slice sized for `n_bits` primary groups - /// plus the standard 12.5% overflow reserve. Returns the slice and the number of - /// primary groups (which is also the initial in-use count). + pub fn len(&self) -> usize { + self.len + } + + pub fn is_empty(&self) -> bool { + self.len == 0 + } + fn alloc_groups(n_bits: u32) -> (Box<[Group]>, u32) { let num_primary = 1usize << n_bits; let total = num_primary + num_primary / 8 + 1; @@ -97,16 +84,123 @@ impl HashSortedMap { } #[inline] - fn group_index(&self, hash: u64) -> usize { + pub(crate) fn group_index(&self, hash: u64) -> usize { (hash >> (64 - self.n_bits)) as usize } +} - pub fn len(&self) -> usize { - self.len - } +impl HashSortedMap { + /// Sort all entries within each primary group chain by their hash value, + /// breaking ties by key. + /// + /// After sorting, iteration visits entries in hash order within each + /// primary group (and since primary groups are visited in group-index + /// order, the overall iteration is in full hash order). + /// + /// # Complexity + /// + /// Each of `n` elements hashes uniformly into one of `m` primary groups, + /// so chain lengths follow `X_i ~ Binomial(n, 1/m)` with `E[X_i] = n/m`. 
+ /// With a quadratic sort per chain the total expected cost is: + /// + /// ```text + /// Σ E[X_i²] = m · (Var[X_i] + E[X_i]²) + /// = m · (n/m · (1 − 1/m) + n²/m²) + /// = n · (1 − 1/m) + n²/m + /// ``` + /// + /// Dividing by `n` gives the expected cost per element: `1 + n/m` (for + /// `m ≫ 1`). Since `n/m` is the average chain length, bounded by + /// `GROUP_SIZE / MAX_FILL`, the per-element cost stays constant. + pub fn sort_by_hash(&mut self) { + let num_primary = 1usize << self.n_bits; + let mut chain: Vec = Vec::new(); + let mut hashes: Vec = Vec::new(); + + for primary_gi in 0..num_primary { + chain.clear(); + hashes.clear(); + + // Collect group indices in this chain. + let mut gi = primary_gi; + loop { + chain.push(gi as u32); + let overflow = self.groups[gi].overflow; + if overflow == NO_OVERFLOW { + break; + } + gi = overflow as usize; + } + // All groups before the last are fully packed (overflow is only + // allocated when the previous group is full). Compute hashes for + // those directly. + for &cgi in &chain[..chain.len() - 1] { + let g = &self.groups[cgi as usize]; + for slot in 0..GROUP_SIZE { + let hash = self + .hash_builder + .hash_one(unsafe { g.keys[slot].assume_init_ref() }); + hashes.push(hash); + } + } + let g = + &self.groups[*chain.last().expect("chain should have at least one group") as usize]; + for slot in 0..GROUP_SIZE { + if g.ctrl[slot] == CTRL_EMPTY { + break; + } + let hash = self + .hash_builder + .hash_one(unsafe { g.keys[slot].assume_init_ref() }); + hashes.push(hash); + } - pub fn is_empty(&self) -> bool { - self.len == 0 + let n = hashes.len(); + // Insertion sort by (hash, key). + for i in 1..n { + // Extract element at position i. + let cur_hash = hashes[i]; + let (gi, si) = chain_slot(&chain, i); + let cur_key = unsafe { self.groups[gi].keys[si].assume_init_read() }; + let cur_val = unsafe { self.groups[gi].values[si].assume_init_read() }; + // Find insertion point via linear scan backward. 
+ let mut j = i; + while j > 0 { + let (gj, sj) = chain_slot(&chain, j - 1); + let prev_key = unsafe { self.groups[gj].keys[sj].assume_init_ref() }; + if (hashes[j - 1], prev_key) <= (cur_hash, &cur_key) { + break; + } + j -= 1; + } + if j < i { + // Shift positions j..i up by one. + hashes.copy_within(j..i, j + 1); + for pos in (j..i).rev() { + let (src_g, src_s) = chain_slot(&chain, pos); + let (dst_g, dst_s) = chain_slot(&chain, pos + 1); + unsafe { + let k = std::ptr::read(&self.groups[src_g].keys[src_s]); + let v = std::ptr::read(&self.groups[src_g].values[src_s]); + self.groups[dst_g].keys[dst_s] = k; + self.groups[dst_g].values[dst_s] = v; + } + } + } + // Insert at position j (or write back to i if already in place). + hashes[j] = cur_hash; + let (gj, sj) = chain_slot(&chain, j); + self.groups[gj].keys[sj] = MaybeUninit::new(cur_key); + self.groups[gj].values[sj] = MaybeUninit::new(cur_val); + } + // Rebuild ctrl/tag bytes from the sorted hashes so that + // get/insert/entry still work after sorting. + // This adds a small performance penalty of maybe 6%. + for (pos, &h) in hashes.iter().enumerate() { + let (gi, si) = chain_slot(&chain, pos); + self.groups[gi].ctrl[si] = tag(h); + } + } } } @@ -162,26 +256,11 @@ impl HashSortedMap { fn insert_hashed(&mut self, hash: u64, key: K, value: V) -> Option { let tag = tag(hash); - let hint = slot_hint(hash); let mut gi = self.group_index(hash); loop { let group = &mut self.groups[gi]; - // Fast path: check preferred slot. - let c = group.ctrl[hint]; - if c == CTRL_EMPTY { - group.ctrl[hint] = tag; - group.keys[hint] = MaybeUninit::new(key); - group.values[hint] = MaybeUninit::new(value); - self.len += 1; - return None; - } - if c == tag && unsafe { group.keys[hint].assume_init_ref() } == &key { - let old = std::mem::replace(unsafe { group.values[hint].assume_init_mut() }, value); - return Some(old); - } - // Slow path: SIMD scan group for tag match. + // SIMD scan group for tag match. 
let mut tag_mask = group_ops::match_tag(&group.ctrl, tag); - tag_mask = group_ops::clear_slot(tag_mask, hint); while let Some(i) = group_ops::next_match(&mut tag_mask) { if unsafe { group.keys[i].assume_init_ref() } == &key { let old = @@ -190,12 +269,11 @@ impl HashSortedMap { } } // Check for empty slot in this group. - let empty_mask = group_ops::match_empty(&group.ctrl); - if empty_mask != 0 { - let i = group_ops::lowest(empty_mask); - group.ctrl[i] = tag; - group.keys[i] = MaybeUninit::new(key); - group.values[i] = MaybeUninit::new(value); + let occupied_slots = group_ops::count_occupied(&group.ctrl); + if occupied_slots != GROUP_SIZE { + group.ctrl[occupied_slots] = tag; + group.keys[occupied_slots] = MaybeUninit::new(key); + group.values[occupied_slots] = MaybeUninit::new(value); self.len += 1; return None; } @@ -214,9 +292,9 @@ impl HashSortedMap { self.num_groups += 1; self.groups[gi].overflow = new_gi as u32; let group = &mut self.groups[new_gi]; - group.ctrl[hint] = tag; - group.keys[hint] = MaybeUninit::new(key); - group.values[hint] = MaybeUninit::new(value); + group.ctrl[0] = tag; + group.keys[0] = MaybeUninit::new(key); + group.values[0] = MaybeUninit::new(value); self.len += 1; return None; } @@ -229,31 +307,20 @@ impl HashSortedMap { Q: Eq + ?Sized, { let tag = tag(hash); - let hint = slot_hint(hash); let mut gi = self.group_index(hash); loop { let group = &self.groups[gi]; - - // Fast path: preferred slot. - let c = group.ctrl[hint]; - if c == tag && unsafe { group.keys[hint].assume_init_ref() }.borrow() == key { - return Some(unsafe { group.values[hint].assume_init_ref() }); - } - - // Slow path: SIMD scan group. + // SIMD scan group for tag match. 
let mut tag_mask = group_ops::match_tag(&group.ctrl, tag); - tag_mask = group_ops::clear_slot(tag_mask, hint); while let Some(i) = group_ops::next_match(&mut tag_mask) { if unsafe { group.keys[i].assume_init_ref() }.borrow() == key { return Some(unsafe { group.values[i].assume_init_ref() }); } } - - if group_ops::match_empty(&group.ctrl) != 0 { + if group.ctrl[GROUP_SIZE - 1] == CTRL_EMPTY { return None; } - if group.overflow == NO_OVERFLOW { return None; } @@ -269,43 +336,26 @@ impl HashSortedMap { /// of `&mut self` until any reallocation (`grow`). fn find_or_insertion_slot(&mut self, hash: u64, key: &K) -> FindResult { let tag = tag(hash); - let hint = slot_hint(hash); let mut gi = self.group_index(hash); loop { let group = &mut self.groups[gi]; - // Fast path: preferred slot. - let c = group.ctrl[hint]; - if c == CTRL_EMPTY { - return FindResult::Vacant(Insertion::Empty { - group: group as *mut _, - slot: hint, - }); - } - if c == tag && unsafe { group.keys[hint].assume_init_ref() } == key { - return FindResult::Found(group.values[hint].as_mut_ptr()); - } - - // Slow path: SIMD scan group for tag match. + // SIMD scan group for tag match. let mut tag_mask = group_ops::match_tag(&group.ctrl, tag); - tag_mask = group_ops::clear_slot(tag_mask, hint); while let Some(i) = group_ops::next_match(&mut tag_mask) { if unsafe { group.keys[i].assume_init_ref() } == key { return FindResult::Found(group.values[i].as_mut_ptr()); } } - // Check for empty slot in this group. - let empty_mask = group_ops::match_empty(&group.ctrl); - if empty_mask != 0 { - let i = group_ops::lowest(empty_mask); + let occupied_slots = group_ops::count_occupied(&group.ctrl); + if occupied_slots != GROUP_SIZE { return FindResult::Vacant(Insertion::Empty { group: group as *mut _, - slot: i, + slot: occupied_slots, }); } - // Group full — follow or report end of chain. 
if group.overflow == NO_OVERFLOW { return FindResult::Vacant(Insertion::NeedsOverflow { @@ -331,8 +381,7 @@ impl HashSortedMap { self.len = 0; for group in &old_groups[..old_num_groups] { - let mut full_mask = group_ops::match_full(&group.ctrl); - while let Some(i) = group_ops::next_match(&mut full_mask) { + for i in 0..group_ops::count_occupied(&group.ctrl) { let hash = self .hash_builder .hash_one(unsafe { group.keys[i].assume_init_ref() }); @@ -348,18 +397,13 @@ impl HashSortedMap { fn insert_for_grow(&mut self, hash: u64, key_src: *const K, value_src: *const V) { let tag = tag(hash); - let mut hint = slot_hint(hash); let gi = self.group_index(hash); let mut group = &mut self.groups[gi]; - loop { - if group.ctrl[hint] == CTRL_EMPTY { - break; - } - let empty_mask = group_ops::match_empty(&group.ctrl); - if empty_mask != 0 { - hint = group_ops::lowest(empty_mask); - break; + let slot = loop { + let occupied = group_ops::count_occupied(&group.ctrl); + if occupied != GROUP_SIZE { + break occupied; } let overflow = group.overflow; if overflow != NO_OVERFLOW { @@ -369,15 +413,15 @@ impl HashSortedMap { group.overflow = new_gi as u32; self.num_groups += 1; group = &mut self.groups[new_gi]; - break; + break 0; } - } - group.ctrl[hint] = tag; + }; + group.ctrl[slot] = tag; unsafe { - group.keys[hint] + group.keys[slot] .as_mut_ptr() .copy_from_nonoverlapping(key_src, 1); - group.values[hint] + group.values[slot] .as_mut_ptr() .copy_from_nonoverlapping(value_src, 1); } @@ -385,6 +429,14 @@ impl HashSortedMap { } } +// ── Chain-slot helpers for sort_by_hash ───────────────────────────────── + +/// Map a flat position (0..chain.len()*GROUP_SIZE) to a (group_index, slot). 
+#[inline] +fn chain_slot(chain: &[u32], pos: usize) -> (usize, usize) { + (chain[pos / GROUP_SIZE] as usize, pos % GROUP_SIZE) +} + // ──────────────────────────────────────────────────────────────────────── // Entry API // ──────────────────────────────────────────────────────────────────────── @@ -510,7 +562,7 @@ impl<'a, K: Hash + Eq, V, S: BuildHasher> VacantEntry<'a, K, V, S> { let (new_gi, new_group) = unsafe { let map = &mut *map; if map.num_groups as usize == map.groups.len() { - return insert_after_grow(map, hash, key, value); + return insert_after_grow(map, key, value); } let new_gi = map.num_groups as usize; map.num_groups += 1; @@ -518,12 +570,12 @@ impl<'a, K: Hash + Eq, V, S: BuildHasher> VacantEntry<'a, K, V, S> { (new_gi, new_group) }; unsafe { - // SAFETY: `tail` was obtained from `&mut self.groups[..]` and + // SAFETY: `tail` was obtained from `&mut groups[..]` and // remains valid because no reallocation occurred between // `entry()` and now (we hold the only `&mut self`). (*tail).overflow = new_gi as u32; } - (new_group, slot_hint(hash)) + (new_group, 0) } }; @@ -541,42 +593,17 @@ impl<'a, K: Hash + Eq, V, S: BuildHasher> VacantEntry<'a, K, V, S> { } /// Cold path: the chain was full, the table is at capacity, and we need to -/// grow before inserting. Re-walks via the slow path after grow. -/// -/// After `grow()` doubles `num_primary` (`n_bits += 1`), our key's new -/// primary group can have at most ~half the old chain's keys, so hitting -/// `NeedsOverflow` again would require `GROUP_SIZE` keys to all collide on -/// one extra bit of hash — essentially impossible for any reasonable hash. -/// (`insert_for_grow` relies on the same assumption to skip its own -/// capacity check.) +/// grow before inserting. Grows the map, then re-walks via `entry()` to find +/// the new insertion slot. 
#[cold] #[inline(never)] fn insert_after_grow( map: &mut HashSortedMap, - hash: u64, key: K, value: V, ) -> &mut V { map.grow(); - match map.find_or_insertion_slot(hash, &key) { - FindResult::Vacant(Insertion::Empty { group, slot }) => { - let tag = tag(hash); - // SAFETY: `group` points into `map.groups` and is valid for `'a`. - unsafe { - let g = &mut *group; - g.ctrl[slot] = tag; - g.keys[slot] = MaybeUninit::new(key); - g.values[slot] = MaybeUninit::new(value); - map.len += 1; - g.values[slot].assume_init_mut() - } - } - // After grow, the new primary group for `key` cannot be full (see - // function docs), and the key wasn't in the table before grow. - FindResult::Vacant(Insertion::NeedsOverflow { .. }) | FindResult::Found(_) => { - unreachable!("post-grow walk must hit an empty slot") - } - } + map.entry(key).or_insert(value) } impl Drop for HashSortedMap { @@ -812,4 +839,115 @@ mod tests { assert_eq!(m.get(&i), Some(&i)); } } + + // ── sort_by_hash tests ────────────────────────────────────────────── + + #[test] + fn sort_by_hash_empty() { + let mut map: HashSortedMap = HashSortedMap::new(); + map.sort_by_hash(); + assert_eq!(map.len(), 0); + } + + #[test] + fn sort_by_hash_single() { + let mut map = HashSortedMap::new(); + map.insert(42u32, "hello"); + map.sort_by_hash(); + assert_eq!(map.len(), 1); + let entries: Vec<_> = map.into_iter().collect(); + assert_eq!(entries, vec![(42, "hello")]); + } + + #[test] + fn sort_by_hash_preserves_entries() { + let mut map = HashSortedMap::new(); + for i in 0..200u32 { + map.insert(i, i * 10); + } + map.sort_by_hash(); + assert_eq!(map.len(), 200); + // Lookups must still work after sorting. 
+ for i in 0..200u32 { + assert_eq!(map.get(&i), Some(&(i * 10)), "get failed for key {i}"); + } + let mut entries: Vec<_> = map.into_iter().collect(); + entries.sort_by_key(|&(k, _)| k); + for i in 0..200u32 { + assert_eq!(entries[i as usize], (i, i * 10), "missing key {i}"); + } + } + + #[test] + fn sort_by_hash_produces_hash_order() { + use std::collections::hash_map::RandomState; + + let hasher = RandomState::new(); + let mut map = HashSortedMap::with_hasher(hasher.clone()); + for i in 0..500u32 { + map.insert(i, i); + } + map.sort_by_hash(); + // Iteration should now yield entries in (hash, key) order. + let mut prev_hash = 0u64; + let mut prev_key = 0u32; + let mut first = true; + for (&k, _) in &map { + let h = hasher.hash_one(k); + if !first { + assert!( + (h, k) >= (prev_hash, prev_key), + "(hash, key) order violated: ({prev_hash:#x}, {prev_key}) > ({h:#x}, {k})" + ); + } + prev_hash = h; + prev_key = k; + first = false; + } + } + + #[test] + fn sort_by_hash_with_overflow() { + // Force overflow chains via fixed hash — all keys collide, so sort + // should produce key order as tie-breaker. + let mut map = HashSortedMap::with_capacity_and_hasher(1, FixedState(0)); + for i in 0..50u32 { + map.insert(i, i); + } + map.sort_by_hash(); + assert_eq!(map.len(), 50); + // All hashes are equal, so entries should be in key order. 
+ let entries: Vec<_> = map.into_iter().collect(); + for i in 0..50u32 { + assert_eq!(entries[i as usize], (i, i), "key order violated at {i}"); + } + } + + #[test] + fn sort_by_hash_with_strings() { + use std::collections::hash_map::RandomState; + + let hasher = RandomState::new(); + let mut map = HashSortedMap::with_hasher(hasher.clone()); + for i in 0..100u32 { + map.insert(format!("key-{i}"), format!("val-{i}")); + } + map.sort_by_hash(); + assert_eq!(map.len(), 100); + let mut prev_hash = 0u64; + let mut prev_key = String::new(); + let mut first = true; + for (k, _) in &map { + let h = hasher.hash_one(k); + if !first { + assert!( + (h, k) >= (prev_hash, &prev_key), + "(hash, key) order violated" + ); + } + prev_hash = h; + prev_key = k.clone(); + first = false; + } + } } diff --git a/crates/hash-sorted-map/src/iter.rs b/crates/hash-sorted-map/src/iter.rs new file mode 100644 index 0000000..e981bad --- /dev/null +++ b/crates/hash-sorted-map/src/iter.rs @@ -0,0 +1,408 @@ +use std::marker::PhantomData; +use std::mem::ManuallyDrop; + +use crate::group_ops::{CTRL_EMPTY, GROUP_SIZE}; + +use super::group::Group; +use super::hash_sorted_map::{HashSortedMap, NO_OVERFLOW}; + +/// State shared by `Iter`, `IterMut`, and `IntoIter`: tracks which primary +/// group we're visiting and where we are within that group's overflow chain. +struct IterCursor { + /// Index of the next primary group to visit (0..num_primary). + primary: u32, + /// Number of primary groups (1 << n_bits). + num_primary: u32, + /// Current position within the group we're scanning: group index in the + /// groups array, and a SIMD bitmask of remaining occupied slots. + current_group: u32, + current_slot: u32, +} + +impl IterCursor { + fn new(n_bits: u32) -> Self { + let num_primary = 1u32 << n_bits; + Self { + primary: 0, + num_primary, + current_group: 0, + current_slot: 0, + } + } + + /// Advance to the next occupied slot, returning `(group_index, slot)`. 
+ /// Visits primary groups 0..num_primary in order; for each, follows the + /// overflow chain. Within each group, yields occupied slots via bitmask. + fn next_slot(&mut self, groups: &[Group]) -> Option<(usize, usize)> { + loop { + let gi = self.current_group as usize; + if self.current_slot < GROUP_SIZE as u32 { + let slot = self.current_slot; + if groups[gi].ctrl[slot as usize] != CTRL_EMPTY { + self.current_slot += 1; + return Some((gi, slot as usize)); + } + } + // Current group exhausted — try overflow chain. + if gi < groups.len() && groups[gi].overflow != NO_OVERFLOW { + self.current_group = groups[gi].overflow; + self.current_slot = 0; + continue; + } + self.primary += 1; + // No more overflow — move to next primary group. + if self.primary >= self.num_primary { + return None; + } + self.current_group = self.primary; + self.current_slot = 0; + } + } +} + +/// Immutable iterator over `(&K, &V)` pairs. +pub struct Iter<'a, K, V> { + groups: &'a [Group], + cursor: IterCursor, +} + +impl<'a, K, V> Iterator for Iter<'a, K, V> { + type Item = (&'a K, &'a V); + fn next(&mut self) -> Option { + let (gi, slot) = self.cursor.next_slot(self.groups)?; + let group = &self.groups[gi]; + // SAFETY: slot is occupied (bitmask guarantees ctrl byte has high bit set). + unsafe { + Some(( + group.keys[slot].assume_init_ref(), + group.values[slot].assume_init_ref(), + )) + } + } +} + +/// Mutable iterator over `(&K, &mut V)` pairs. +pub struct IterMut<'a, K, V> { + groups: *mut [Group], + cursor: IterCursor, + _marker: PhantomData<&'a mut [Group]>, +} + +impl<'a, K, V> Iterator for IterMut<'a, K, V> { + type Item = (&'a K, &'a mut V); + fn next(&mut self) -> Option { + // SAFETY: we use raw pointer to avoid holding multiple &mut borrows. + // The cursor guarantees each slot is yielded at most once. 
+ let groups = unsafe { &mut *self.groups }; + let (gi, slot) = self.cursor.next_slot(groups)?; + let group = &mut groups[gi]; + unsafe { + Some(( + group.keys[slot].assume_init_ref(), + group.values[slot].assume_init_mut(), + )) + } + } +} + +/// Owning iterator that yields `(K, V)` pairs and consumes the map. +pub struct IntoIter { + groups: Box<[Group]>, + len: usize, + cursor: IterCursor, +} + +impl Iterator for IntoIter { + type Item = (K, V); + fn next(&mut self) -> Option { + let (gi, slot) = self.cursor.next_slot(&self.groups)?; + let group = &self.groups[gi]; + // SAFETY: slot is occupied (bitmask guarantees ctrl byte has high bit set). + unsafe { + Some(( + group.keys[slot].assume_init_read(), + group.values[slot].assume_init_read(), + )) + } + } + + fn size_hint(&self) -> (usize, Option) { + (0, Some(self.len)) + } +} + +impl Drop for IntoIter { + fn drop(&mut self) { + // Continue iterating to drop remaining entries one by one. + while let Some((gi, slot)) = self.cursor.next_slot(&self.groups) { + unsafe { + self.groups[gi].keys[slot].assume_init_drop(); + self.groups[gi].values[slot].assume_init_drop(); + } + } + } +} + +// ── HashSortedMap iteration ───────────────────────────────────────────── + +impl HashSortedMap { + /// Returns an iterator over `(&K, &V)` pairs. + /// + /// Entries are visited in group-index order (primary groups in order of + /// hash prefix, each followed by its overflow chain). Within each group, + /// occupied slots are visited in slot order. + pub fn iter(&self) -> Iter<'_, K, V> { + Iter { + groups: &self.groups, + cursor: IterCursor::new(self.n_bits), + } + } + + /// Returns a mutable iterator over `(&K, &mut V)` pairs. + pub fn iter_mut(&mut self) -> IterMut<'_, K, V> { + let cursor = IterCursor::new(self.n_bits); + IterMut { + groups: &mut *self.groups as *mut [Group], + cursor, + _marker: PhantomData, + } + } + + /// Consumes the map and returns an iterator over `(K, V)` pairs. 
+ #[allow(clippy::should_implement_trait)] + pub fn into_iter(self) -> IntoIter { + let cursor = IterCursor::new(self.n_bits); + // Prevent Drop from running on self — we're moving groups out. + let mut this = ManuallyDrop::new(self); + let groups = unsafe { std::ptr::read(&this.groups) }; + let len = this.len; + // Zero out len so if Drop somehow runs it sees an empty map. + this.len = 0; + IntoIter { + groups, + len, + cursor, + } + } +} + +impl IntoIterator for HashSortedMap { + type Item = (K, V); + type IntoIter = IntoIter; + fn into_iter(self) -> Self::IntoIter { + self.into_iter() + } +} + +impl<'a, K, V, S> IntoIterator for &'a HashSortedMap { + type Item = (&'a K, &'a V); + type IntoIter = Iter<'a, K, V>; + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +impl<'a, K, V, S> IntoIterator for &'a mut HashSortedMap { + type Item = (&'a K, &'a mut V); + type IntoIter = IterMut<'a, K, V>; + fn into_iter(self) -> Self::IntoIter { + self.iter_mut() + } +} + +#[cfg(test)] +mod tests { + use std::hash::{BuildHasher, Hasher}; + + use super::*; + + /// Degenerate hasher that returns a fixed hash code, for forcing collisions. 
+ struct FixedHasher(u64); + + impl Hasher for FixedHasher { + fn finish(&self) -> u64 { + self.0 + } + fn write(&mut self, _bytes: &[u8]) {} + } + + #[derive(Clone)] + struct FixedState(u64); + + impl BuildHasher for FixedState { + type Hasher = FixedHasher; + fn build_hasher(&self) -> FixedHasher { + FixedHasher(self.0) + } + } + + #[test] + fn iter_empty() { + let map: HashSortedMap = HashSortedMap::new(); + assert_eq!(map.iter().count(), 0); + } + + #[test] + fn iter_yields_all_entries() { + let mut map = HashSortedMap::new(); + for i in 0..100u32 { + map.insert(i, i * 10); + } + let mut collected: Vec<(u32, u32)> = map.iter().map(|(&k, &v)| (k, v)).collect(); + collected.sort(); + assert_eq!(collected.len(), 100); + for i in 0..100u32 { + assert_eq!(collected[i as usize], (i, i * 10)); + } + } + + #[test] + fn iter_with_overflow_chains() { + let mut map = HashSortedMap::with_capacity_and_hasher(1, FixedState(0xABCD)); + for i in 0..50u32 { + map.insert(i, i); + } + let collected: Vec = map.iter().map(|(&k, _)| k).collect(); + assert_eq!(collected.len(), 50); + let mut sorted = collected.clone(); + sorted.sort(); + sorted.dedup(); + assert_eq!(sorted.len(), 50); + } + + #[test] + fn iter_mut_mutates_values() { + let mut map = HashSortedMap::new(); + for i in 0..20u32 { + map.insert(i, i); + } + for (_, v) in map.iter_mut() { + *v *= 2; + } + for i in 0..20u32 { + assert_eq!(map.get(&i), Some(&(i * 2))); + } + } + + #[test] + fn into_iter_yields_all() { + let mut map = HashSortedMap::new(); + for i in 0..100u32 { + map.insert(i, i * 3); + } + let mut collected: Vec<(u32, u32)> = map.into_iter().collect(); + collected.sort(); + assert_eq!(collected.len(), 100); + for i in 0..100u32 { + assert_eq!(collected[i as usize], (i, i * 3)); + } + } + + #[test] + fn into_iter_partial_consume_then_drop() { + let mut map: HashSortedMap = HashSortedMap::new(); + for i in 0..50u32 { + map.insert(format!("key-{i}"), format!("val-{i}")); + } + let mut iter = map.into_iter(); + 
for _ in 0..10 { + let _ = iter.next(); + } + drop(iter); + } + + #[test] + fn into_iter_empty() { + let map: HashSortedMap = HashSortedMap::new(); + assert_eq!(map.into_iter().count(), 0); + } + + #[test] + fn into_iter_with_overflow() { + let mut map = HashSortedMap::with_capacity_and_hasher(1, FixedState(0)); + for i in 0..80u32 { + map.insert(i, i); + } + let collected: Vec<(u32, u32)> = map.into_iter().collect(); + assert_eq!(collected.len(), 80); + let mut keys: Vec = collected.into_iter().map(|(k, _)| k).collect(); + keys.sort(); + keys.dedup(); + assert_eq!(keys.len(), 80); + } + + #[test] + fn into_iter_after_grow() { + let mut map = HashSortedMap::with_capacity(1); + for i in 0..500u32 { + map.insert(i, i); + } + let collected: Vec<(u32, u32)> = map.into_iter().collect(); + assert_eq!(collected.len(), 500); + } + + /// Track drops to verify no leaks or double-drops. + #[test] + fn into_iter_drop_count() { + use std::cell::Cell; + use std::rc::Rc; + + #[derive(Clone)] + struct Tracked(Rc>); + impl Drop for Tracked { + fn drop(&mut self) { + self.0.set(self.0.get() + 1); + } + } + + let counter = Rc::new(Cell::new(0usize)); + let n = 100; + { + let mut map = HashSortedMap::new(); + for i in 0..n { + map.insert(i, Tracked(counter.clone())); + } + let mut iter = map.into_iter(); + for _ in 0..n / 2 { + let _ = iter.next(); + } + } + assert_eq!(counter.get(), n); + } + + #[test] + fn for_loop_ref() { + let mut map = HashSortedMap::new(); + map.insert(1, "a"); + map.insert(2, "b"); + let mut count = 0; + for (_k, _v) in &map { + count += 1; + } + assert_eq!(count, 2); + } + + #[test] + fn for_loop_mut() { + let mut map = HashSortedMap::new(); + map.insert(1u32, 10u32); + map.insert(2, 20); + for (_, v) in &mut map { + *v += 1; + } + assert_eq!(map.get(&1), Some(&11)); + assert_eq!(map.get(&2), Some(&21)); + } + + #[test] + fn for_loop_owned() { + let mut map = HashSortedMap::new(); + map.insert(1, 10); + map.insert(2, 20); + let mut sum = 0; + for (_k, v) in 
map { + sum += v; + } + assert_eq!(sum, 30); + } +} diff --git a/crates/hash-sorted-map/src/lib.rs b/crates/hash-sorted-map/src/lib.rs index 79dac69..3ff5461 100644 --- a/crates/hash-sorted-map/src/lib.rs +++ b/crates/hash-sorted-map/src/lib.rs @@ -1,4 +1,7 @@ +mod group; mod group_ops; mod hash_sorted_map; +mod iter; pub use hash_sorted_map::{Entry, HashSortedMap, OccupiedEntry, VacantEntry}; +pub use iter::{IntoIter, Iter, IterMut};