diff --git a/Cargo.toml b/Cargo.toml
index 312f46d..7547f1b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,6 +4,7 @@ members = [
     "crates/*",
     "crates/bpe/benchmarks",
     "crates/bpe/tests",
+    "crates/hash-sorted-map/benchmarks",
 ]
 resolver = "2"
 
diff --git a/crates/hash-sorted-map/Cargo.toml b/crates/hash-sorted-map/Cargo.toml
index 84ffa02..6eac82f 100644
--- a/crates/hash-sorted-map/Cargo.toml
+++ b/crates/hash-sorted-map/Cargo.toml
@@ -8,3 +8,5 @@ repository = "https://github.com/github/rust-gems"
 license = "MIT"
 keywords = ["hashmap", "sorted", "merge", "simd"]
 categories = ["algorithms", "data-structures"]
+
+[dependencies]
diff --git a/crates/hash-sorted-map/OPTIMIZATIONS.md b/crates/hash-sorted-map/OPTIMIZATIONS.md
index 0b04520..9019582 100644
--- a/crates/hash-sorted-map/OPTIMIZATIONS.md
+++ b/crates/hash-sorted-map/OPTIMIZATIONS.md
@@ -4,8 +4,8 @@
 
 `HashSortedMap` is a Swiss-table-inspired hash map that uses **overflow
 chaining** (instead of open addressing), **SIMD group scanning** (NEON/SSE2),
-a **slot-hint fast path**, and an **optimized growth strategy**. It is generic
-over key type, value type, and hash builder.
+and an **optimized growth strategy**. It is generic over key type, value type,
+and hash builder.
 
 This document analyzes the design trade-offs versus
 [hashbrown](https://github.com/rust-lang/hashbrown) and records the
@@ -38,7 +38,6 @@ experimental results that guided the current design.
 │  • Overflow chaining (linked groups)                             │
 │  • 8-byte groups with NEON/SSE2/scalar SIMD scan                 │
 │  • EMPTY / FULL tag states only (insertion-only, no deletion)    │
-│  • Slot-hint fast path                                           │
 └──────────────────────────────────────────────────────────────────┘
 ```
 
@@ -106,17 +105,33 @@ the overflow path.
 SIMD version** by pessimizing NEON code generation. Removed from the SIMD
 implementation, kept in the scalar version.
 
-### 7. Slot Hint Fast Path (Unique to HashSortedMap)
+### 7. Slot Hint Fast Path ❌ Removed
 
-HashSortedMap checks a preferred slot before scanning the group:
+Originally, HashSortedMap checked a preferred slot before scanning the group:
 ```rust
 let hint = slot_hint(hash);  // 3 bits from hash → slot index
 if ctrl[hint] == EMPTY { /* direct insert */ }
 if ctrl[hint] == tag && keys[hint] == key { /* direct hit */ }
 ```
 
-hashbrown does **not** have this optimization — it always does a full SIMD
-group scan. The reason why the performance is different is probably due to the different overflow strategies and the different load factors.
+**Experimental finding**: This scalar check **hurts performance** on random
+workloads. The branch predictor cannot help because random keys map to random
+slots, making the hint check a 50/50 branch that pollutes the branch
+predictor. SIMD-only scanning (match_tag + match_empty) is uniformly fast
+regardless of key distribution.
+
+**Structural benefit of removal**: Without the slot hint, inserts always
+append to the first empty slot. This guarantees that occupied slots are
+**packed contiguously from the beginning** of each group (no gaps). This
+invariant enables:
+- `count_occupied()`: a single `leading_zeros()` on the ctrl word replaces
+  bitmask scanning to find the next free slot or count entries
+- Simpler `insert_for_grow()`: just write at position `count_occupied()`
+- Simpler iteration: occupied slots are always `0..count_occupied()`
+- Simpler `sort_by_hash()`: no need to compact gaps before sorting
+
+**Current state**: Slot hint is fully removed. All paths use SIMD group
+scanning for lookups and `count_occupied()` for finding the insertion point.
 
 ### 8. Overflow Reserve Sizing ✅ Validated
 
@@ -159,13 +174,93 @@ entropy in both halves. Also changed trigram generation to use
 
 ## Summary of Impact
 
-| Change                     | Effect on insert time        |
-|----------------------------|------------------------------|
-| Capacity sizing fix        | **−50%** (biggest win)       |
-| Optimized growth path      | **−10%** on growth scenarios |
-| SIMD group scanning        | **−5%**                      |
-| Branch hints (scalar only) | **−2–6%**                    |
-| IdentityHasher fix         | Enabled fair comparison      |
+| Change                          | Effect                              |
+|---------------------------------|-------------------------------------|
+| Capacity sizing fix             | **−50%** insert time (biggest win)  |
+| Optimized growth path           | **2× faster** growth in earlier runs; ~3% slower in the snapshot below |
+| SIMD group scanning             | **−5%** insert time                 |
+| Slot hint removal               | **−25%** merge latency, contiguous packing |
+| Branch hints (scalar only)      | **−2–6%**                           |
+| IdentityHasher fix              | Enabled fair comparison             |
 
-The current HashSortedMap **matches hashbrown+FxHash** on pre-sized inserts,
-**beats all hashbrown variants** on overwrites, and has **2× faster growth**.
+---
+
+## Benchmark Results (local x86_64 snapshot)
+
+Hardware used for the current local snapshot:
+
+- CPU: Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
+- Architecture: x86_64
+- Topology: 1 socket, 1 core, 2 threads
+- CPU frequency range: 800 MHz to 2800 MHz
+- Memory: 7.8 GiB RAM
+
+### Insert (1000 trigrams, pre-sized)
+
+| Implementation       | Time (µs) | vs hashbrown |
+|----------------------|-----------|--------------|
+| FoldHashMap          | 13.88     | −4%          |
+| FxHashMap            | 14.60     | +1%          |
+| hashbrown+Identity   | 14.44     | baseline     |
+| hashbrown::HashMap   | 14.55     | +1%          |
+| std::HashMap+FNV     | 15.55     | +8%          |
+| AHashMap             | 15.59     | +8%          |
+| **HashSortedMap**    | **9.40**  | **−35%**     |
+| std::HashMap         | 25.26     | +75%         |
+
+### Reinsert (1000 trigrams, all keys exist)
+
+| Implementation       | Time (µs) |
+|----------------------|-----------|
+| **HashSortedMap**    | **6.59**  |
+| hashbrown+Identity   | 6.95      |
+
+### Growth (128 → 1000 trigrams, 3 resize rounds)
+
+| Implementation       | Time (µs) |
+|----------------------|-----------|
+| hashbrown+Identity   | 26.66     |
+| **HashSortedMap**    | **27.50** |
+
+### Count (4000 trigrams, mixed insert/update)
+
+| Implementation                   | Time (µs) |
+|----------------------------------|-----------|
+| hashbrown+Identity entry()       | 15.49     |
+| **HashSortedMap get_or_default** | **15.88** |
+| **HashSortedMap entry().or_default()** | **16.15** |
+
+### Iteration (1000 trigrams)
+
+| Implementation                | Time (µs) |
+|-------------------------------|-----------|
+| **HashSortedMap iter()**      | **3.02**  |
+| hashbrown+Identity iter()     | 3.04      |
+| **HashSortedMap into_iter()** | **3.03**  |
+| hashbrown+Identity into_iter()| 3.56      |
+
+### Sort (100K trigrams)
+
+| Implementation              | Time (ms) |
+|-----------------------------|-----------|
+| **HashSortedMap sort_by_hash** | **1.66** |
+| Vec::sort_unstable          | 2.20      |
+
+### Merge (100 maps × 100K keys each → sorted output)
+
+| Implementation                    | Time (ms) | vs HSM merge+sort |
+|-----------------------------------|-----------|--------------------|
+| hashbrown merge presized          | 160.79    | +6%               |
+| **HashSortedMap merge presized**  | **117.01**| **−23%**          |
+| **HashSortedMap merge (no sort)** | **141.57**| **−7%**           |
+| hashbrown merge                   | 163.59    | +7%               |
+| **HashSortedMap merge + sort**    | **152.34**| **baseline**      |
+| hashbrown merge + Vec sort        | 193.37    | +27%              |
+| k-way merge sorted vecs           | 445       | +192%             |
+
+**Key takeaways:**
+- Pre-sized insert is **~35% faster** than hashbrown+Identity
+- Reinsert and iter paths are now close to parity with hashbrown+Identity
+- Growth path is currently **~3% slower** than hashbrown+Identity
+- sort_by_hash is **~24% faster** than Vec::sort_unstable
+- merge + sort is **~21% faster** than hashbrown merge + Vec sort
diff --git a/crates/hash-sorted-map/README.md b/crates/hash-sorted-map/README.md
index ebd5ef6..bbf6e3b 100644
--- a/crates/hash-sorted-map/README.md
+++ b/crates/hash-sorted-map/README.md
@@ -29,8 +29,8 @@ keys, which means:
 
 - **Overflow chaining** instead of open addressing — groups that fill up link
   to overflow groups rather than probing into neighbours.
-- **Slot hint** — a preferred slot index derived from the hash, checked before
-  scanning the group. Gives a direct hit on most inserts at low load.
+- **Contiguous packing** — occupied slots are always packed from position 0
+  with no gaps, enabling a single `leading_zeros()` to find the next free slot.
 - **SIMD group scanning** — uses NEON on aarch64, SSE2 on x86\_64, and a
   scalar fallback elsewhere to scan 8–16 control bytes in parallel.
 - **AoS group layout** — each group stores its control bytes, keys, and values
@@ -42,45 +42,32 @@ keys, which means:
 
 ## Benchmark results
 
-All benchmarks insert 1000 random trigram hashes (scrambled with
-`folded_multiply`) into maps with various configurations. Measured on Apple
-M-series (aarch64).
-
-### Insert 1000 trigrams — pre-sized, no growth
-
-| Rank | Map | Time (µs) | vs best |
-|------|-----|-----------|---------|
-| 🥇 | FoldHashMap | 2.44 | — |
-| 🥈 | FxHashMap | 2.61 | +7% |
-| 🥉 | hashbrown::HashMap | 2.67 | +9% |
-| 4 | **HashSortedMap** | **2.71** | +11% |
-| 5 | hashbrown+Identity | 2.74 | +12% |
-| 6 | std::HashMap+FNV | 3.27 | +34% |
-| 7 | AHashMap | 3.22 | +32% |
-| 8 | std::HashMap | 8.49 | +248% |
-
-### Re-insert same keys (all overwrites)
-
-| Map | Time (µs) |
-|-----|-----------|
-| **HashSortedMap** | **2.36** ✅ |
-| hashbrown+Identity | 2.58 |
-
-### Growth from small (`with_capacity(128)`, 3 resize rounds)
-
-| Map | Time (µs) | Growth penalty |
-|-----|-----------|----------------|
-| **HashSortedMap** | **4.85** | +2.14 |
-| hashbrown+Identity | 9.77 | +7.03 |
-
-### Key takeaways
-
-- **HashSortedMap matches the fastest hashbrown configurations** on pre-sized
-  first-time inserts and is **the fastest for overwrites**.
-- **Growth is ~2× faster** than hashbrown thanks to the optimized
-  `insert_for_grow` path that skips duplicate checking and uses raw copies.
-- The remaining gap to FoldHashMap (~11%) comes from foldhash's extremely
-  efficient hash function that pipelines well with hashbrown's SIMD scan.
+Latest local Criterion snapshot from this repository's
+`target/criterion` outputs (lower is better):
+
+Hardware used for this snapshot:
+
+- CPU: Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
+- Architecture: x86_64
+- Topology: 1 socket, 1 core, 2 threads
+- CPU frequency range: 800 MHz to 2800 MHz
+- Memory: 7.8 GiB RAM
+
+| Scenario                                     | HashSortedMap | Comparison                             | Result      |
+| :------------------------------------------- | ------------: | :------------------------------------- | :---------- |
+| Insert 1000 trigrams (pre-sized)             |       9.40 µs | hashbrown::HashMap: 14.55 µs           | ~35% faster |
+| Grow from capacity 128                       |      27.50 µs | hashbrown+Identity: 26.66 µs           | ~3% slower  |
+| Count 4000 trigrams (`entry().or_default()`) |      16.15 µs | hashbrown+Identity `entry()`: 15.49 µs | ~4% slower  |
+| Iterate 1000 trigrams (`iter()`)             |       3.02 µs | hashbrown+Identity `iter()`: 3.04 µs   | ~1% faster  |
+| Sort 100000 trigrams by hash                 |       1.66 ms | `Vec::sort_unstable`: 2.20 ms          | ~24% faster |
+| Merge 100 maps + final sort                  |     152.34 ms | hashbrown merge + vec sort: 193.37 ms  | ~21% faster |
+
+Key takeaways:
+
+- Pre-sized inserts, sorting, and merge+sort remain the strongest paths.
+- Iteration is now roughly on par with `hashbrown+Identity`.
+- Growth and count/update workloads are currently slightly slower than
+  `hashbrown+Identity` in this run.
 
 ## Running
 
diff --git a/crates/hash-sorted-map/benchmarks/Cargo.toml b/crates/hash-sorted-map/benchmarks/Cargo.toml
index 9ee37dc..91019a4 100644
--- a/crates/hash-sorted-map/benchmarks/Cargo.toml
+++ b/crates/hash-sorted-map/benchmarks/Cargo.toml
@@ -21,3 +21,4 @@ ahash = "0.8"
 hashbrown = "0.15"
 foldhash = "0.1"
 fnv = "1"
+itertools = "0.14"
diff --git a/crates/hash-sorted-map/benchmarks/performance.rs b/crates/hash-sorted-map/benchmarks/performance.rs
index 5a04801..07cfc2f 100644
--- a/crates/hash-sorted-map/benchmarks/performance.rs
+++ b/crates/hash-sorted-map/benchmarks/performance.rs
@@ -1,6 +1,9 @@
+use std::hash::BuildHasher;
+
 use criterion::{criterion_group, criterion_main, BatchSize, Criterion};
 use hash_sorted_map::HashSortedMap;
-use hash_sorted_map_benchmarks::{random_trigram_hashes, IdentityBuildHasher};
+use hash_sorted_map_benchmarks::{folded_multiply, random_trigram_hashes, IdentityBuildHasher};
+use rand::RngExt;
 
 fn trigrams() -> Vec<u32> {
     random_trigram_hashes(1000)
@@ -291,11 +294,311 @@ fn bench_count(c: &mut Criterion) {
     group.finish();
 }
 
+fn bench_iter(c: &mut Criterion) {
+    let trigrams = trigrams();
+
+    let mut group = c.benchmark_group("iter_1000_trigrams");
+
+    group.bench_function("hashbrown+Identity iter()", |b| {
+        b.iter_batched(
+            || {
+                let mut map =
+                    hashbrown::HashMap::<u32, usize, IdentityBuildHasher>::with_capacity_and_hasher(
+                        trigrams.len(),
+                        Default::default(),
+                    );
+                for (i, &key) in trigrams.iter().enumerate() {
+                    map.insert(key, i);
+                }
+                map
+            },
+            |map| {
+                let mut sum = 0usize;
+                for (&k, &v) in &map {
+                    sum = sum.wrapping_add(v).wrapping_add(k as usize);
+                }
+                sum
+            },
+            BatchSize::SmallInput,
+        );
+    });
+
+    group.bench_function("HashSortedMap iter()", |b| {
+        b.iter_batched(
+            || {
+                let mut map = HashSortedMap::with_capacity_and_hasher(
+                    trigrams.len(),
+                    IdentityBuildHasher::default(),
+                );
+                for (i, &key) in trigrams.iter().enumerate() {
+                    map.insert(key, i);
+                }
+                map
+            },
+            |map| {
+                let mut sum = 0usize;
+                for (&k, &v) in &map {
+                    sum = sum.wrapping_add(v).wrapping_add(k as usize);
+                }
+                sum
+            },
+            BatchSize::SmallInput,
+        );
+    });
+
+    group.bench_function("hashbrown+Identity into_iter()", |b| {
+        b.iter_batched(
+            || {
+                let mut map =
+                    hashbrown::HashMap::<u32, usize, IdentityBuildHasher>::with_capacity_and_hasher(
+                        trigrams.len(),
+                        Default::default(),
+                    );
+                for (i, &key) in trigrams.iter().enumerate() {
+                    map.insert(key, i);
+                }
+                map
+            },
+            |map| {
+                let mut sum = 0usize;
+                for (k, v) in map {
+                    sum = sum.wrapping_add(v).wrapping_add(k as usize);
+                }
+                sum
+            },
+            BatchSize::SmallInput,
+        );
+    });
+
+    group.bench_function("HashSortedMap into_iter()", |b| {
+        b.iter_batched(
+            || {
+                let mut map = HashSortedMap::with_capacity_and_hasher(
+                    trigrams.len(),
+                    IdentityBuildHasher::default(),
+                );
+                for (i, &key) in trigrams.iter().enumerate() {
+                    map.insert(key, i);
+                }
+                map
+            },
+            |map| {
+                let mut sum = 0usize;
+                for (k, v) in map {
+                    sum = sum.wrapping_add(v).wrapping_add(k as usize);
+                }
+                sum
+            },
+            BatchSize::SmallInput,
+        );
+    });
+
+    group.finish();
+}
+
+fn bench_sort(c: &mut Criterion) {
+    let keys = random_trigram_hashes(100_000);
+    let hasher = IdentityBuildHasher::default();
+    let mut group = c.benchmark_group("sort_100000_trigrams");
+
+    group.bench_function("Vec::sort_unstable", |b| {
+        b.iter(|| {
+            let mut vec: Vec<_> = keys.iter().enumerate().map(|(i, &key)| (key, i)).collect();
+            vec.sort_unstable_by(|a, b| {
+                let ha = hasher.hash_one(a.0);
+                let hb = hasher.hash_one(b.0);
+                (ha, a.0).cmp(&(hb, b.0))
+            });
+            vec
+        });
+    });
+
+    group.bench_function("HashSortedMap sort_by_hash", |b| {
+        b.iter(|| {
+            let mut map =
+                HashSortedMap::with_capacity_and_hasher(keys.len(), IdentityBuildHasher::default());
+            for (i, &key) in keys.iter().enumerate() {
+                map.insert(key, i);
+            }
+            map.sort_by_hash();
+            map
+        });
+    });
+
+    group.finish();
+}
+
+fn bench_merge_sort(c: &mut Criterion) {
+    const NUM_MAPS: usize = 100;
+    const KEYS_PER_MAP: usize = 100_000;
+
+    // Pre-generate NUM_MAPS key vectors with random u32 values scrambled via folded_multiply.
+    let maps_data: Vec<Vec<u32>> = (0..NUM_MAPS)
+        .map(|_| {
+            let mut rng = rand::rng();
+            (0..KEYS_PER_MAP)
+                .map(|_| {
+                    folded_multiply(rng.random_range(0..1_000_000u32) as u64, 0x243f6a8885a308d3)
+                        as u32
+                })
+                .collect()
+        })
+        .collect();
+
+    // Pre-build one HashSortedMap per key vector, counting duplicates (not hash-sorted here).
+    let hash_maps: Vec<_> = maps_data
+        .into_iter()
+        .map(|keys| {
+            let mut map = HashSortedMap::with_hasher(IdentityBuildHasher::default());
+            for key in keys {
+                *map.entry(key).or_default() += 1u32;
+            }
+            map
+        })
+        .collect();
+
+    let hasher = IdentityBuildHasher::default();
+    let mut group = c.benchmark_group("merge_100_maps_sorted");
+    group.sample_size(10);
+
+    // ── 1. HashSortedMap: merge all containers, then sort_by_hash ────
+    group.bench_function("HashSortedMap merge + sort_by_hash", |b| {
+        b.iter(|| {
+            let mut map: HashSortedMap<u32, u32, _> =
+                HashSortedMap::with_hasher(IdentityBuildHasher::default());
+            for container in &hash_maps {
+                for (&key, &value) in container {
+                    *map.entry(key).or_default() += value;
+                }
+            }
+            map.sort_by_hash();
+            map
+        });
+    });
+
+    // ── 2. K-way merge: sort each container, then merge the runs ─────
+    group.bench_function("k-way merge sorted containers", |b| {
+        use itertools::Itertools;
+
+        b.iter(|| {
+            // Phase 1: build per-container (hash, key, count) vectors sorted by (hash, key).
+            let sorted_vecs: Vec<Vec<(u64, u32, u32)>> = hash_maps
+                .iter()
+                .map(|container| {
+                    let mut vec: Vec<(u64, u32, u32)> = container
+                        .iter()
+                        .map(|(&k, &v)| (hasher.hash_one(k), k, v))
+                        .collect();
+                    vec.sort_unstable_by_key(|&(h, k, _)| (h, k));
+                    vec
+                })
+                .collect();
+
+            // Phase 2: k-merge + chunk_by to aggregate counts per key.
+            let result: Vec<(u32, u32)> = sorted_vecs
+                .into_iter()
+                .map(|v| v.into_iter())
+                .kmerge_by(|a, b| (a.0, a.1) <= (b.0, b.1))
+                .chunk_by(|&(_, key, _)| key)
+                .into_iter()
+                .map(|(key, group)| (key, group.map(|(_, _, c)| c).sum()))
+                .collect();
+            result
+        });
+    });
+
+    // ── 3. hashbrown HashMap merge, then sort into Vec ──────────────
+    group.bench_function("hashbrown merge + Vec sort", |b| {
+        b.iter(|| {
+            let mut map = hashbrown::HashMap::<u32, u32, IdentityBuildHasher>::with_hasher(
+                IdentityBuildHasher::default(),
+            );
+            for container in &hash_maps {
+                for (&key, &value) in container {
+                    *map.entry(key).or_default() += value;
+                }
+            }
+            let mut vec: Vec<(u32, u32)> = map.into_iter().collect();
+            vec.sort_unstable_by(|a, b| {
+                let ha = hasher.hash_one(a.0);
+                let hb = hasher.hash_one(b.0);
+                (ha, a.0).cmp(&(hb, b.0))
+            });
+            vec
+        });
+    });
+
+    // ── 4. hashbrown HashMap merge only (no sort) ───────────────────
+    group.bench_function("hashbrown merge", |b| {
+        b.iter(|| {
+            let mut map = hashbrown::HashMap::<u32, u32, IdentityBuildHasher>::with_hasher(
+                IdentityBuildHasher::default(),
+            );
+            for container in &hash_maps {
+                for (&key, &value) in container {
+                    *map.entry(key).or_default() += value;
+                }
+            }
+            map
+        });
+    });
+
+    // ── 5. HashSortedMap merge only (no sort) ───────────────────────
+    group.bench_function("HashSortedMap merge", |b| {
+        b.iter(|| {
+            let mut map: HashSortedMap<u32, u32, _> =
+                HashSortedMap::with_hasher(IdentityBuildHasher::default());
+            for container in &hash_maps {
+                for (&key, &value) in container {
+                    *map.entry(key).or_default() += value;
+                }
+            }
+            map
+        });
+    });
+
+    // ── 6. hashbrown presized merge only ────────────────────────────
+    group.bench_function("hashbrown merge presized", |b| {
+        b.iter(|| {
+            let mut map =
+                hashbrown::HashMap::<u32, u32, IdentityBuildHasher>::with_capacity_and_hasher(
+                    1_000_000,
+                    IdentityBuildHasher::default(),
+                );
+            for container in &hash_maps {
+                for (&key, &value) in container {
+                    *map.entry(key).or_default() += value;
+                }
+            }
+            map
+        });
+    });
+
+    // ── 7. HashSortedMap presized merge only ─────────────────────────
+    group.bench_function("HashSortedMap merge presized", |b| {
+        b.iter(|| {
+            let mut map: HashSortedMap<u32, u32, _> =
+                HashSortedMap::with_capacity_and_hasher(1_000_000, IdentityBuildHasher::default());
+            for container in &hash_maps {
+                for (&key, &value) in container {
+                    *map.entry(key).or_default() += value;
+                }
+            }
+            map
+        });
+    });
+
+    group.finish();
+}
+
 criterion_group!(
     benches,
     bench_insert,
     bench_reinsert,
     bench_grow,
-    bench_count
+    bench_count,
+    bench_iter,
+    bench_sort,
+    bench_merge_sort
 );
 criterion_main!(benches);
diff --git a/crates/hash-sorted-map/src/group.rs b/crates/hash-sorted-map/src/group.rs
new file mode 100644
index 0000000..c1ba315
--- /dev/null
+++ b/crates/hash-sorted-map/src/group.rs
@@ -0,0 +1,23 @@
+use core::mem::MaybeUninit;
+
+use super::group_ops::{CTRL_EMPTY, GROUP_SIZE};
+
+pub(crate) const NO_OVERFLOW: u32 = u32::MAX;
+
+pub(crate) struct Group<K, V> {
+    pub(crate) ctrl: [u8; GROUP_SIZE],
+    pub(crate) keys: [MaybeUninit<K>; GROUP_SIZE],
+    pub(crate) values: [MaybeUninit<V>; GROUP_SIZE],
+    pub(crate) overflow: u32,
+}
+
+impl<K, V> Group<K, V> {
+    pub(crate) fn new() -> Self {
+        Self {
+            ctrl: [CTRL_EMPTY; GROUP_SIZE],
+            keys: [const { MaybeUninit::uninit() }; GROUP_SIZE],
+            values: [const { MaybeUninit::uninit() }; GROUP_SIZE],
+            overflow: NO_OVERFLOW,
+        }
+    }
+}
diff --git a/crates/hash-sorted-map/src/group_ops.rs b/crates/hash-sorted-map/src/group_ops.rs
index a1b92ec..cc14813 100644
--- a/crates/hash-sorted-map/src/group_ops.rs
+++ b/crates/hash-sorted-map/src/group_ops.rs
@@ -38,31 +38,11 @@ mod arch {
         }
     }
 
-    #[inline(always)]
-    pub fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> Mask {
-        match_tag(ctrl, super::CTRL_EMPTY)
-    }
-
-    /// Mask of slots whose ctrl byte has the high bit set (occupied).
-    /// Uses SSE2 `_mm_movemask_epi8` which extracts the top bit of each byte.
-    #[inline(always)]
-    pub fn match_full(ctrl: &[u8; GROUP_SIZE]) -> Mask {
-        unsafe {
-            let group = x86::_mm_loadu_si128(ctrl.as_ptr() as *const x86::__m128i);
-            x86::_mm_movemask_epi8(group) as u32
-        }
-    }
-
     #[inline(always)]
     pub fn lowest(mask: Mask) -> usize {
         mask.trailing_zeros() as usize
     }
 
-    #[inline(always)]
-    pub fn clear_slot(mask: Mask, slot: usize) -> Mask {
-        mask & !(1u32 << slot)
-    }
-
     #[inline(always)]
     pub fn next_match(mask: &mut Mask) -> Option<usize> {
         if *mask == 0 {
@@ -72,6 +52,13 @@ mod arch {
         *mask &= *mask - 1;
         Some(i)
     }
+
+    /// Number of occupied (non-zero) ctrl bytes; assumes they are packed from slot 0 with no gaps.
+    #[inline(always)]
+    pub fn count_occupied(ctrl: &[u8; GROUP_SIZE]) -> usize {
+        let word = u128::from_ne_bytes(*ctrl); // x86_64 is little-endian: slot 0 is the low byte
+        GROUP_SIZE - (word.leading_zeros() / 8) as usize // minus the all-zero bytes at the top
+    }
 }
 
 #[cfg(target_arch = "aarch64")]
@@ -89,34 +76,11 @@ mod arch {
         }
     }
 
-    #[inline(always)]
-    pub fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> Mask {
-        unsafe {
-            let group = neon::vld1_u8(ctrl.as_ptr());
-            let cmp = neon::vceq_u8(group, neon::vdup_n_u8(0));
-            neon::vget_lane_u64(neon::vreinterpret_u64_u8(cmp), 0) & 0x8080808080808080
-        }
-    }
-
-    /// Mask of slots whose ctrl byte has the high bit set (occupied).
-    #[inline(always)]
-    pub fn match_full(ctrl: &[u8; GROUP_SIZE]) -> Mask {
-        unsafe {
-            let group = neon::vld1_u8(ctrl.as_ptr());
-            neon::vget_lane_u64(neon::vreinterpret_u64_u8(group), 0) & 0x8080808080808080
-        }
-    }
-
     #[inline(always)]
     pub fn lowest(mask: Mask) -> usize {
         (mask.trailing_zeros() >> 3) as usize
     }
 
-    #[inline(always)]
-    pub fn clear_slot(mask: Mask, slot: usize) -> Mask {
-        mask & !(0x80u64 << (slot * 8))
-    }
-
     #[inline(always)]
     pub fn next_match(mask: &mut Mask) -> Option<usize> {
         if *mask == 0 {
@@ -126,6 +90,13 @@ mod arch {
         *mask &= *mask - 1;
         Some(i)
     }
+
+    /// Number of occupied (non-zero) ctrl bytes; assumes they are packed from slot 0 with no gaps.
+    #[inline(always)]
+    pub fn count_occupied(ctrl: &[u8; GROUP_SIZE]) -> usize {
+        let word = u64::from_le_bytes(*ctrl); // slot 0 is the low byte on any endianness
+        GROUP_SIZE - (word.leading_zeros() / 8) as usize // minus the all-zero bytes at the top
+    }
 }
 
 #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
@@ -140,29 +111,11 @@ mod arch {
         (xor.wrapping_sub(0x0101010101010101)) & !xor & 0x8080808080808080
     }
 
-    #[inline(always)]
-    pub fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> Mask {
-        let word = u64::from_ne_bytes(*ctrl);
-        !word & 0x8080808080808080
-    }
-
-    /// Mask of slots whose ctrl byte has the high bit set (occupied).
-    #[inline(always)]
-    pub fn match_full(ctrl: &[u8; GROUP_SIZE]) -> Mask {
-        let word = u64::from_ne_bytes(*ctrl);
-        word & 0x8080808080808080
-    }
-
     #[inline(always)]
     pub fn lowest(mask: Mask) -> usize {
         (mask.trailing_zeros() >> 3) as usize
     }
 
-    #[inline(always)]
-    pub fn clear_slot(mask: Mask, slot: usize) -> Mask {
-        mask & !(0x80u64 << (slot * 8))
-    }
-
     #[inline(always)]
     pub fn next_match(mask: &mut Mask) -> Option<usize> {
         if *mask == 0 {
@@ -172,6 +125,13 @@ mod arch {
         *mask &= *mask - 1;
         Some(i)
     }
+
+    /// Number of occupied (non-zero) ctrl bytes; assumes they are packed from slot 0 with no gaps.
+    #[inline(always)]
+    pub fn count_occupied(ctrl: &[u8; GROUP_SIZE]) -> usize {
+        let word = u64::from_le_bytes(*ctrl); // slot 0 is the low byte on any endianness
+        GROUP_SIZE - (word.leading_zeros() / 8) as usize // minus the all-zero bytes at the top
+    }
 }
 
 pub use arch::*;
diff --git a/crates/hash-sorted-map/src/hash_sorted_map.rs b/crates/hash-sorted-map/src/hash_sorted_map.rs
index 26a4ecd..0cc37b4 100644
--- a/crates/hash-sorted-map/src/hash_sorted_map.rs
+++ b/crates/hash-sorted-map/src/hash_sorted_map.rs
@@ -4,9 +4,10 @@ use std::collections::hash_map::RandomState;
 use std::hash::{BuildHasher, Hash};
 use std::marker::PhantomData;
 
+use super::group::Group;
 use super::group_ops::{self, CTRL_EMPTY, GROUP_SIZE};
 
-const NO_OVERFLOW: u32 = u32::MAX;
+pub(crate) use super::group::NO_OVERFLOW;
 
 // ── Helpers ─────────────────────────────────────────────────────────────────
 
@@ -15,38 +16,19 @@ fn tag(hash: u64) -> u8 {
     (hash as u8) | 0x80
 }
 
-#[inline]
-fn slot_hint(hash: u64) -> usize {
-    ((hash >> 7) & (GROUP_SIZE as u64 - 1)) as usize
-}
-
-struct Group<K, V> {
-    ctrl: [u8; GROUP_SIZE],
-    keys: [MaybeUninit<K>; GROUP_SIZE],
-    values: [MaybeUninit<V>; GROUP_SIZE],
-    overflow: u32,
-}
-
-impl<K, V> Group<K, V> {
-    fn new() -> Self {
-        Self {
-            ctrl: [CTRL_EMPTY; GROUP_SIZE],
-            keys: [const { MaybeUninit::uninit() }; GROUP_SIZE],
-            values: [const { MaybeUninit::uninit() }; GROUP_SIZE],
-            overflow: NO_OVERFLOW,
-        }
-    }
-}
+// ────────────────────────────────────────────────────────────────────────
+// HashSortedMap
+// ────────────────────────────────────────────────────────────────────────
 
 /// Insertion-only hash map with SIMD group scanning.
 ///
 /// Uses NEON on aarch64, SSE2 on x86_64, scalar fallback elsewhere.
 /// Generic over key type `K`, value type `V`, and hash builder `S`.
 pub struct HashSortedMap<K, V, S = RandomState> {
-    groups: Box<[Group<K, V>]>,
-    num_groups: u32,
-    n_bits: u32,
-    len: usize,
+    pub(crate) groups: Box<[Group<K, V>]>,
+    pub(crate) num_groups: u32,
+    pub(crate) n_bits: u32,
+    pub(crate) len: usize,
     hash_builder: S,
 }
 
@@ -75,19 +57,24 @@ impl<K, V, S> HashSortedMap<K, V, S> {
         let adjusted = (capacity as f64 / group_ops::MAX_FILL).ceil() as usize;
         let min_groups = (adjusted.div_ceil(GROUP_SIZE)).max(1).next_power_of_two();
         let n_bits = min_groups.trailing_zeros().max(1);
-        let (groups, num_primary) = Self::alloc_groups(n_bits);
+        let (groups, num_groups) = Self::alloc_groups(n_bits);
         Self {
             groups,
-            num_groups: num_primary,
+            num_groups,
             n_bits,
             len: 0,
             hash_builder,
         }
     }
 
-    /// Allocate a fully default-initialized boxed slice sized for `n_bits` primary groups
-    /// plus the standard 12.5% overflow reserve. Returns the slice and the number of
-    /// primary groups (which is also the initial in-use count).
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
     fn alloc_groups(n_bits: u32) -> (Box<[Group<K, V>]>, u32) {
         let num_primary = 1usize << n_bits;
         let total = num_primary + num_primary / 8 + 1;
@@ -97,16 +84,123 @@ impl<K, V, S> HashSortedMap<K, V, S> {
     }
 
     #[inline]
-    fn group_index(&self, hash: u64) -> usize {
+    pub(crate) fn group_index(&self, hash: u64) -> usize {
         (hash >> (64 - self.n_bits)) as usize
     }
+}
 
-    pub fn len(&self) -> usize {
-        self.len
-    }
+impl<K: Hash + Eq + Ord, V, S: BuildHasher> HashSortedMap<K, V, S> {
+    /// Sort all entries within each primary group chain by their hash value,
+    /// breaking ties by key.
+    ///
+    /// After sorting, iteration visits entries in hash order within each
+    /// primary group (and since primary groups are visited in group-index
+    /// order, the overall iteration is in full hash order).
+    ///
+    /// # Complexity
+    ///
+    /// Each of `n` elements hashes uniformly into one of `m` primary groups,
+    /// so chain lengths follow `X_i ~ Binomial(n, 1/m)` with `E[X_i] = n/m`.
+    /// With a quadratic sort per chain the total expected cost is:
+    ///
+    /// ```text
+    /// Σ E[X_i²] = m · (Var[X_i] + E[X_i]²)
+    ///           = m · (n/m · (1 − 1/m) + n²/m²)
+    ///           = n · (1 − 1/m) + n²/m
+    /// ```
+    ///
+    /// Dividing by `n` gives the expected cost per element: `1 + n/m` (for
+    /// `m ≫ 1`). Since `n/m` is the average chain length, bounded by
+    /// `GROUP_SIZE / MAX_FILL`, the per-element cost stays constant.
+    pub fn sort_by_hash(&mut self) {
+        let num_primary = 1usize << self.n_bits;
+        let mut chain: Vec<u32> = Vec::new();
+        let mut hashes: Vec<u64> = Vec::new();
+
+        for primary_gi in 0..num_primary {
+            chain.clear();
+            hashes.clear();
+
+            // Collect group indices in this chain.
+            let mut gi = primary_gi;
+            loop {
+                chain.push(gi as u32);
+                let overflow = self.groups[gi].overflow;
+                if overflow == NO_OVERFLOW {
+                    break;
+                }
+                gi = overflow as usize;
+            }
+            // All groups before the last are fully packed (overflow is only
+            // allocated when the previous group is full). Compute hashes for
+            // those directly.
+            for &cgi in &chain[..chain.len() - 1] {
+                let g = &self.groups[cgi as usize];
+                for slot in 0..GROUP_SIZE {
+                    let hash = self
+                        .hash_builder
+                        .hash_one(unsafe { g.keys[slot].assume_init_ref() });
+                    hashes.push(hash);
+                }
+            }
+            let g =
+                &self.groups[*chain.last().expect("chain should have at least one group") as usize];
+            for slot in 0..GROUP_SIZE {
+                if g.ctrl[slot] == CTRL_EMPTY {
+                    break;
+                }
+                let hash = self
+                    .hash_builder
+                    .hash_one(unsafe { g.keys[slot].assume_init_ref() });
+                hashes.push(hash);
+            }
 
-    pub fn is_empty(&self) -> bool {
-        self.len == 0
+            let n = hashes.len();
+            // Insertion sort by (hash, key).
+            for i in 1..n {
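+                // Move element i out of its slot. NOTE(review): if K's comparison below panics, these are dropped while slot i still holds them — confirm no double drop.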
+                let cur_hash = hashes[i];
+                let (gi, si) = chain_slot(&chain, i);
+                let cur_key = unsafe { self.groups[gi].keys[si].assume_init_read() };
+                let cur_val = unsafe { self.groups[gi].values[si].assume_init_read() };
+                // Find insertion point via linear scan backward.
+                let mut j = i;
+                while j > 0 {
+                    let (gj, sj) = chain_slot(&chain, j - 1);
+                    let prev_key = unsafe { self.groups[gj].keys[sj].assume_init_ref() };
+                    if (hashes[j - 1], prev_key) <= (cur_hash, &cur_key) {
+                        break;
+                    }
+                    j -= 1;
+                }
+                if j < i {
+                    // Shift positions j..i up by one.
+                    hashes.copy_within(j..i, j + 1);
+                    for pos in (j..i).rev() {
+                        let (src_g, src_s) = chain_slot(&chain, pos);
+                        let (dst_g, dst_s) = chain_slot(&chain, pos + 1);
+                        unsafe {
+                            let k = std::ptr::read(&self.groups[src_g].keys[src_s]);
+                            let v = std::ptr::read(&self.groups[src_g].values[src_s]);
+                            self.groups[dst_g].keys[dst_s] = k;
+                            self.groups[dst_g].values[dst_s] = v;
+                        }
+                    }
+                }
+                // Insert at position j (or write back to i if already in place).
+                hashes[j] = cur_hash;
+                let (gj, sj) = chain_slot(&chain, j);
+                self.groups[gj].keys[sj] = MaybeUninit::new(cur_key);
+                self.groups[gj].values[sj] = MaybeUninit::new(cur_val);
+            }
+            // Rebuild ctrl/tag bytes from the sorted hashes so that
+            // get/insert/entry still work after sorting.
+            // This adds a small performance penalty of maybe 6%.
+            for (pos, &h) in hashes.iter().enumerate() {
+                let (gi, si) = chain_slot(&chain, pos);
+                self.groups[gi].ctrl[si] = tag(h);
+            }
+        }
     }
 }
 
@@ -162,26 +256,11 @@ impl<K: Hash + Eq, V, S: BuildHasher> HashSortedMap<K, V, S> {
 
     fn insert_hashed(&mut self, hash: u64, key: K, value: V) -> Option<V> {
         let tag = tag(hash);
-        let hint = slot_hint(hash);
         let mut gi = self.group_index(hash);
         loop {
             let group = &mut self.groups[gi];
-            // Fast path: check preferred slot.
-            let c = group.ctrl[hint];
-            if c == CTRL_EMPTY {
-                group.ctrl[hint] = tag;
-                group.keys[hint] = MaybeUninit::new(key);
-                group.values[hint] = MaybeUninit::new(value);
-                self.len += 1;
-                return None;
-            }
-            if c == tag && unsafe { group.keys[hint].assume_init_ref() } == &key {
-                let old = std::mem::replace(unsafe { group.values[hint].assume_init_mut() }, value);
-                return Some(old);
-            }
-            // Slow path: SIMD scan group for tag match.
+            // SIMD scan group for tag match.
             let mut tag_mask = group_ops::match_tag(&group.ctrl, tag);
-            tag_mask = group_ops::clear_slot(tag_mask, hint);
             while let Some(i) = group_ops::next_match(&mut tag_mask) {
                 if unsafe { group.keys[i].assume_init_ref() } == &key {
                     let old =
@@ -190,12 +269,11 @@ impl<K: Hash + Eq, V, S: BuildHasher> HashSortedMap<K, V, S> {
                 }
             }
             // Check for empty slot in this group.
-            let empty_mask = group_ops::match_empty(&group.ctrl);
-            if empty_mask != 0 {
-                let i = group_ops::lowest(empty_mask);
-                group.ctrl[i] = tag;
-                group.keys[i] = MaybeUninit::new(key);
-                group.values[i] = MaybeUninit::new(value);
+            let occupied_slots = group_ops::count_occupied(&group.ctrl);
+            if occupied_slots != GROUP_SIZE {
+                group.ctrl[occupied_slots] = tag;
+                group.keys[occupied_slots] = MaybeUninit::new(key);
+                group.values[occupied_slots] = MaybeUninit::new(value);
                 self.len += 1;
                 return None;
             }
@@ -214,9 +292,9 @@ impl<K: Hash + Eq, V, S: BuildHasher> HashSortedMap<K, V, S> {
                 self.num_groups += 1;
                 self.groups[gi].overflow = new_gi as u32;
                 let group = &mut self.groups[new_gi];
-                group.ctrl[hint] = tag;
-                group.keys[hint] = MaybeUninit::new(key);
-                group.values[hint] = MaybeUninit::new(value);
+                group.ctrl[0] = tag;
+                group.keys[0] = MaybeUninit::new(key);
+                group.values[0] = MaybeUninit::new(value);
                 self.len += 1;
                 return None;
             }
@@ -229,31 +307,20 @@ impl<K: Hash + Eq, V, S: BuildHasher> HashSortedMap<K, V, S> {
         Q: Eq + ?Sized,
     {
         let tag = tag(hash);
-        let hint = slot_hint(hash);
         let mut gi = self.group_index(hash);
 
         loop {
             let group = &self.groups[gi];
-
-            // Fast path: preferred slot.
-            let c = group.ctrl[hint];
-            if c == tag && unsafe { group.keys[hint].assume_init_ref() }.borrow() == key {
-                return Some(unsafe { group.values[hint].assume_init_ref() });
-            }
-
-            // Slow path: SIMD scan group.
+            // SIMD scan group for tag match.
             let mut tag_mask = group_ops::match_tag(&group.ctrl, tag);
-            tag_mask = group_ops::clear_slot(tag_mask, hint);
             while let Some(i) = group_ops::next_match(&mut tag_mask) {
                 if unsafe { group.keys[i].assume_init_ref() }.borrow() == key {
                     return Some(unsafe { group.values[i].assume_init_ref() });
                 }
             }
-
-            if group_ops::match_empty(&group.ctrl) != 0 {
+            if group.ctrl[GROUP_SIZE - 1] == CTRL_EMPTY {
                 return None;
             }
-
             if group.overflow == NO_OVERFLOW {
                 return None;
             }
@@ -269,43 +336,26 @@ impl<K: Hash + Eq, V, S: BuildHasher> HashSortedMap<K, V, S> {
     /// of `&mut self` until any reallocation (`grow`).
     fn find_or_insertion_slot(&mut self, hash: u64, key: &K) -> FindResult<K, V> {
         let tag = tag(hash);
-        let hint = slot_hint(hash);
         let mut gi = self.group_index(hash);
 
         loop {
             let group = &mut self.groups[gi];
 
-            // Fast path: preferred slot.
-            let c = group.ctrl[hint];
-            if c == CTRL_EMPTY {
-                return FindResult::Vacant(Insertion::Empty {
-                    group: group as *mut _,
-                    slot: hint,
-                });
-            }
-            if c == tag && unsafe { group.keys[hint].assume_init_ref() } == key {
-                return FindResult::Found(group.values[hint].as_mut_ptr());
-            }
-
-            // Slow path: SIMD scan group for tag match.
+            // SIMD scan group for tag match.
             let mut tag_mask = group_ops::match_tag(&group.ctrl, tag);
-            tag_mask = group_ops::clear_slot(tag_mask, hint);
             while let Some(i) = group_ops::next_match(&mut tag_mask) {
                 if unsafe { group.keys[i].assume_init_ref() } == key {
                     return FindResult::Found(group.values[i].as_mut_ptr());
                 }
             }
-
             // Check for empty slot in this group.
-            let empty_mask = group_ops::match_empty(&group.ctrl);
-            if empty_mask != 0 {
-                let i = group_ops::lowest(empty_mask);
+            let occupied_slots = group_ops::count_occupied(&group.ctrl);
+            if occupied_slots != GROUP_SIZE {
                 return FindResult::Vacant(Insertion::Empty {
                     group: group as *mut _,
-                    slot: i,
+                    slot: occupied_slots,
                 });
             }
-
             // Group full — follow or report end of chain.
             if group.overflow == NO_OVERFLOW {
                 return FindResult::Vacant(Insertion::NeedsOverflow {
@@ -331,8 +381,7 @@ impl<K: Hash + Eq, V, S: BuildHasher> HashSortedMap<K, V, S> {
         self.len = 0;
 
         for group in &old_groups[..old_num_groups] {
-            let mut full_mask = group_ops::match_full(&group.ctrl);
-            while let Some(i) = group_ops::next_match(&mut full_mask) {
+            for i in 0..group_ops::count_occupied(&group.ctrl) {
                 let hash = self
                     .hash_builder
                     .hash_one(unsafe { group.keys[i].assume_init_ref() });
@@ -348,18 +397,13 @@ impl<K: Hash + Eq, V, S: BuildHasher> HashSortedMap<K, V, S> {
 
     fn insert_for_grow(&mut self, hash: u64, key_src: *const K, value_src: *const V) {
         let tag = tag(hash);
-        let mut hint = slot_hint(hash);
         let gi = self.group_index(hash);
         let mut group = &mut self.groups[gi];
 
-        loop {
-            if group.ctrl[hint] == CTRL_EMPTY {
-                break;
-            }
-            let empty_mask = group_ops::match_empty(&group.ctrl);
-            if empty_mask != 0 {
-                hint = group_ops::lowest(empty_mask);
-                break;
+        let slot = loop {
+            let occupied = group_ops::count_occupied(&group.ctrl);
+            if occupied != GROUP_SIZE {
+                break occupied;
             }
             let overflow = group.overflow;
             if overflow != NO_OVERFLOW {
@@ -369,15 +413,15 @@ impl<K: Hash + Eq, V, S: BuildHasher> HashSortedMap<K, V, S> {
                 group.overflow = new_gi as u32;
                 self.num_groups += 1;
                 group = &mut self.groups[new_gi];
-                break;
+                break 0;
             }
-        }
-        group.ctrl[hint] = tag;
+        };
+        group.ctrl[slot] = tag;
         unsafe {
-            group.keys[hint]
+            group.keys[slot]
                 .as_mut_ptr()
                 .copy_from_nonoverlapping(key_src, 1);
-            group.values[hint]
+            group.values[slot]
                 .as_mut_ptr()
                 .copy_from_nonoverlapping(value_src, 1);
         }
@@ -385,6 +429,14 @@ impl<K: Hash + Eq, V, S: BuildHasher> HashSortedMap<K, V, S> {
     }
 }
 
+// ── Chain-slot helpers for sort_by_hash ─────────────────────────────────
+
+/// Map a flat position (0..chain.len()*GROUP_SIZE) to a (group_index, slot).
+#[inline]
+fn chain_slot(chain: &[u32], pos: usize) -> (usize, usize) {
+    (chain[pos / GROUP_SIZE] as usize, pos % GROUP_SIZE)
+}
+
 // ────────────────────────────────────────────────────────────────────────
 // Entry API
 // ────────────────────────────────────────────────────────────────────────
@@ -510,7 +562,7 @@ impl<'a, K: Hash + Eq, V, S: BuildHasher> VacantEntry<'a, K, V, S> {
                 let (new_gi, new_group) = unsafe {
                     let map = &mut *map;
                     if map.num_groups as usize == map.groups.len() {
-                        return insert_after_grow(map, hash, key, value);
+                        return insert_after_grow(map, key, value);
                     }
                     let new_gi = map.num_groups as usize;
                     map.num_groups += 1;
@@ -518,12 +570,12 @@ impl<'a, K: Hash + Eq, V, S: BuildHasher> VacantEntry<'a, K, V, S> {
                     (new_gi, new_group)
                 };
                 unsafe {
-                    // SAFETY: `tail` was obtained from `&mut self.groups[..]` and
+                    // SAFETY: `tail` was obtained from `&mut groups[..]` and
                     // remains valid because no reallocation occurred between
                     // `entry()` and now (we hold the only `&mut self`).
                     (*tail).overflow = new_gi as u32;
                 }
-                (new_group, slot_hint(hash))
+                (new_group, 0)
             }
         };
 
@@ -541,42 +593,17 @@ impl<'a, K: Hash + Eq, V, S: BuildHasher> VacantEntry<'a, K, V, S> {
 }
 
 /// Cold path: the chain was full, the table is at capacity, and we need to
-/// grow before inserting. Re-walks via the slow path after grow.
-///
-/// After `grow()` doubles `num_primary` (`n_bits += 1`), our key's new
-/// primary group can have at most ~half the old chain's keys, so hitting
-/// `NeedsOverflow` again would require `GROUP_SIZE` keys to all collide on
-/// one extra bit of hash — essentially impossible for any reasonable hash.
-/// (`insert_for_grow` relies on the same assumption to skip its own
-/// capacity check.)
+/// grow before inserting. Grows the map, then re-walks via `entry()` to find
+/// the new insertion slot.
 #[cold]
 #[inline(never)]
 fn insert_after_grow<K: Hash + Eq, V, S: BuildHasher>(
     map: &mut HashSortedMap<K, V, S>,
-    hash: u64,
     key: K,
     value: V,
 ) -> &mut V {
     map.grow();
-    match map.find_or_insertion_slot(hash, &key) {
-        FindResult::Vacant(Insertion::Empty { group, slot }) => {
-            let tag = tag(hash);
-            // SAFETY: `group` points into `map.groups` and is valid for `'a`.
-            unsafe {
-                let g = &mut *group;
-                g.ctrl[slot] = tag;
-                g.keys[slot] = MaybeUninit::new(key);
-                g.values[slot] = MaybeUninit::new(value);
-                map.len += 1;
-                g.values[slot].assume_init_mut()
-            }
-        }
-        // After grow, the new primary group for `key` cannot be full (see
-        // function docs), and the key wasn't in the table before grow.
-        FindResult::Vacant(Insertion::NeedsOverflow { .. }) | FindResult::Found(_) => {
-            unreachable!("post-grow walk must hit an empty slot")
-        }
-    }
+    map.entry(key).or_insert(value)
 }
 
 impl<K, V, S> Drop for HashSortedMap<K, V, S> {
@@ -812,4 +839,115 @@ mod tests {
             assert_eq!(m.get(&i), Some(&i));
         }
     }
+
+    // ── sort_by_hash tests ──────────────────────────────────────────────
+
+    #[test]
+    fn sort_by_hash_empty() {
+        let mut map: HashSortedMap<u32, u32> = HashSortedMap::new();
+        map.sort_by_hash();
+        assert_eq!(map.len(), 0);
+    }
+
+    #[test]
+    fn sort_by_hash_single() {
+        let mut map = HashSortedMap::new();
+        map.insert(42u32, "hello");
+        map.sort_by_hash();
+        assert_eq!(map.len(), 1);
+        let entries: Vec<_> = map.into_iter().collect();
+        assert_eq!(entries, vec![(42, "hello")]);
+    }
+
+    #[test]
+    fn sort_by_hash_preserves_entries() {
+        let mut map = HashSortedMap::new();
+        for i in 0..200u32 {
+            map.insert(i, i * 10);
+        }
+        map.sort_by_hash();
+        assert_eq!(map.len(), 200);
+        // Lookups must still work after sorting.
+        for i in 0..200u32 {
+            assert_eq!(map.get(&i), Some(&(i * 10)), "get failed for key {i}");
+        }
+        let mut entries: Vec<_> = map.into_iter().collect();
+        entries.sort_by_key(|&(k, _)| k);
+        for i in 0..200u32 {
+            assert_eq!(entries[i as usize], (i, i * 10), "missing key {i}");
+        }
+    }
+
+    #[test]
+    fn sort_by_hash_produces_hash_order() {
+        use std::collections::hash_map::RandomState;
+
+        let hasher = RandomState::new();
+        let mut map = HashSortedMap::with_hasher(hasher.clone());
+        for i in 0..500u32 {
+            map.insert(i, i);
+        }
+        map.sort_by_hash();
+        // Iteration should now yield entries in (hash, key) order.
+        let mut prev_hash = 0u64;
+        let mut prev_key = 0u32;
+        let mut first = true;
+        for (&k, _) in &map {
+            let h = hasher.hash_one(k);
+            if !first {
+                assert!(
+                    (h, k) >= (prev_hash, prev_key),
+                    "(hash, key) order violated: ({prev_hash:#x}, {prev_key}) > ({h:#x}, {k})"
+                );
+            }
+            prev_hash = h;
+            prev_key = k;
+            first = false;
+        }
+    }
+
+    #[test]
+    fn sort_by_hash_with_overflow() {
+        // Force overflow chains via fixed hash — all keys collide, so sort
+        // should produce key order as tie-breaker.
+        let mut map = HashSortedMap::with_capacity_and_hasher(1, FixedState(0));
+        for i in 0..50u32 {
+            map.insert(i, i);
+        }
+        map.sort_by_hash();
+        assert_eq!(map.len(), 50);
+        // All hashes are equal, so entries should be in key order.
+        let entries: Vec<_> = map.into_iter().collect();
+        for i in 0..50u32 {
+            assert_eq!(entries[i as usize], (i, i), "key order violated at {i}");
+        }
+    }
+
+    #[test]
+    fn sort_by_hash_with_strings() {
+        use std::collections::hash_map::RandomState;
+
+        let hasher = RandomState::new();
+        let mut map = HashSortedMap::with_hasher(hasher.clone());
+        for i in 0..100u32 {
+            map.insert(format!("key-{i}"), format!("val-{i}"));
+        }
+        map.sort_by_hash();
+        assert_eq!(map.len(), 100);
+        let mut prev_hash = 0u64;
+        let mut prev_key = String::new();
+        let mut first = true;
+        for (k, _) in &map {
+            let h = hasher.hash_one(k);
+            if !first {
+                assert!(
+                    (h, k) >= (prev_hash, &prev_key),
+                    "(hash, key) order violated"
+                );
+            }
+            prev_hash = h;
+            prev_key = k.clone();
+            first = false;
+        }
+    }
 }
diff --git a/crates/hash-sorted-map/src/iter.rs b/crates/hash-sorted-map/src/iter.rs
new file mode 100644
index 0000000..e981bad
--- /dev/null
+++ b/crates/hash-sorted-map/src/iter.rs
@@ -0,0 +1,408 @@
+use std::marker::PhantomData;
+use std::mem::ManuallyDrop;
+
+use crate::group_ops::{CTRL_EMPTY, GROUP_SIZE};
+
+use super::group::Group;
+use super::hash_sorted_map::{HashSortedMap, NO_OVERFLOW};
+
+/// State shared by `Iter`, `IterMut`, and `IntoIter`: tracks which primary
+/// group we're visiting and where we are within that group's overflow chain.
+struct IterCursor {
+    /// Index of the primary group whose chain is being scanned (0..num_primary).
+    primary: u32,
+    /// Number of primary groups (1 << n_bits).
+    num_primary: u32,
+    /// Current position within the chain we're scanning: index of the group
+    /// in the groups array, and index of the next slot to examine in it.
+    current_group: u32,
+    current_slot: u32,
+}
+
+impl IterCursor {
+    fn new(n_bits: u32) -> Self {
+        let num_primary = 1u32 << n_bits;
+        Self {
+            primary: 0,
+            num_primary,
+            current_group: 0,
+            current_slot: 0,
+        }
+    }
+
+    /// Advance to the next occupied slot, returning `(group_index, slot)`.
+    /// Visits primary groups 0..num_primary in order; for each, follows the
+    /// overflow chain. Within a group, yields slots in index order until the first empty one.
+    fn next_slot<K, V>(&mut self, groups: &[Group<K, V>]) -> Option<(usize, usize)> {
+        loop {
+            let gi = self.current_group as usize;
+            if self.current_slot < GROUP_SIZE as u32 {
+                let slot = self.current_slot;
+                if groups[gi].ctrl[slot as usize] != CTRL_EMPTY {
+                    self.current_slot += 1;
+                    return Some((gi, slot as usize));
+                }
+            }
+            // Current group exhausted — try overflow chain.
+            if gi < groups.len() && groups[gi].overflow != NO_OVERFLOW {
+                self.current_group = groups[gi].overflow;
+                self.current_slot = 0;
+                continue;
+            }
+            self.primary += 1;
+            // No more overflow — move to next primary group.
+            if self.primary >= self.num_primary {
+                return None;
+            }
+            self.current_group = self.primary;
+            self.current_slot = 0;
+        }
+    }
+}
+
+/// Immutable iterator over `(&K, &V)` pairs.
+pub struct Iter<'a, K, V> {
+    groups: &'a [Group<K, V>],
+    cursor: IterCursor,
+}
+
+impl<'a, K, V> Iterator for Iter<'a, K, V> {
+    type Item = (&'a K, &'a V);
+    fn next(&mut self) -> Option<Self::Item> {
+        let (gi, slot) = self.cursor.next_slot(self.groups)?;
+        let group = &self.groups[gi];
+        // SAFETY: `next_slot` only yields slots whose ctrl byte is not `CTRL_EMPTY`, i.e. occupied.
+        unsafe {
+            Some((
+                group.keys[slot].assume_init_ref(),
+                group.values[slot].assume_init_ref(),
+            ))
+        }
+    }
+}
+
+/// Mutable iterator over `(&K, &mut V)` pairs.
+pub struct IterMut<'a, K, V> {
+    groups: *mut [Group<K, V>],
+    cursor: IterCursor,
+    _marker: PhantomData<&'a mut [Group<K, V>]>,
+}
+
+impl<'a, K, V> Iterator for IterMut<'a, K, V> {
+    type Item = (&'a K, &'a mut V);
+    fn next(&mut self) -> Option<Self::Item> {
+        // SAFETY: we use raw pointer to avoid holding multiple &mut borrows.
+        // The cursor guarantees each slot is yielded at most once.
+        let groups = unsafe { &mut *self.groups };
+        let (gi, slot) = self.cursor.next_slot(groups)?;
+        let group = &mut groups[gi];
+        unsafe {
+            Some((
+                group.keys[slot].assume_init_ref(),
+                group.values[slot].assume_init_mut(),
+            ))
+        }
+    }
+}
+
+/// Owning iterator that yields `(K, V)` pairs and consumes the map.
+pub struct IntoIter<K, V> {
+    groups: Box<[Group<K, V>]>,
+    len: usize,
+    cursor: IterCursor,
+}
+
+impl<K, V> Iterator for IntoIter<K, V> {
+    type Item = (K, V);
+    fn next(&mut self) -> Option<Self::Item> {
+        let (gi, slot) = self.cursor.next_slot(&self.groups)?;
+        let group = &self.groups[gi];
+        // SAFETY: `next_slot` only yields slots whose ctrl byte is not `CTRL_EMPTY`, each at most once.
+        unsafe {
+            Some((
+                group.keys[slot].assume_init_read(),
+                group.values[slot].assume_init_read(),
+            ))
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (0, Some(self.len))
+    }
+}
+
+impl<K, V> Drop for IntoIter<K, V> {
+    fn drop(&mut self) {
+        // Continue iterating to drop remaining entries one by one.
+        while let Some((gi, slot)) = self.cursor.next_slot(&self.groups) {
+            unsafe {
+                self.groups[gi].keys[slot].assume_init_drop();
+                self.groups[gi].values[slot].assume_init_drop();
+            }
+        }
+    }
+}
+
+// ── HashSortedMap iteration ─────────────────────────────────────────────
+
+impl<K, V, S> HashSortedMap<K, V, S> {
+    /// Returns an iterator over `(&K, &V)` pairs.
+    ///
+    /// Entries are visited in group-index order (primary groups in order of
+    /// hash prefix, each followed by its overflow chain). Within each group,
+    /// occupied slots are visited in slot order.
+    pub fn iter(&self) -> Iter<'_, K, V> {
+        Iter {
+            groups: &self.groups,
+            cursor: IterCursor::new(self.n_bits),
+        }
+    }
+
+    /// Returns a mutable iterator over `(&K, &mut V)` pairs.
+    pub fn iter_mut(&mut self) -> IterMut<'_, K, V> {
+        let cursor = IterCursor::new(self.n_bits);
+        IterMut {
+            groups: &mut *self.groups as *mut [Group<K, V>],
+            cursor,
+            _marker: PhantomData,
+        }
+    }
+
+    /// Consumes the map and returns an iterator over `(K, V)` pairs.
+    #[allow(clippy::should_implement_trait)]
+    pub fn into_iter(self) -> IntoIter<K, V> {
+        let cursor = IterCursor::new(self.n_bits);
+        // Skip `Drop` on `self`: `groups` is moved out below. NOTE(review): `hash_builder` is never dropped here.
+        let mut this = ManuallyDrop::new(self);
+        let groups = unsafe { std::ptr::read(&this.groups) };
+        let len = this.len;
+        // Zero out len so if Drop somehow runs it sees an empty map.
+        this.len = 0;
+        IntoIter {
+            groups,
+            len,
+            cursor,
+        }
+    }
+}
+
+impl<K, V, S> IntoIterator for HashSortedMap<K, V, S> {
+    type Item = (K, V);
+    type IntoIter = IntoIter<K, V>;
+    fn into_iter(self) -> Self::IntoIter {
+        self.into_iter()
+    }
+}
+
+impl<'a, K, V, S> IntoIterator for &'a HashSortedMap<K, V, S> {
+    type Item = (&'a K, &'a V);
+    type IntoIter = Iter<'a, K, V>;
+    fn into_iter(self) -> Self::IntoIter {
+        self.iter()
+    }
+}
+
+impl<'a, K, V, S> IntoIterator for &'a mut HashSortedMap<K, V, S> {
+    type Item = (&'a K, &'a mut V);
+    type IntoIter = IterMut<'a, K, V>;
+    fn into_iter(self) -> Self::IntoIter {
+        self.iter_mut()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::hash::{BuildHasher, Hasher};
+
+    use super::*;
+
+    /// Degenerate hasher that returns a fixed hash code, for forcing collisions.
+    struct FixedHasher(u64);
+
+    impl Hasher for FixedHasher {
+        fn finish(&self) -> u64 {
+            self.0
+        }
+        fn write(&mut self, _bytes: &[u8]) {}
+    }
+
+    #[derive(Clone)]
+    struct FixedState(u64);
+
+    impl BuildHasher for FixedState {
+        type Hasher = FixedHasher;
+        fn build_hasher(&self) -> FixedHasher {
+            FixedHasher(self.0)
+        }
+    }
+
+    #[test]
+    fn iter_empty() {
+        let map: HashSortedMap<u32, u32> = HashSortedMap::new();
+        assert_eq!(map.iter().count(), 0);
+    }
+
+    #[test]
+    fn iter_yields_all_entries() {
+        let mut map = HashSortedMap::new();
+        for i in 0..100u32 {
+            map.insert(i, i * 10);
+        }
+        let mut collected: Vec<(u32, u32)> = map.iter().map(|(&k, &v)| (k, v)).collect();
+        collected.sort();
+        assert_eq!(collected.len(), 100);
+        for i in 0..100u32 {
+            assert_eq!(collected[i as usize], (i, i * 10));
+        }
+    }
+
+    #[test]
+    fn iter_with_overflow_chains() {
+        let mut map = HashSortedMap::with_capacity_and_hasher(1, FixedState(0xABCD));
+        for i in 0..50u32 {
+            map.insert(i, i);
+        }
+        let collected: Vec<u32> = map.iter().map(|(&k, _)| k).collect();
+        assert_eq!(collected.len(), 50);
+        let mut sorted = collected.clone();
+        sorted.sort();
+        sorted.dedup();
+        assert_eq!(sorted.len(), 50);
+    }
+
+    #[test]
+    fn iter_mut_mutates_values() {
+        let mut map = HashSortedMap::new();
+        for i in 0..20u32 {
+            map.insert(i, i);
+        }
+        for (_, v) in map.iter_mut() {
+            *v *= 2;
+        }
+        for i in 0..20u32 {
+            assert_eq!(map.get(&i), Some(&(i * 2)));
+        }
+    }
+
+    #[test]
+    fn into_iter_yields_all() {
+        let mut map = HashSortedMap::new();
+        for i in 0..100u32 {
+            map.insert(i, i * 3);
+        }
+        let mut collected: Vec<(u32, u32)> = map.into_iter().collect();
+        collected.sort();
+        assert_eq!(collected.len(), 100);
+        for i in 0..100u32 {
+            assert_eq!(collected[i as usize], (i, i * 3));
+        }
+    }
+
+    #[test]
+    fn into_iter_partial_consume_then_drop() {
+        let mut map: HashSortedMap<String, String> = HashSortedMap::new();
+        for i in 0..50u32 {
+            map.insert(format!("key-{i}"), format!("val-{i}"));
+        }
+        let mut iter = map.into_iter();
+        for _ in 0..10 {
+            let _ = iter.next();
+        }
+        drop(iter);
+    }
+
+    #[test]
+    fn into_iter_empty() {
+        let map: HashSortedMap<u32, u32> = HashSortedMap::new();
+        assert_eq!(map.into_iter().count(), 0);
+    }
+
+    #[test]
+    fn into_iter_with_overflow() {
+        let mut map = HashSortedMap::with_capacity_and_hasher(1, FixedState(0));
+        for i in 0..80u32 {
+            map.insert(i, i);
+        }
+        let collected: Vec<(u32, u32)> = map.into_iter().collect();
+        assert_eq!(collected.len(), 80);
+        let mut keys: Vec<u32> = collected.into_iter().map(|(k, _)| k).collect();
+        keys.sort();
+        keys.dedup();
+        assert_eq!(keys.len(), 80);
+    }
+
+    #[test]
+    fn into_iter_after_grow() {
+        let mut map = HashSortedMap::with_capacity(1);
+        for i in 0..500u32 {
+            map.insert(i, i);
+        }
+        let collected: Vec<(u32, u32)> = map.into_iter().collect();
+        assert_eq!(collected.len(), 500);
+    }
+
+    /// Track drops to verify no leaks or double-drops.
+    #[test]
+    fn into_iter_drop_count() {
+        use std::cell::Cell;
+        use std::rc::Rc;
+
+        #[derive(Clone)]
+        struct Tracked(Rc<Cell<usize>>);
+        impl Drop for Tracked {
+            fn drop(&mut self) {
+                self.0.set(self.0.get() + 1);
+            }
+        }
+
+        let counter = Rc::new(Cell::new(0usize));
+        let n = 100;
+        {
+            let mut map = HashSortedMap::new();
+            for i in 0..n {
+                map.insert(i, Tracked(counter.clone()));
+            }
+            let mut iter = map.into_iter();
+            for _ in 0..n / 2 {
+                let _ = iter.next();
+            }
+        }
+        assert_eq!(counter.get(), n);
+    }
+
+    #[test]
+    fn for_loop_ref() {
+        let mut map = HashSortedMap::new();
+        map.insert(1, "a");
+        map.insert(2, "b");
+        let mut count = 0;
+        for (_k, _v) in &map {
+            count += 1;
+        }
+        assert_eq!(count, 2);
+    }
+
+    #[test]
+    fn for_loop_mut() {
+        let mut map = HashSortedMap::new();
+        map.insert(1u32, 10u32);
+        map.insert(2, 20);
+        for (_, v) in &mut map {
+            *v += 1;
+        }
+        assert_eq!(map.get(&1), Some(&11));
+        assert_eq!(map.get(&2), Some(&21));
+    }
+
+    #[test]
+    fn for_loop_owned() {
+        let mut map = HashSortedMap::new();
+        map.insert(1, 10);
+        map.insert(2, 20);
+        let mut sum = 0;
+        for (_k, v) in map {
+            sum += v;
+        }
+        assert_eq!(sum, 30);
+    }
+}
diff --git a/crates/hash-sorted-map/src/lib.rs b/crates/hash-sorted-map/src/lib.rs
index 79dac69..3ff5461 100644
--- a/crates/hash-sorted-map/src/lib.rs
+++ b/crates/hash-sorted-map/src/lib.rs
@@ -1,4 +1,7 @@
+mod group;
 mod group_ops;
 mod hash_sorted_map;
+mod iter;
 
 pub use hash_sorted_map::{Entry, HashSortedMap, OccupiedEntry, VacantEntry};
+pub use iter::{IntoIter, Iter, IterMut};