Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
407c5e4
Use the nested-rtxns version of heed and arroy
Kerollmops Dec 4, 2025
737438a
Bump the version of hannoy
Kerollmops Dec 4, 2025
133fab2
Bump the Rust toolchain
Kerollmops Dec 4, 2025
5120457
Make clippy happy
Kerollmops Dec 4, 2025
6a15f4e
Cargo fmt everything
Kerollmops Dec 4, 2025
46c0c22
Fix snapshots
Kerollmops Dec 16, 2025
1eae164
Bump version to 0.1.3-nested-rtxns
Kerollmops Jan 5, 2026
ebc162d
Uses explore factor instead of count to stop the search
ManyTheFish Jan 13, 2026
478838f
Update src/reader.rs
ManyTheFish Jan 14, 2026
1b53f4a
Bump the version to 0.1.4-nested-rtxns
Kerollmops Jan 14, 2026
3704936
Fix insta snapshots
Kerollmops Jan 14, 2026
0f2ef76
Introduce new parameters on the query builder to control linear scanning
Kerollmops Jan 16, 2026
ca60b8a
Use a binary heap when linear searching
Kerollmops Jan 16, 2026
acaa84f
Change filtering logic
nnethercott Jan 19, 2026
c3d4e39
Bump version to 0.1.5-nested-rtxns
Kerollmops Jan 20, 2026
aecfc9c
Update the insta snapshots
Kerollmops Jan 20, 2026
9a8e43e
Merge pull request #121 from nnethercott/sort-on-small-subset
Kerollmops Jan 20, 2026
9ca4936
Update a small comment
Kerollmops Jan 20, 2026
c6ce46b
No longer compute the mean degree
Kerollmops Feb 23, 2026
d8dce33
Change info to trace log levels
Kerollmops Feb 23, 2026
70e7ed7
Merge pull request #124 from nnethercott/speed-up-build
Kerollmops Feb 23, 2026
e778bb4
Bump version to 0.1.6-nested-rtxns
Kerollmops Feb 23, 2026
20ddc20
Make clippy happy
Kerollmops Feb 23, 2026
edccb28
Fix reading metadata of empty database
Kerollmops Feb 25, 2026
9a9f64b
Remove the ImmutableItems, ImmutableLinks, and FrozenReader
Kerollmops Feb 23, 2026
e6bc896
Reintroduce the FrozenReader but based on nested read txns
Kerollmops Feb 24, 2026
b145170
Pull nested rtxns from a pool
Kerollmops Feb 25, 2026
4476b91
Prefer using prefix iter when possible
Kerollmops Feb 25, 2026
b7cd801
Avoid database full scans
Kerollmops Feb 26, 2026
e0e1cd3
Introduce the update status stone and avoid unnecessary vector lookups
Kerollmops Feb 26, 2026
2738f87
Merge pull request #125 from nnethercott/replace-fetching-items-by-ne…
Kerollmops Mar 5, 2026
7fda4b0
Bump version to 0.1.7-nested-rtxns
Kerollmops Mar 5, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
- beta
steps:
- uses: actions/checkout@v1
- uses: dtolnay/rust-toolchain@1.85
- uses: dtolnay/rust-toolchain@1.89
- uses: actions-rs/cargo@v1
with:
command: build
Expand All @@ -39,7 +39,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- uses: dtolnay/rust-toolchain@1.85
- uses: dtolnay/rust-toolchain@1.89
with:
profile: minimal
components: clippy, rustfmt
Expand All @@ -61,7 +61,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- uses: dtolnay/rust-toolchain@1.85
- uses: dtolnay/rust-toolchain@1.89
- name: Run fuzzer
env:
HANNOY_FUZZ_DURATION_SEC: 1800
Expand Down
9 changes: 5 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "hannoy"
description = "HNSW Approximate Nearest Neighbors in Rust, based on LMDB and optimized for memory usage"
version = "0.1.2"
version = "0.1.7-nested-rtxns"
repository = "https://github.com/nnethercott/hannoy"
keywords = [
"HNSW",
Expand All @@ -27,7 +27,7 @@ crate-type = ["cdylib", "rlib"]
bytemuck = { version = "1.21.0", features = ["derive", "extern_crate_alloc"] }
byteorder = "1.5.0"
hashbrown = "0.15.4"
heed = { version = "0.22.0", default-features = false }
heed = { version = "0.22.1-nested-rtxns", default-features = false }
min-max-heap = "1.3.0"
page_size = "0.6.0"
papaya = "0.2.3"
Expand All @@ -44,7 +44,8 @@ pyo3-stub-gen = { version = "0.13.1", optional = true }
once_cell = { version = "1.21.3", optional = true }
tempfile = { version = "3.21.0", optional = true }
parking_lot = { version = "0.12.4", optional = true }

thread_local = "1.1.9"
crossbeam-channel = "0.5.15"

[target.'cfg(not(windows))'.dependencies]
madvise = "0.1.0"
Expand All @@ -53,7 +54,7 @@ madvise = "0.1.0"
anyhow = "1.0.95"
approx = "0.5.1"
arbitrary = { version = "1.4.1", features = ["derive"] }
arroy = "0.6.1"
arroy = { version = "0.6.4-nested-rtxns", git = "https://github.com/meilisearch/arroy", "tag" = "v0.6.4-nested-rtxns", default-features = false }
clap = { version = "4.5.24", features = ["derive"] }
divan = { version = "3.0.5", package = "codspeed-divan-compat" }
hnsw_rs = "0.3.2"
Expand Down
6 changes: 4 additions & 2 deletions benches/benchmark.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use hannoy::{distances::Cosine, Database, Writer};
use hannoy::distances::Cosine;
use hannoy::{Database, Writer};
use heed::{Env, EnvOpenOptions, RwTxn};
use rand::{rngs::StdRng, Rng, SeedableRng};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tempfile::tempdir;

static M: usize = 16;
Expand Down
8 changes: 4 additions & 4 deletions benches/speed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@ use std::fs::OpenOptions;
use std::hint::black_box;
use std::io::Write;

use hannoy::Reader;
use hannoy::{distances::Cosine, Database, Writer};
use hannoy::distances::Cosine;
use hannoy::{Database, Reader, Writer};
use heed::{Env, EnvOpenOptions, RwTxn};
use hnsw_rs;
use hnsw_rs::hnsw::Hnsw;
use hnsw_rs::prelude::DistCosine;
use rand::thread_rng;
use rand::{rngs::StdRng, Rng, SeedableRng};
use rand::rngs::StdRng;
use rand::{thread_rng, Rng, SeedableRng};
use tempfile::tempdir;

static M: usize = 16;
Expand Down
5 changes: 3 additions & 2 deletions src/distance/hamming.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
use std::fmt;

use bytemuck::{Pod, Zeroable};

use crate::distance::Distance;
use crate::node::Item;
use crate::unaligned_vector::{Binary, UnalignedVector};
use bytemuck::{Pod, Zeroable};

/// The Hamming distance between two vectors is the number of positions at
/// which the corresponding symbols are different.
Expand Down Expand Up @@ -70,7 +71,7 @@ pub fn hamming_bitwise_fast(u: &[u8], v: &[u8]) -> f32 {
})
.sum::<u32>();

if u.len() % CHUNK_SIZE != 0 {
if !u.len().is_multiple_of(CHUNK_SIZE) {
distance += u
.chunks_exact(CHUNK_SIZE)
.remainder()
Expand Down
5 changes: 4 additions & 1 deletion src/error.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
use std::io;

use crate::{key::Key, node_id::NodeMode, version::Version, ItemId, LayerId};
use crate::key::Key;
use crate::node_id::NodeMode;
use crate::version::Version;
use crate::{ItemId, LayerId};

/// The different set of errors that hannoy can encounter.
#[derive(Debug, thiserror::Error)]
Expand Down
43 changes: 21 additions & 22 deletions src/hnsw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ use tracing::{debug, instrument};
use crate::key::Key;
use crate::node::{Item, Links, Node};
use crate::ordered_float::OrderedFloat;
use crate::parallel::{ImmutableItems, ImmutableLinks};
use crate::parallel::FrozenReader;
use crate::progress::{AtomicInsertItemsStep, HannoyBuild};
use crate::stats::BuildStats;
use crate::writer::{BuildOption, FrozenReader};
use crate::writer::BuildOption;
use crate::{Database, Distance, Error, ItemId, Result, CANCELLATION_PROBING};

pub(crate) type ScoredLink = (OrderedFloat, ItemId);
Expand Down Expand Up @@ -135,9 +135,7 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>
{
let mut build_stats = BuildStats::new();

let items = ImmutableItems::new(wtxn, database, index, options)?;
let links = ImmutableLinks::new(wtxn, database, index, database.len(wtxn)?, options)?;
let lmdb = FrozenReader { index, items: &items, links: &links };
let lmdb = FrozenReader::new(wtxn, index, database)?;

// Generate a random level for each point
let mut cur_max_level = usize::MIN;
Expand Down Expand Up @@ -173,7 +171,8 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>

level_groups.into_iter().try_for_each(|grp| {
grp.into_par_iter().try_for_each(|&(item_id, lvl)| {
if cancel_index.fetch_add(1, Relaxed) % CANCELLATION_PROBING == 0 && (self.cancel)()
if cancel_index.fetch_add(1, Relaxed).is_multiple_of(CANCELLATION_PROBING)
&& (self.cancel)()
{
Err(Error::BuildCancelled)
} else {
Expand All @@ -187,6 +186,8 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>

self.fill_gaps_from_deleted(&lmdb, to_delete, options)?;

drop(lmdb);

// Single-threaded write to lmdb
options.progress.update(HannoyBuild::WritingTheItems);
let mut cancellation_index = 0;
Expand All @@ -211,14 +212,13 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>
}
}

build_stats.compute_mean_degree(wtxn, &database, index)?;
Ok(build_stats)
}

/// This function resolves several nasty edge cases that can occur, namely : deleted
/// or partially deleted entrypoints, new indexed points assigned to higher layers, ensuring
/// entry points are present on all layers before build
#[instrument(skip(self, options, lmdb, levels))]
#[instrument(level = "trace", skip(self, options, lmdb, levels))]
fn prepare_levels_and_entry_points<P>(
&mut self,
levels: &mut Vec<(u32, usize)>,
Expand All @@ -242,7 +242,7 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>
let mut l = self.max_level;
for _ in del_eps.iter() {
loop {
for result in lmdb.links.iter_layer(l as u8) {
for result in lmdb.iter_layer_links(l as u8)? {
let ((item_id, _), _) = result?;

if !to_delete.contains(item_id) && new_eps.insert(item_id) {
Expand Down Expand Up @@ -297,7 +297,7 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>
) -> Result<()> {
let mut eps = Vec::from_iter(self.entry_points.clone());

let q = lmdb.get_item(query)?;
let q = lmdb.item(query)?;

// Greedy search with: ef = 1
for lvl in (level + 1..=self.max_level).rev() {
Expand Down Expand Up @@ -333,7 +333,7 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>
/// Algorithm 4 from FreshDiskANN paper.
fn fill_gaps_from_deleted<P>(
&mut self,
lmdb: &FrozenReader<D>,
lmdb: &FrozenReader<'_, D>,
to_delete: &RoaringBitmap,
options: &BuildOption<P>,
) -> Result<()>
Expand All @@ -344,8 +344,7 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>
options.progress.update(HannoyBuild::PatchOldNewDeletedLinks);

let links_in_db: Vec<_> = lmdb
.links
.iter()
.iter_links()?
.map(|result| {
result.map(|((id, lvl), v)| {
// Resize the layers if necessary. We must do this to accommodate links from
Expand All @@ -361,7 +360,7 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>
let cancel_index = AtomicUsize::new(0);

links_in_db.into_par_iter().try_for_each(|result| {
if cancel_index.fetch_add(1, Ordering::Relaxed) % CANCELLATION_PROBING == 0
if cancel_index.fetch_add(1, Ordering::Relaxed).is_multiple_of(CANCELLATION_PROBING)
&& (self.cancel)()
{
return Err(Error::BuildCancelled);
Expand All @@ -382,7 +381,7 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>

let mut bitmap = RoaringBitmap::new();
for item_id in del_subset.iter() {
bitmap.extend(lmdb.get_links(item_id, lvl).unwrap_or_default().iter());
bitmap.extend(lmdb.links(item_id, lvl).unwrap_or_default().iter());
}
bitmap |= links;
bitmap -= to_delete;
Expand All @@ -401,10 +400,10 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>
}

// Case 2: Some old links may be popped to fill gaps from deleted nodes
let curr = &lmdb.get_item(id)?;
let curr = &lmdb.item(id)?;

for other in bitmap {
let dist = D::distance(curr, &lmdb.get_item(other)?);
let dist = D::distance(curr, &lmdb.item(other)?);
new_links.push((OrderedFloat(dist), other));
}
let pruned = self.robust_prune(new_links, lvl, self.alpha, lmdb)?;
Expand Down Expand Up @@ -436,7 +435,7 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>
let mut res = Vec::new();

// O(1) from frozen reader
if let Ok(Links { links }) = lmdb.get_links(item_id, level) {
if let Ok(Links { links }) = lmdb.links(item_id, level) {
build_stats.incr_lmdb_hits();
res.extend(links.iter());
}
Expand All @@ -457,7 +456,7 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>
}

#[allow(clippy::too_many_arguments)]
#[instrument(name = "walk_layer", skip(self, lmdb, query))]
#[instrument(level = "trace", name = "walk_layer", skip(self, lmdb, query))]
fn walk_layer(
&self,
query: &Item<D>,
Expand All @@ -473,7 +472,7 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>

// Register all entry points as visited and populate candidates
for &ep in eps {
let ve = lmdb.get_item(ep)?;
let ve = lmdb.item(ep)?;
let dist = D::distance(query, &ve);

candidates.push((Reverse(OrderedFloat(dist)), ep));
Expand All @@ -496,7 +495,7 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>
}
// If the item isn't in the frozen reader it must have been deleted from the index,
// in which case it's OK not to explore it
let item = match lmdb.get_item(point) {
let item = match lmdb.item(point) {
Ok(item) => item,
Err(Error::MissingKey { .. }) => continue,
Err(e) => return Err(e),
Expand Down Expand Up @@ -582,7 +581,7 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>
// ensure we're closer to the query than we are to other candidates
let mut ok_to_add = true;
for i in selected.iter().map(|(_, i)| *i) {
let d = D::distance(&lmdb.get_item(c)?, &lmdb.get_item(i)?);
let d = D::distance(&lmdb.item(c)?, &lmdb.item(i)?);
if OrderedFloat(d * alpha) < dist_to_query {
ok_to_add = false;
break;
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ mod reader;
mod roaring;
mod spaces;
mod stats;
mod update_status;
mod version;
mod writer;

Expand Down
12 changes: 10 additions & 2 deletions src/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,16 @@ impl<'a> heed::BytesDecode<'a> for MetadataCodec {
let bytes = &bytes[size_of::<u32>()..];
let items = RoaringBitmap::deserialize_from(&bytes[..items_size])?;
let bytes = &bytes[items_size..];
let entry_points = ItemIds::from_bytes(&bytes[..bytes.len() - 1]);
let max_level = bytes[bytes.len() - 1];

let entry_points;
let max_level;
if bytes.is_empty() {
entry_points = ItemIds::from_slice(&[]);
max_level = 0;
} else {
entry_points = ItemIds::from_bytes(&bytes[..bytes.len() - 1]);
max_level = bytes[bytes.len() - 1];
};

Ok(Metadata { dimensions, items, distance, entry_points, max_level })
}
Expand Down
10 changes: 7 additions & 3 deletions src/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -189,11 +189,15 @@ impl fmt::Display for InvalidNodeDecoding {

#[cfg(test)]
mod tests {
use super::{Item, Links, Node, NodeCodec};
use crate::{distance::Cosine, internals::UnalignedVector, Distance};
use std::borrow::Cow;

use heed::{BytesDecode, BytesEncode};
use roaring::RoaringBitmap;
use std::borrow::Cow;

use super::{Item, Links, Node, NodeCodec};
use crate::distance::Cosine;
use crate::internals::UnalignedVector;
use crate::Distance;

#[test]
fn check_bytes_encode_decode() {
Expand Down
Loading
Loading