Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ name: Rust CI

on:
push:
branches: ["main"]
branches: ["main", "release-v0.6"]
pull_request:
branches: ["main"]
branches: ["main", "release-v0.6"]

env:
CARGO_TERM_COLOR: always
Expand All @@ -17,14 +17,14 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest-xlarge, windows-latest]
os: [ubuntu-latest, macos-latest, windows-latest]
rust:
- stable
- beta

steps:
- uses: actions/checkout@v1
- uses: dtolnay/rust-toolchain@1.81
- uses: dtolnay/rust-toolchain@1.85
- uses: actions-rs/cargo@v1
with:
command: build
Expand All @@ -41,7 +41,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- uses: dtolnay/rust-toolchain@1.81
- uses: dtolnay/rust-toolchain@1.85
with:
profile: minimal
components: clippy, rustfmt
Expand Down
4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "arroy"
description = "Annoy-inspired Approximate Nearest Neighbors in Rust, based on LMDB and optimized for memory usage"
version = "0.6.1"
version = "0.6.3"
documentation = "https://docs.rs/arroy"
repository = "https://github.com/meilisearch/arroy"
keywords = ["ANN-search", "Graph-algorithms", "Vector-Search", "Store"]
Expand Down Expand Up @@ -31,9 +31,11 @@ enum-iterator = "2.1.0"

[dev-dependencies]
anyhow = "1.0.95"
approx = "0.5.1"
arbitrary = { version = "1.4.1", features = ["derive"] }
clap = { version = "4.5.24", features = ["derive"] }
env_logger = "0.11.6"
hannoy = "0.0.4"
insta = "1.42.0"
instant-distance = "0.6.1"
proptest = "1.6.0"
Expand Down
19 changes: 18 additions & 1 deletion src/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,24 @@ impl<'a, D: Distance> BytesDecode<'a> for NodeCodec<D> {
[DESCENDANTS_TAG, bytes @ ..] => Ok(Node::Descendants(Descendants {
descendants: Cow::Owned(RoaringBitmap::deserialize_from(bytes)?),
})),
unknown => panic!("What the fuck is an {unknown:?}"),
[unknown_tag, ..] => {
Err(Box::new(InvalidNodeDecoding { unknown_tag: Some(*unknown_tag) }))
}
[] => Err(Box::new(InvalidNodeDecoding { unknown_tag: None })),
}
}
}

/// Error produced when the bytes of a stored node cannot be decoded.
///
/// The decoder builds it in two situations: the leading tag byte matches none
/// of the known node tags, or the byte slice is empty.
#[derive(Debug, thiserror::Error)]
pub struct InvalidNodeDecoding {
/// The unrecognized leading tag byte, or `None` when there were no bytes at all.
unknown_tag: Option<u8>,
}

/// Human-readable rendering of a node decoding failure.
///
/// The message names the offending tag byte when one was recorded, and
/// otherwise reports that the byte array was empty.
impl fmt::Display for InvalidNodeDecoding {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Guard clause: no tag recorded, so there is nothing to interpolate.
        let Some(unknown_tag) = self.unknown_tag else {
            return write!(f, "Invalid node decoding: empty array of bytes");
        };

        write!(f, "Invalid node decoding: unknown tag {unknown_tag}")
    }
}
106 changes: 63 additions & 43 deletions src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -286,53 +286,73 @@ impl<'t, D: Distance> Reader<'t, D> {
if self.items.is_empty() {
return Ok(Vec::new());
}
// Since the datastructure describes a kind of btree, the capacity is something in the order of:
// The number of root nodes + log2 of the total number of vectors.
let mut queue =
BinaryHeap::with_capacity(self.roots.len() + self.items.len().ilog2() as usize);
let search_k = opt.search_k.map_or(opt.count * self.roots.len(), NonZeroUsize::get);
let search_k = opt
.oversampling
.map_or(search_k.saturating_mul(D::DEFAULT_OVERSAMPLING), |oversampling| {
search_k.saturating_mul(oversampling.get())
});

// Insert all the root nodes and associate them to the highest distance.
queue.extend(repeat(OrderedFloat(f32::INFINITY)).zip(self.roots.iter().map(NodeId::tree)));
let candidates = opt.candidates.map(|candidates| candidates & &self.items);

let mut nns = Vec::new();
while nns.len() < search_k {
let (OrderedFloat(dist), item) = match queue.pop() {
Some(out) => out,
None => break,
};

let key = Key::new(self.index, item);
match self.database.get(rtxn, &key)?.ok_or(Error::missing_key(key))? {
Node::Leaf(_) => {
if opt.candidates.map_or(true, |c| c.contains(item.item)) {
nns.push(item.unwrap_item());
}
}
Node::Descendants(Descendants { descendants }) => {
if let Some(candidates) = opt.candidates {
nns.extend((descendants.into_owned() & candidates).iter());
} else {
nns.extend(descendants.iter());
let nns = match candidates {
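// When the candidates make up less than 50% of the database (ratio < 0.5 below), we don't
// use the trees and just sort all the candidates by hand.
// NOTE(review): this comment used to say "5% or less"; confirm whether 0.5 or 0.05 is intended.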
Some(candidates) if (candidates.len() as f32 / self.items.len() as f32) < 0.5 => {
candidates.iter().collect()
}
_ => {
// Since the datastructure describes a kind of btree, the capacity is something in the order of:
// The number of root nodes + log2 of the total number of vectors.
let mut queue =
BinaryHeap::with_capacity(self.roots.len() + self.items.len().ilog2() as usize);
let search_k = opt.search_k.map_or(opt.count * self.roots.len(), NonZeroUsize::get);
let search_k = opt
.oversampling
.map_or(search_k.saturating_mul(D::DEFAULT_OVERSAMPLING), |oversampling| {
search_k.saturating_mul(oversampling.get())
});

// Insert all the root nodes and associate them to the highest distance.
queue.extend(
repeat(OrderedFloat(f32::INFINITY)).zip(self.roots.iter().map(NodeId::tree)),
);

let mut nns = Vec::new();
while nns.len() < search_k {
let (OrderedFloat(dist), item) = match queue.pop() {
Some(out) => out,
None => break,
};

let key = Key::new(self.index, item);
match self.database.get(rtxn, &key)?.ok_or(Error::missing_key(key))? {
Node::Leaf(_) => {
if opt.candidates.is_none_or(|c| c.contains(item.item)) {
nns.push(item.unwrap_item());
}
}
Node::Descendants(Descendants { descendants }) => {
if let Some(candidates) = opt.candidates {
nns.extend((descendants.into_owned() & candidates).iter());
} else {
nns.extend(descendants.iter());
}
}
Node::SplitPlaneNormal(SplitPlaneNormal { normal, left, right }) => {
let margin = D::margin_no_header(&normal, &query_leaf.vector);
queue.push((
OrderedFloat(D::pq_distance(dist, margin, Side::Left)),
left,
));
queue.push((
OrderedFloat(D::pq_distance(dist, margin, Side::Right)),
right,
));
}
}
}
Node::SplitPlaneNormal(SplitPlaneNormal { normal, left, right }) => {
let margin = D::margin_no_header(&normal, &query_leaf.vector);
queue.push((OrderedFloat(D::pq_distance(dist, margin, Side::Left)), left));
queue.push((OrderedFloat(D::pq_distance(dist, margin, Side::Right)), right));
}
}
}

// Get distances for all items
// To avoid calculating distance multiple times for any items, sort by id and dedup by id.
nns.sort_unstable();
nns.dedup();
// Get distances for all items
// To avoid calculating distance multiple times for any items, sort by id and dedup by id.
nns.sort_unstable();
nns.dedup();
nns
}
};

let mut nns_distances = Vec::with_capacity(nns.len());
for nn in nns {
Expand Down
2 changes: 1 addition & 1 deletion src/tests/binary_quantized.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ fn write_and_retrieve_binary_quantized_vector() {
==================
Dumping index 0
Root: Metadata { dimensions: 16, items: RoaringBitmap<[0]>, roots: [0], distance: "binary quantized euclidean" }
Version: Version { major: 0, minor: 6, patch: 1 }
Version: Version { major: 0, minor: 6, patch: 3 }
Tree 0: Descendants(Descendants { descendants: [0] })
Item 0: Leaf(Leaf { header: NodeHeaderBinaryQuantizedEuclidean { bias: 0.0 }, vector: [-1.0000, -1.0000, 1.0000, -1.0000, 1.0000, 1.0000, -1.0000, 1.0000, -1.0000, -1.0000, "other ..."] })
"###);
Expand Down
Loading
Loading