Skip to content

Commit f3b594a

Browse files
authored
Merge pull request #127 from meilisearch/stop-pointing-to-items-in-split-nodes
Make split nodes only refer to tree nodes
2 parents 7e783f8 + 7639506 commit f3b594a

16 files changed

+1090
-525
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[package]
22
name = "arroy"
33
description = "Annoy-inspired Approximate Nearest Neighbors in Rust, based on LMDB and optimized for memory usage"
4-
version = "0.6.1"
4+
version = "0.7.0"
55
documentation = "https://docs.rs/arroy"
66
repository = "https://github.com/meilisearch/arroy"
77
keywords = ["ANN-search", "Graph-algorithms", "Vector-Search", "Store"]

src/error.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use std::io;
22

3-
use crate::{key::Key, node_id::NodeMode, ItemId};
3+
use crate::{key::Key, node_id::NodeMode, version::Version, ItemId};
44

55
/// The different set of errors that arroy can encounter.
66
#[derive(Debug, thiserror::Error)]
@@ -72,6 +72,13 @@ pub enum Error {
7272
/// The mode that couldn't be decoded.
7373
mode: NodeMode,
7474
},
75+
76+
/// Unknown version
77+
#[error("Unknown version: v{version}")]
78+
UnknownVersion {
79+
/// The version that is unknown.
80+
version: Version,
81+
},
7582
}
7683

7784
impl Error {

src/node.rs

Lines changed: 151 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@ use std::fmt;
33
use std::mem::size_of;
44

55
use bytemuck::{bytes_of, cast_slice, pod_read_unaligned};
6-
use byteorder::{ByteOrder, NativeEndian};
6+
use byteorder::{BigEndian, ByteOrder, NativeEndian};
77
use heed::{BoxedError, BytesDecode, BytesEncode};
88
use roaring::RoaringBitmap;
99

1010
use crate::distance::Distance;
11+
use crate::node_id::NodeId;
1112
use crate::unaligned_vector::UnalignedVector;
12-
use crate::{ItemId, NodeId};
13+
use crate::ItemId;
1314

1415
#[derive(Clone, Debug)]
1516
pub enum Node<'a, D: Distance> {
@@ -18,6 +19,15 @@ pub enum Node<'a, D: Distance> {
1819
SplitPlaneNormal(SplitPlaneNormal<'a, D>),
1920
}
2021

22+
/// A node generic over the version of the database.
23+
/// Should only be used while reading from the database.
24+
#[derive(Clone, Debug)]
25+
pub enum GenericReadNode<'a, D: Distance> {
26+
Leaf(Leaf<'a, D>),
27+
Descendants(Descendants<'a>),
28+
SplitPlaneNormal(GenericReadSplitPlaneNormal<'a, D>),
29+
}
30+
2131
const LEAF_TAG: u8 = 0;
2232
const DESCENDANTS_TAG: u8 = 1;
2333
const SPLIT_PLANE_NORMAL_TAG: u8 = 2;
@@ -113,8 +123,8 @@ impl fmt::Debug for ItemIds<'_> {
113123
}
114124

115125
pub struct SplitPlaneNormal<'a, D: Distance> {
116-
pub left: NodeId,
117-
pub right: NodeId,
126+
pub left: ItemId,
127+
pub right: ItemId,
118128
pub normal: Option<Cow<'a, UnalignedVector<D::VectorCodec>>>,
119129
}
120130

@@ -138,6 +148,35 @@ impl<D: Distance> Clone for SplitPlaneNormal<'_, D> {
138148
}
139149
}
140150

151+
pub struct GenericReadSplitPlaneNormal<'a, D: Distance> {
152+
// Before version 0.7.0 the children of a split plane normal were stored as `NodeId`s and could point directly to items.
153+
pub left: NodeId,
154+
pub right: NodeId,
155+
// In older versions (at least up to v0.5.0), instead of storing `None` for a missing normal, we were
156+
// storing a vector filled with zeros, that will be overwritten while creating this type.
157+
pub normal: Option<Cow<'a, UnalignedVector<D::VectorCodec>>>,
158+
}
159+
160+
impl<D: Distance> fmt::Debug for GenericReadSplitPlaneNormal<'_, D> {
161+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
162+
let name = format!("GenericReadSplitPlaneNormal<{}>", D::name());
163+
let mut debug = f.debug_struct(&name);
164+
165+
debug.field("left", &self.left).field("right", &self.right);
166+
match &self.normal {
167+
Some(normal) => debug.field("normal", &normal),
168+
None => debug.field("normal", &"none"),
169+
};
170+
debug.finish()
171+
}
172+
}
173+
174+
impl<D: Distance> Clone for GenericReadSplitPlaneNormal<'_, D> {
175+
fn clone(&self) -> Self {
176+
Self { left: self.left, right: self.right, normal: self.normal.clone() }
177+
}
178+
}
179+
141180
/// The codec used internally to encode and decode nodes.
142181
pub struct NodeCodec<D>(D);
143182

@@ -154,8 +193,8 @@ impl<'a, D: Distance> BytesEncode<'a> for NodeCodec<D> {
154193
}
155194
Node::SplitPlaneNormal(SplitPlaneNormal { normal, left, right }) => {
156195
bytes.push(SPLIT_PLANE_NORMAL_TAG);
157-
bytes.extend_from_slice(&left.to_bytes());
158-
bytes.extend_from_slice(&right.to_bytes());
196+
bytes.extend_from_slice(&left.to_be_bytes());
197+
bytes.extend_from_slice(&right.to_be_bytes());
159198
if let Some(normal) = normal {
160199
bytes.extend_from_slice(normal.as_bytes());
161200
}
@@ -182,8 +221,10 @@ impl<'a, D: Distance> BytesDecode<'a> for NodeCodec<D> {
182221
Ok(Node::Leaf(Leaf { header, vector }))
183222
}
184223
[SPLIT_PLANE_NORMAL_TAG, bytes @ ..] => {
185-
let (left, bytes) = NodeId::from_bytes(bytes);
186-
let (right, bytes) = NodeId::from_bytes(bytes);
224+
let left = BigEndian::read_u32(bytes);
225+
let bytes = &bytes[std::mem::size_of_val(&left)..];
226+
let right = BigEndian::read_u32(bytes);
227+
let bytes = &bytes[std::mem::size_of_val(&right)..];
187228
let normal = if bytes.is_empty() {
188229
None
189230
} else {
@@ -194,7 +235,108 @@ impl<'a, D: Distance> BytesDecode<'a> for NodeCodec<D> {
194235
[DESCENDANTS_TAG, bytes @ ..] => Ok(Node::Descendants(Descendants {
195236
descendants: Cow::Owned(RoaringBitmap::deserialize_from(bytes)?),
196237
})),
197-
unknown => panic!("What the fuck is an {unknown:?}"),
238+
unknown => panic!(
239+
"Did not recognize node tag type: {unknown:?} while decoding a node from v0.7.0"
240+
),
241+
}
242+
}
243+
}
244+
245+
/// The codec used internally during read operations to decode nodes stored in the v0.4.0 format into a common interface.
246+
pub struct GenericReadNodeCodecFromV0_4_0<D>(D);
247+
248+
impl<'a, D: Distance> BytesDecode<'a> for GenericReadNodeCodecFromV0_4_0<D> {
249+
type DItem = GenericReadNode<'a, D>;
250+
251+
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
252+
match bytes {
253+
[LEAF_TAG, bytes @ ..] => {
254+
let (header_bytes, remaining) = bytes.split_at(size_of::<D::Header>());
255+
let header = pod_read_unaligned(header_bytes);
256+
let vector = UnalignedVector::<D::VectorCodec>::from_bytes(remaining)?;
257+
258+
Ok(GenericReadNode::Leaf(Leaf { header, vector }))
259+
}
260+
[SPLIT_PLANE_NORMAL_TAG, bytes @ ..] => {
261+
// From v0.4.0 to v0.5.0 included, the children were stored as `NodeId` and could point directly to items.
262+
let (left, bytes) = NodeId::from_bytes(bytes);
263+
let (right, bytes) = NodeId::from_bytes(bytes);
264+
// And the normal could not be null, but it could be a vector filled with zeros.
265+
let normal = UnalignedVector::<D::VectorCodec>::from_bytes(bytes)?;
266+
let normal = if normal.is_zero() {
267+
None
268+
} else {
269+
Some(normal)
270+
};
271+
Ok(GenericReadNode::SplitPlaneNormal(GenericReadSplitPlaneNormal { normal, left, right }))
272+
}
273+
[DESCENDANTS_TAG, bytes @ ..] => Ok(GenericReadNode::Descendants(Descendants {
274+
descendants: Cow::Owned(RoaringBitmap::deserialize_from(bytes)?),
275+
})),
276+
unknown => panic!("Did not recognize node tag type: {unknown:?} while decoding a generic read node from v0.4.0"),
277+
}
278+
}
279+
}
280+
281+
/// The codec used internally during read operations to decode nodes stored in the v0.7.0 format into a common interface.
282+
pub struct GenericReadNodeCodecFromV0_7_0<D>(D);
283+
284+
impl<'a, D: Distance> BytesDecode<'a> for GenericReadNodeCodecFromV0_7_0<D> {
285+
type DItem = GenericReadNode<'a, D>;
286+
287+
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
288+
NodeCodec::bytes_decode(bytes).map(|node| match node {
289+
Node::SplitPlaneNormal(split_plane_normal) => {
290+
GenericReadNode::SplitPlaneNormal(GenericReadSplitPlaneNormal {
291+
// From v0.7.0 the children of a split plane normal always point to a tree node.
292+
left: NodeId::tree(split_plane_normal.left),
293+
right: NodeId::tree(split_plane_normal.right),
294+
normal: split_plane_normal.normal,
295+
})
296+
}
297+
Node::Descendants(descendants) => GenericReadNode::Descendants(descendants),
298+
Node::Leaf(leaf) => GenericReadNode::Leaf(leaf),
299+
})
300+
}
301+
}
302+
303+
/// The codec used internally during upgrades to encode generic read nodes into the v0.5.0 format.
304+
pub struct WriteNodeCodecForV0_5_0<D>(D);
305+
306+
impl<'a, D: Distance> BytesEncode<'a> for WriteNodeCodecForV0_5_0<D> {
307+
// Since the dimension of the vector has been lost while converting to a generic node, we need to get it back.
308+
type EItem = (GenericReadNode<'a, D>, usize);
309+
310+
fn bytes_encode(item: &Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
311+
// It's ok to clone and be slow because that only happens once when upgrading from v0.4.0 to v0.5.0.
312+
match &item.0 {
313+
// The leaf didn't change between v0.4.0 and today.
314+
GenericReadNode::Leaf(leaf) => {
315+
Ok(NodeCodec::bytes_encode(&Node::Leaf(leaf.clone()))?.into_owned().into())
316+
}
317+
// The descendants didn't change between v0.4.0 and today.
318+
GenericReadNode::Descendants(descendants) => {
319+
Ok(NodeCodec::bytes_encode(&Node::<D>::Descendants(descendants.clone()))?
320+
.into_owned()
321+
.into())
322+
}
323+
GenericReadNode::SplitPlaneNormal(GenericReadSplitPlaneNormal {
324+
left,
325+
right,
326+
normal,
327+
}) => {
328+
// Original code at: https://github.com/meilisearch/arroy/blob/5b748bac2c69c65a97980901b02067a3a545e357/src/node.rs#L152-L157
329+
let mut bytes = Vec::new();
330+
bytes.push(SPLIT_PLANE_NORMAL_TAG);
331+
bytes.extend_from_slice(&left.to_bytes());
332+
bytes.extend_from_slice(&right.to_bytes());
333+
match normal {
334+
Some(normal) => bytes.extend_from_slice(normal.as_bytes()),
335+
// If the normal is None, we need to write a vector filled with zeros.
336+
None => bytes.extend_from_slice(&vec![0; item.1]),
337+
}
338+
Ok(Cow::Owned(bytes))
339+
}
198340
}
199341
}
200342
}

src/parallel.rs

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ use roaring::{RoaringBitmap, RoaringTreemap};
1616
use crate::internals::{KeyCodec, Leaf, NodeCodec};
1717
use crate::key::{Key, Prefix, PrefixCodec};
1818
use crate::node::{Node, SplitPlaneNormal};
19-
use crate::node_id::NodeMode;
2019
use crate::{Database, Distance, Error, ItemId, Result};
2120

2221
/// A structure to store the tree nodes out of the heed database.
@@ -489,13 +488,8 @@ impl<'t, D: Distance> ImmutableTrees<'t, D> {
489488
}
490489
Node::SplitPlaneNormal(SplitPlaneNormal { left, right, normal: _ }) => {
491490
trees.insert(current, (bytes.len(), bytes.as_ptr()));
492-
// We must avoid the items and only push the tree nodes
493-
if left.mode == NodeMode::Tree {
494-
explore.push(left.item);
495-
}
496-
if right.mode == NodeMode::Tree {
497-
explore.push(right.item);
498-
}
491+
explore.push(left);
492+
explore.push(right);
499493
}
500494
}
501495
}

0 commit comments

Comments
 (0)