diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 654d4dc..e50f36d 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -2,9 +2,9 @@ name: Rust CI
 
 on:
   push:
-    branches: ["main"]
+    branches: ["main", "release-v0.6"]
   pull_request:
-    branches: ["main"]
+    branches: ["main", "release-v0.6"]
 
 env:
   CARGO_TERM_COLOR: always
@@ -17,14 +17,14 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, macos-latest-xlarge, windows-latest]
+        os: [ubuntu-latest, macos-latest, windows-latest]
         rust:
           - stable
           - beta
     steps:
       - uses: actions/checkout@v1
-      - uses: dtolnay/rust-toolchain@1.81
+      - uses: dtolnay/rust-toolchain@1.85
       - uses: actions-rs/cargo@v1
         with:
           command: build
@@ -41,7 +41,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v1
-      - uses: dtolnay/rust-toolchain@1.81
+      - uses: dtolnay/rust-toolchain@1.85
         with:
           profile: minimal
          components: clippy, rustfmt
diff --git a/Cargo.toml b/Cargo.toml
index 209d24b..19aa205 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "arroy"
 description = "Annoy-inspired Approximate Nearest Neighbors in Rust, based on LMDB and optimized for memory usage"
-version = "0.6.1"
+version = "0.6.3"
 documentation = "https://docs.rs/arroy"
 repository = "https://github.com/meilisearch/arroy"
 keywords = ["ANN-search", "Graph-algorithms", "Vector-Search", "Store"]
@@ -31,9 +31,11 @@ enum-iterator = "2.1.0"
 
 [dev-dependencies]
 anyhow = "1.0.95"
+approx = "0.5.1"
 arbitrary = { version = "1.4.1", features = ["derive"] }
 clap = { version = "4.5.24", features = ["derive"] }
 env_logger = "0.11.6"
+hannoy = "0.0.4"
 insta = "1.42.0"
 instant-distance = "0.6.1"
 proptest = "1.6.0"
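The two new dev-dependencies above are exercised by the `convert_from_hannoy_to_arroy` test further down: `hannoy` provides the source index to convert, and `approx` compares the round-tripped distance against zero with a tolerance instead of `==`. A minimal sketch of that assertion style (the value is illustrative, not taken from the test):

```rust
// Floating-point distances survive the hannoy -> arroy round trip with tiny
// numerical noise, so the test uses approx's tolerance-based assertion.
fn main() {
    let found_distance = 1.0e-7_f32; // illustrative noise, not a real measurement

    // Passes when |found_distance - 0.0| <= epsilon (default: f32::EPSILON).
    approx::assert_abs_diff_eq!(found_distance, 0.0);

    // An exact comparison would be brittle here:
    // assert_eq!(found_distance, 0.0); // would fail
}
```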
diff --git a/src/node.rs b/src/node.rs
index 9ae866a..c01abcb 100644
--- a/src/node.rs
+++ b/src/node.rs
@@ -188,7 +188,24 @@ impl<'a, D: Distance> BytesDecode<'a> for NodeCodec<D> {
             [DESCENDANTS_TAG, bytes @ ..] => Ok(Node::Descendants(Descendants {
                 descendants: Cow::Owned(RoaringBitmap::deserialize_from(bytes)?),
             })),
-            unknown => panic!("What the fuck is an {unknown:?}"),
+            [unknown_tag, ..] => {
+                Err(Box::new(InvalidNodeDecoding { unknown_tag: Some(*unknown_tag) }))
+            }
+            [] => Err(Box::new(InvalidNodeDecoding { unknown_tag: None })),
         }
     }
 }
+
+#[derive(Debug, thiserror::Error)]
+pub struct InvalidNodeDecoding {
+    unknown_tag: Option<u8>,
+}
+
+impl fmt::Display for InvalidNodeDecoding {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self.unknown_tag {
+            Some(unknown_tag) => write!(f, "Invalid node decoding: unknown tag {unknown_tag}"),
+            None => write!(f, "Invalid node decoding: empty array of bytes"),
+        }
+    }
+}
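For context, heed's `BytesDecode::bytes_decode` returns a `Result` with a boxed error, so a corrupted entry now surfaces to callers (e.g. through `database.get`) as a decoding error instead of aborting the process. A standalone sketch of the new contract, with stand-in tags and types rather than arroy's real ones:

```rust
use std::error::Error;

type BoxedError = Box<dyn Error + Send + Sync + 'static>;

#[derive(Debug, PartialEq)]
enum Node {
    Leaf,
    SplitPlaneNormal,
    Descendants,
}

// The literal tags are stand-ins for LEAF_TAG, SPLIT_PLANE_NORMAL_TAG and
// DESCENDANTS_TAG; only the error-instead-of-panic shape matters here.
fn bytes_decode(bytes: &[u8]) -> Result<Node, BoxedError> {
    match bytes {
        [0, ..] => Ok(Node::Leaf),
        [1, ..] => Ok(Node::SplitPlaneNormal),
        [2, ..] => Ok(Node::Descendants),
        [tag, ..] => Err(format!("Invalid node decoding: unknown tag {tag}").into()),
        [] => Err("Invalid node decoding: empty array of bytes".into()),
    }
}

fn main() {
    assert_eq!(bytes_decode(&[0, 42]).unwrap(), Node::Leaf);
    // A corrupted entry is now an error the caller can handle, not a panic.
    assert!(bytes_decode(&[9]).is_err());
    assert!(bytes_decode(&[]).is_err());
}
```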
diff --git a/src/reader.rs b/src/reader.rs
index 7b09db6..fb1a275 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -286,53 +286,73 @@ impl<'t, D: Distance> Reader<'t, D> {
         if self.items.is_empty() {
             return Ok(Vec::new());
         }
-        // Since the datastructure describes a kind of btree, the capacity is something in the order of:
-        // The number of root nodes + log2 of the total number of vectors.
-        let mut queue =
-            BinaryHeap::with_capacity(self.roots.len() + self.items.len().ilog2() as usize);
-        let search_k = opt.search_k.map_or(opt.count * self.roots.len(), NonZeroUsize::get);
-        let search_k = opt
-            .oversampling
-            .map_or(search_k.saturating_mul(D::DEFAULT_OVERSAMPLING), |oversampling| {
-                search_k.saturating_mul(oversampling.get())
-            });
-
-        // Insert all the root nodes and associate them to the highest distance.
-        queue.extend(repeat(OrderedFloat(f32::INFINITY)).zip(self.roots.iter().map(NodeId::tree)));
-
-        let mut nns = Vec::new();
-        while nns.len() < search_k {
-            let (OrderedFloat(dist), item) = match queue.pop() {
-                Some(out) => out,
-                None => break,
-            };
-
-            let key = Key::new(self.index, item);
-            match self.database.get(rtxn, &key)?.ok_or(Error::missing_key(key))? {
-                Node::Leaf(_) => {
-                    if opt.candidates.map_or(true, |c| c.contains(item.item)) {
-                        nns.push(item.unwrap_item());
-                    }
-                }
-                Node::Descendants(Descendants { descendants }) => {
-                    if let Some(candidates) = opt.candidates {
-                        nns.extend((descendants.into_owned() & candidates).iter());
-                    } else {
-                        nns.extend(descendants.iter());
-                    }
-                }
-                Node::SplitPlaneNormal(SplitPlaneNormal { normal, left, right }) => {
-                    let margin = D::margin_no_header(&normal, &query_leaf.vector);
-                    queue.push((OrderedFloat(D::pq_distance(dist, margin, Side::Left)), left));
-                    queue.push((OrderedFloat(D::pq_distance(dist, margin, Side::Right)), right));
-                }
-            }
-        }
-
-        // Get distances for all items
-        // To avoid calculating distance multiple times for any items, sort by id and dedup by id.
-        nns.sort_unstable();
-        nns.dedup();
+        let candidates = opt.candidates.map(|candidates| candidates & &self.items);
+
+        let nns = match candidates {
+            // When we're filtering on less than 5% of the database we don't use the trees and
+            // just sort every candidate by hand
+            Some(candidates) if (candidates.len() as f32 / self.items.len() as f32) < 0.05 => {
+                candidates.iter().collect()
+            }
+            _ => {
+                // Since the datastructure describes a kind of btree, the capacity is something in the order of:
+                // The number of root nodes + log2 of the total number of vectors.
+                let mut queue =
+                    BinaryHeap::with_capacity(self.roots.len() + self.items.len().ilog2() as usize);
+                let search_k = opt.search_k.map_or(opt.count * self.roots.len(), NonZeroUsize::get);
+                let search_k = opt
+                    .oversampling
+                    .map_or(search_k.saturating_mul(D::DEFAULT_OVERSAMPLING), |oversampling| {
+                        search_k.saturating_mul(oversampling.get())
+                    });
+
+                // Insert all the root nodes and associate them to the highest distance.
+                queue.extend(
+                    repeat(OrderedFloat(f32::INFINITY)).zip(self.roots.iter().map(NodeId::tree)),
+                );
+
+                let mut nns = Vec::new();
+                while nns.len() < search_k {
+                    let (OrderedFloat(dist), item) = match queue.pop() {
+                        Some(out) => out,
+                        None => break,
+                    };
+
+                    let key = Key::new(self.index, item);
+                    match self.database.get(rtxn, &key)?.ok_or(Error::missing_key(key))? {
+                        Node::Leaf(_) => {
+                            if opt.candidates.is_none_or(|c| c.contains(item.item)) {
+                                nns.push(item.unwrap_item());
+                            }
+                        }
+                        Node::Descendants(Descendants { descendants }) => {
+                            if let Some(candidates) = opt.candidates {
+                                nns.extend((descendants.into_owned() & candidates).iter());
+                            } else {
+                                nns.extend(descendants.iter());
+                            }
+                        }
+                        Node::SplitPlaneNormal(SplitPlaneNormal { normal, left, right }) => {
+                            let margin = D::margin_no_header(&normal, &query_leaf.vector);
+                            queue.push((
+                                OrderedFloat(D::pq_distance(dist, margin, Side::Left)),
+                                left,
+                            ));
+                            queue.push((
+                                OrderedFloat(D::pq_distance(dist, margin, Side::Right)),
+                                right,
+                            ));
+                        }
+                    }
+                }
+
+                // Get distances for all items
+                // To avoid calculating distance multiple times for any items, sort by id and dedup by id.
+                nns.sort_unstable();
+                nns.dedup();
+                nns
+            }
+        };
 
         let mut nns_distances = Vec::with_capacity(nns.len());
         for nn in nns {
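The new fast path above trades tree traversal for a brute-force scan when the filter is very selective: with few enough candidates it is cheaper to compute their distances directly than to descend every tree. A hedged sketch of that decision outside of arroy, using the same 5% threshold:

```rust
use roaring::RoaringBitmap;

/// Returns true when the filtered candidates cover less than 5% of the
/// indexed items, i.e. when scanning them directly beats the trees.
fn use_brute_force(candidates: &RoaringBitmap, items: &RoaringBitmap) -> bool {
    // Restrict the filter to items that actually exist in the index first,
    // like `opt.candidates.map(|candidates| candidates & &self.items)` above.
    let filtered = candidates & items;
    (filtered.len() as f32 / items.len() as f32) < 0.05
}

fn main() {
    let items: RoaringBitmap = (0..1_000).collect();
    let narrow: RoaringBitmap = (0..30).collect(); // 3% of the database
    let broad: RoaringBitmap = (0..500).collect(); // 50% of the database
    assert!(use_brute_force(&narrow, &items));
    assert!(!use_brute_force(&broad, &items));
}
```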
diff --git a/src/tests/binary_quantized.rs b/src/tests/binary_quantized.rs
index 068b4ca..10f41ee 100644
--- a/src/tests/binary_quantized.rs
+++ b/src/tests/binary_quantized.rs
@@ -48,7 +48,7 @@ fn write_and_retrieve_binary_quantized_vector() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 16, items: RoaringBitmap<[0]>, roots: [0], distance: "binary quantized euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0] })
     Item 0: Leaf(Leaf { header: NodeHeaderBinaryQuantizedEuclidean { bias: 0.0 }, vector: [-1.0000, -1.0000, 1.0000, -1.0000, 1.0000, 1.0000, -1.0000, 1.0000, -1.0000, -1.0000, "other ..."] })
     "###);
diff --git a/src/tests/writer.rs b/src/tests/writer.rs
index 8fd3404..56b3ba6 100644
--- a/src/tests/writer.rs
+++ b/src/tests/writer.rs
@@ -39,6 +39,67 @@ fn clear_small_database() {
     wtxn.commit().unwrap();
 }
 
+#[test]
+fn convert_from_hannoy_to_arroy() {
+    let _ = rayon::ThreadPoolBuilder::new().num_threads(1).build_global();
+    let dir = tempfile::tempdir().unwrap();
+    let env = unsafe { heed::EnvOpenOptions::new().map_size(200 * 1024 * 1024).open(dir.path()) }
+        .unwrap();
+    let mut wtxn = env.write_txn().unwrap();
+    let database: hannoy::Database<hannoy::distances::Cosine> =
+        env.create_database(&mut wtxn, None).unwrap();
+    wtxn.commit().unwrap();
+
+    let mut rng = rng();
+    let mut wtxn = env.write_txn().unwrap();
+
+    let mut db_indexes: Vec<u16> = (0..10).collect();
+    db_indexes.shuffle(&mut rng);
+
+    for index in db_indexes.iter().copied() {
+        let writer = hannoy::Writer::new(database, index, 1024);
+
+        // We're going to write 100 vectors per index
+        for i in 0..100 {
+            let vector: [f32; 1024] = std::array::from_fn(|_| rng.gen());
+            writer.add_item(&mut wtxn, i, &vector).unwrap();
+        }
+        writer.builder(&mut rng).build::<16, 32>(&mut wtxn).unwrap();
+    }
+    wtxn.commit().unwrap();
+
+    // Now it's time to convert the indexes
+
+    let mut wtxn = env.write_txn().unwrap();
+    let rtxn = env.read_txn().unwrap();
+    let database: crate::Database<Cosine> = env.open_database(&mut wtxn, None).unwrap().unwrap();
+
+    db_indexes.shuffle(&mut rng);
+
+    for index in db_indexes {
+        let pre_commit_hannoy_reader =
+            hannoy::Reader::<hannoy::distances::Cosine>::open(&rtxn, index, database.remap_types())
+                .unwrap();
+
+        let writer = Writer::new(database, index, pre_commit_hannoy_reader.dimensions());
+        let mut builder = writer.builder(&mut rng);
+        builder.prepare_hannoy_conversion(&mut wtxn).unwrap();
+        assert!(writer.need_build(&mut wtxn).unwrap());
+        builder.build(&mut wtxn).unwrap();
+
+        for result in pre_commit_hannoy_reader.iter(&rtxn).unwrap() {
+            let (item_id, vector) = result.unwrap();
+            let reader = Reader::open(&wtxn, index, database).unwrap();
+            assert_eq!(reader.item_vector(&wtxn, item_id).unwrap().as_deref(), Some(&vector[..]));
+            let mut found = reader.nns(1).by_vector(&wtxn, &vector).unwrap();
+            let (found_item_id, found_distance) = found.pop().unwrap();
+            assert_eq!(found_item_id, item_id);
+            approx::assert_abs_diff_eq!(found_distance, 0.0);
+        }
+    }
+}
+
 #[test]
 fn use_u32_max_minus_one_for_a_vec() {
     let handle = create_database::<Euclidean>();
@@ -53,7 +114,7 @@ fn use_u32_max_minus_one_for_a_vec() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 3, items: RoaringBitmap<[4294967294]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [4294967294] })
     Item 4294967294: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 1.0000, 2.0000] })
     "###);
@@ -73,7 +134,7 @@ fn use_u32_max_for_a_vec() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 3, items: RoaringBitmap<[4294967295]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [4294967295] })
     Item 4294967295: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 1.0000, 2.0000] })
     "###);
@@ -93,7 +154,7 @@ fn write_one_vector() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 3, items: RoaringBitmap<[0]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0] })
     Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 1.0000, 2.0000] })
     "###);
@@ -113,7 +174,7 @@ fn write_one_vector_in_one_tree() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 3, items: RoaringBitmap<[0]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0] })
     Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 1.0000, 2.0000] })
     "###);
@@ -133,7 +194,7 @@ fn write_one_vector_in_multiple_trees() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 3, items: RoaringBitmap<[0]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0] })
     Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 1.0000, 2.0000] })
     "###);
@@ -157,7 +218,7 @@ fn write_vectors_until_there_is_a_descendants() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 3, items: RoaringBitmap<[0, 1, 2]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0, 1, 2] })
     Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 0.0000, 0.0000] })
     Item 1: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [1.0000, 1.0000, 1.0000] })
@@ -235,31 +296,31 @@ fn write_multiple_indexes() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 3, items: RoaringBitmap<[0]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0] })
     Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 1.0000, 2.0000] })
     ==================
     Dumping index 1
     Root: Metadata { dimensions: 3, items: RoaringBitmap<[0]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0] })
     Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 1.0000, 2.0000] })
     ==================
     Dumping index 2
     Root: Metadata { dimensions: 3, items: RoaringBitmap<[0]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0] })
     Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 1.0000, 2.0000] })
     ==================
     Dumping index 3
     Root: Metadata { dimensions: 3, items: RoaringBitmap<[0]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0] })
     Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 1.0000, 2.0000] })
     ==================
     Dumping index 4
     Root: Metadata { dimensions: 3, items: RoaringBitmap<[0]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0] })
     Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 1.0000, 2.0000] })
     "###);
@@ -359,7 +420,7 @@ fn delete_one_item_in_a_one_item_db() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 2, items: RoaringBitmap<[0]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0] })
     Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 0.0000] })
     "###);
@@ -376,7 +437,7 @@ fn delete_one_item_in_a_one_item_db() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 2, items: RoaringBitmap<[]>, roots: [], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     "###);
 
     let rtxn = handle.env.read_txn().unwrap();
@@ -402,7 +463,7 @@ fn delete_document_in_an_empty_index_74() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 2, items: RoaringBitmap<[0]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0] })
     Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 0.0000] })
     "###);
@@ -428,11 +489,11 @@ fn delete_document_in_an_empty_index_74() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 2, items: RoaringBitmap<[]>, roots: [], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     ==================
     Dumping index 1
     Root: Metadata { dimensions: 2, items: RoaringBitmap<[]>, roots: [], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     "###);
 
     let rtxn = handle.env.read_txn().unwrap();
@@ -458,7 +519,7 @@ fn delete_one_item_in_a_descendant() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0, 1] })
     Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 0.0000] })
     Item 1: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [1.0000, 0.0000] })
@@ -476,7 +537,7 @@ fn delete_one_item_in_a_descendant() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 2, items: RoaringBitmap<[1]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [1] })
     Item 1: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [1.0000, 0.0000] })
     "###);
@@ -520,7 +581,7 @@ fn delete_one_leaf_in_a_split() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 2, items: RoaringBitmap<[1, 2]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [1, 2] })
     Item 1: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [1.0000, 0.0000] })
     Item 2: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [2.0000, 0.0000] })
@@ -543,7 +604,7 @@ fn delete_one_item_in_a_single_document_database() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 2, items: RoaringBitmap<[0]>, roots: [0], distance: "cosine" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0] })
     Item 0: Leaf(Leaf { header: NodeHeaderCosine { norm: 0.0 }, vector: [0.0000, 0.0000] })
     "###);
@@ -560,7 +621,7 @@ fn delete_one_item_in_a_single_document_database() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 2, items: RoaringBitmap<[]>, roots: [], distance: "cosine" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     "###);
 }
@@ -654,7 +715,7 @@ fn add_one_item_incrementally_in_an_empty_db() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 2, items: RoaringBitmap<[]>, roots: [], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     "###);
 
     let mut wtxn = handle.env.write_txn().unwrap();
@@ -667,7 +728,7 @@ fn add_one_item_incrementally_in_an_empty_db() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 2, items: RoaringBitmap<[0]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0] })
     Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 0.0000] })
     "###);
@@ -687,7 +748,7 @@ fn add_one_item_incrementally_in_a_one_item_db() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 2, items: RoaringBitmap<[0]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0] })
     Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 0.0000] })
     "###);
@@ -702,7 +763,7 @@ fn add_one_item_incrementally_in_a_one_item_db() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0, 1] })
     Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 0.0000] })
     Item 1: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [1.0000, 0.0000] })
@@ -724,7 +785,7 @@ fn add_one_item_incrementally_to_create_a_split_node() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0, 1] })
     Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 0.0000] })
     Item 1: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [1.0000, 0.0000] })
@@ -740,7 +801,7 @@ fn add_one_item_incrementally_to_create_a_split_node() {
     ==================
     Dumping index 0
     Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1, 2]>, roots: [2], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 1: Descendants(Descendants { descendants: [1, 2] })
     Tree 2: SplitPlaneNormal(SplitPlaneNormal { left: Item(0), right: Tree(1), normal: [1.0000, 0.0000] })
     Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 0.0000] })
@@ -1051,7 +1112,7 @@ fn append() {
     ==================
     Dumping index 1
     Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1]>, roots: [0], distance: "euclidean" }
-    Version: Version { major: 0, minor: 6, patch: 1 }
+    Version: Version { major: 0, minor: 6, patch: 3 }
     Tree 0: Descendants(Descendants { descendants: [0, 1] })
     Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 0.0000] })
     Item 1: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.1000, 0.1000] })
diff --git a/src/writer.rs b/src/writer.rs
index 3d5b8aa..d8070ca 100644
--- a/src/writer.rs
+++ b/src/writer.rs
@@ -8,7 +8,7 @@ use std::sync::Arc;
 use heed::types::{Bytes, DecodeIgnore, Unit};
 use heed::{MdbError, PutFlags, RoTxn, RwTxn};
 use rand::{Rng, SeedableRng};
-use rayon::iter::repeatn;
+use rayon::iter::repeat_n;
 use rayon::prelude::*;
 use roaring::RoaringBitmap;
 
@@ -81,6 +81,7 @@ pub enum MainStep {
     WritingNodesToDatabase,
     DeleteExtraneousTrees,
     WriteTheMetadata,
+    ConvertingHannoyToArroy,
 }
 
 /// The options available when building the arroy database.
@@ -243,6 +244,11 @@ impl<'a, D: Distance, R: Rng + SeedableRng> ArroyBuilder<'a, D, R> {
     pub fn build(&mut self, wtxn: &mut RwTxn) -> Result<()> {
         self.writer.build(wtxn, self.rng, &self.inner)
     }
+
+    /// Prepares the conversion from a hannoy database into an arroy one.
+    pub fn prepare_hannoy_conversion(&self, wtxn: &mut RwTxn) -> Result<()> {
+        self.writer.prepare_hannoy_conversion(wtxn, &self.inner)
+    }
 }
 
 /// A writer to store new items, remove existing ones,
@@ -264,6 +270,60 @@ impl<D: Distance> Writer<D> {
         Writer { database, index, dimensions, tmpdir: None }
     }
 
+    /// After opening a hannoy database, this function prepares it for conversion:
+    /// it cleans up the hannoy database and keeps only the item (vector) entries.
+    fn prepare_hannoy_conversion(&self, wtxn: &mut RwTxn, options: &BuildOption) -> Result<()> {
+        tracing::debug!("Preparing dumpless upgrade from hannoy to arroy");
+        (options.progress)(WriterProgress { main: MainStep::ConvertingHannoyToArroy, sub: None });
+
+        let mut iter = self
+            .database
+            .remap_key_type::<PrefixCodec>()
+            .prefix_iter_mut(wtxn, &Prefix::all(self.index))?
+            .remap_key_type::<KeyCodec>();
+
+        let mut new_items = RoaringBitmap::new();
+        while let Some(result) = iter.next() {
+            match result {
+                Ok((
+                    Key { index: _, node: NodeId { mode: NodeMode::Item, item, .. }, .. },
+                    Node::Leaf(Leaf { header: _, vector }),
+                )) => {
+                    // We only keep the entries that decode as node items (vectors) and
+                    // mark them as newly inserted so that the Writer::build method can
+                    // index them into the trees.
+                    new_items.insert(item);
+                    if vector.len() != self.dimensions {
+                        return Err(Error::InvalidVecDimension {
+                            expected: self.dimensions,
+                            received: vector.len(),
+                        });
+                    }
+                }
+                Ok((Key { .. }, _)) | Err(heed::Error::Decoding(_)) => unsafe {
+                    // Every other entry, and any entry that fails to decode, is not an
+                    // item, is useless for the conversion, and is therefore deleted.
+                    iter.del_current()?;
+                },
+                // Any other error (LMDB...) is returned as-is.
+                Err(e) => return Err(e.into()),
+            }
+        }
+
+        drop(iter);
+
+        // We mark all the items as updated so that
+        // the Writer::build method can handle them.
+        for item in new_items {
+            self.database.remap_data_type::<Unit>().put(
+                wtxn,
+                &Key::updated(self.index, item),
+                &(),
+            )?;
+        }
+
+        Ok(())
+    }
+
     /// Returns a writer after having deleted the tree nodes and rewrote all the items
     /// for the new [`Distance`] format to be able to modify items safely.
     pub fn prepare_changing_distance<ND: Distance>(self, wtxn: &mut RwTxn) -> Result<Writer<ND>> {
@@ -756,7 +816,7 @@ impl<D: Distance> Writer<D> {
     ) -> Result<(Vec, Vec)> {
         let roots: Vec<_> = metadata.roots.iter().collect();
 
-        repeatn(rng.next_u64(), metadata.roots.len())
+        repeat_n(rng.next_u64(), metadata.roots.len())
             .zip(roots)
             .map(|(seed, root)| {
                 tracing::debug!("started updating tree {root:X}...");
@@ -982,7 +1042,7 @@
         let n_items = item_indices.len();
         let concurrent_node_ids = frozen_reader.concurrent_node_ids;
 
-        repeatn(rng.next_u64(), n_trees.unwrap_or(usize::MAX))
+        repeat_n(rng.next_u64(), n_trees.unwrap_or(usize::MAX))
            .enumerate()
            // Stop generating trees once the specified number of tree nodes are generated
            // but continue to generate trees if the number of trees is unspecified
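Taken together, the dumpless-upgrade path for one index is: open the LMDB database with arroy's types, strip hannoy's graph entries while keeping the raw vectors, then run a regular build. An end-to-end sketch under assumptions mirroring the test above (the `Cosine` distance, fixed seed, and 1024 dimensions are illustrative, not mandated by the API):

```rust
use arroy::distances::Cosine;
use arroy::{Database, Result, Writer};
use rand::rngs::StdRng;
use rand::SeedableRng;

/// Converts one hannoy index stored in `database` into an arroy index, in place.
fn convert_index(env: &heed::Env, database: Database<Cosine>, index: u16) -> Result<()> {
    let mut wtxn = env.write_txn()?;
    let mut rng = StdRng::seed_from_u64(42);

    // The dimension count must match the vectors already stored (1024 here).
    let writer = Writer::new(database, index, 1024);
    let mut builder = writer.builder(&mut rng);

    // Step 1: delete everything that is not an item (hannoy's graph nodes,
    // metadata, ...) and flag the remaining vectors as updated.
    builder.prepare_hannoy_conversion(&mut wtxn)?;

    // Step 2: a regular build then recreates the arroy trees from those vectors.
    builder.build(&mut wtxn)?;
    wtxn.commit()?;
    Ok(())
}
```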