Skip to content

Commit 3085bcc

Browse files
feat: add builder method to trigger full rebuild (#114)
* feat: add builder method to trigger full rebuild
* Update src/writer.rs

Co-authored-by: Clément Renault <clement@meilisearch.com>

---------

Co-authored-by: Clément Renault <clement@meilisearch.com>
1 parent 895cb56 commit 3085bcc

File tree

2 files changed

+92
-3
lines changed

2 files changed

+92
-3
lines changed

src/tests/writer.rs

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use super::{create_database, rng};
1010
use crate::distance::{BinaryQuantizedCosine, Cosine, Euclidean};
1111
use crate::key::{KeyCodec, Prefix, PrefixCodec};
1212
use crate::reader::get_item;
13-
use crate::tests::DatabaseHandle;
13+
use crate::tests::{create_database_indices_with_items, DatabaseHandle};
1414
use crate::{Reader, Writer};
1515

1616
const M: usize = 3;
@@ -745,3 +745,27 @@ proptest! {
745745
writer.builder(&mut rng).build::<M, M0>(&mut wtxn).unwrap();
746746
}
747747
}
748+
749+
/// Forces a full rebuild on a freshly created database and checks that a nearest-neighbour
/// query by item still returns the requested number of results afterwards.
#[test]
fn test_force_rebuild_and_search() {
    const DIM: usize = 768;
    let mut rng = rng();

    // build the db
    //
    // The `tempdir` field is bound to a named variable on purpose: a `_` pattern does not bind,
    // so the field would be dropped at the end of this statement instead of at the end of the
    // test (presumably it is a temp-directory guard backing `env` — confirm in `DatabaseHandle`).
    let DatabaseHandle { env, database, tempdir: _tempdir } =
        create_database_indices_with_items::<Cosine, DIM, M, M0, _>(0..1, 100, &mut rng);

    // force rebuild the db
    let mut wtxn = env.write_txn().unwrap();
    let writer = Writer::new(database, 0, DIM);
    writer.builder(&mut rng).force_rebuild::<M, M0>(&mut wtxn).unwrap();
    wtxn.commit().unwrap();

    // check we can still read over it
    let rtxn = env.read_txn().unwrap();
    let reader = Reader::<Cosine>::open(&rtxn, 0, database).unwrap();

    let found = reader.nns(10).by_item(&rtxn, 0).unwrap().unwrap().into_nns();
    // `assert_eq!` prints both sides on failure, unlike `assert!(a == b)`.
    assert_eq!(found.len(), 10);
    assert!(!found.contains(&(0, 0.0)));
}

src/writer.rs

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,37 @@ impl<'a, D: Distance, R: Rng + SeedableRng, P> HannoyBuilder<'a, D, R, P> {
199199
self.writer.build::<R, P, M, M0>(wtxn, self.rng, &self.inner)
200200
}
201201

202+
/// Rebuilds an HNSW graph from scratch.
203+
///
204+
/// Assumes you've previously built one or more times. This function will drop all graph edges
205+
/// from previous builds and reconstruct the hnsw with the vectors found in the db.
206+
///
207+
/// Standard builds work by first adding or deleting some nodes, here we're marking all
208+
/// vectors found on disk as updated to force a rebuild. When in doubt prefer [`Self::build<M,M0>`] over
209+
/// this method.
210+
///
211+
/// # Example
212+
///
213+
/// ```no_run
214+
/// # use hannoy::{Writer, distances::Euclidean};
215+
/// # let (writer, wtxn): (Writer<Euclidean>, heed::RwTxn) = todo!();
216+
/// use rayon;
217+
/// use rand::rngs::StdRng;
218+
/// use rand::SeedableRng;
219+
///
220+
/// // configure global threadpool if you want!
221+
/// rayon::ThreadPoolBuilder::new().num_threads(4).build_global().unwrap();
222+
///
223+
/// let mut rng = StdRng::seed_from_u64(4729);
224+
/// writer.builder(&mut rng).force_rebuild::<16,32>(&mut wtxn);
225+
/// ```
226+
pub fn force_rebuild<const M: usize, const M0: usize>(&mut self, wtxn: &mut RwTxn) -> Result<()>
227+
where
228+
P: steppe::Progress,
229+
{
230+
self.writer.force_rebuild::<R, P, M, M0>(wtxn, self.rng, &self.inner)
231+
}
232+
202233
/// Converts an arroy db into a hannoy one.
203234
#[cfg(any(test, feature = "arroy"))]
204235
#[cfg_attr(docsrs, doc(cfg(feature = "arroy")))]
@@ -493,7 +524,7 @@ impl<D: Distance> Writer<D> {
493524
// Remove deleted links from lmdb AFTER build; in DiskANN we use a deleted item's
494525
// neighbours when filling in the "gaps" left in the graph from deletions. See
495526
// [`HnswBuilder::maybe_patch_old_links`] for more details.
496-
self.delete_links_from_db(to_delete, wtxn)?;
527+
self.delete_links_from_db(&to_delete, wtxn)?;
497528

498529
debug!("write the metadata...");
499530
options.progress.update(HannoyBuild::WriteTheMetadata);
@@ -519,6 +550,40 @@ impl<D: Distance> Writer<D> {
519550
Ok(())
520551
}
521552

553+
/// Kinda like clear and create, but only for links
554+
fn force_rebuild<R, P, const M: usize, const M0: usize>(
555+
&self,
556+
wtxn: &mut RwTxn,
557+
rng: &mut R,
558+
options: &BuildOption<P>,
559+
) -> Result<()>
560+
where
561+
R: Rng + SeedableRng,
562+
P: steppe::Progress,
563+
{
564+
// 1. delete metadata
565+
self.database.delete(wtxn, &Key::metadata(self.index))?;
566+
567+
// 2. delete version
568+
self.database.delete(wtxn, &Key::version(self.index))?;
569+
570+
// 3. delete all links
571+
let item_ids = self.item_indices(wtxn, options)?;
572+
self.delete_links_from_db(&item_ids, wtxn)?;
573+
574+
// 4. mark all nodes as updated
575+
for item_id in item_ids {
576+
self.database.remap_data_type::<Unit>().put(
577+
wtxn,
578+
&Key::updated(self.index, item_id),
579+
&(),
580+
)?;
581+
}
582+
583+
// 5. trigger build
584+
self.build::<R, P, M, M0>(wtxn, rng, options)
585+
}
586+
522587
fn reset_and_retrieve_updated_items<P>(
523588
&self,
524589
wtxn: &mut RwTxn,
@@ -586,7 +651,7 @@ impl<D: Distance> Writer<D> {
586651

587652
// Iterates over links in lmdb and deletes those in `to_delete`. There can be several links
588653
// with the same NodeId.item, each differing by their layer
589-
fn delete_links_from_db(&self, to_delete: RoaringBitmap, wtxn: &mut RwTxn) -> Result<()> {
654+
fn delete_links_from_db(&self, to_delete: &RoaringBitmap, wtxn: &mut RwTxn) -> Result<()> {
590655
let mut cursor = self
591656
.database
592657
.remap_key_type::<PrefixCodec>()

0 commit comments

Comments
 (0)