Skip to content

Commit 0ea5324

Browse files
fix: allow the previous graph to connect with the new one (#113)
* fix: allow the previous graph to connect with the new one * make biffer * test: enable e2e fuzzing * Change the fuzzer to run for a given time * Add a fuzzer job in the CI --------- Co-authored-by: Kerollmops <clement@meilisearch.com>
1 parent 3085bcc commit 0ea5324

File tree

4 files changed

+104
-36
lines changed

4 files changed

+104
-36
lines changed

.github/workflows/rust.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,3 +56,13 @@ jobs:
5656
if: always()
5757
with:
5858
command: doc
59+
60+
fuzzer:
61+
runs-on: ubuntu-latest
62+
steps:
63+
- uses: actions/checkout@v1
64+
- uses: dtolnay/rust-toolchain@1.85
65+
- name: Run fuzzer
66+
env:
67+
HANNOY_FUZZ_DURATION_SEC: 1800
68+
run: cargo test -- random_read_writes

benches/benchmark.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,9 @@ mod hnsw {
2929
}
3030

3131
fn create_db_and_fill_with_vecs<const DIM: usize>(
32-
env: &Env,
32+
env: &'_ Env,
3333
size: usize,
34-
) -> hannoy::Result<(Writer<Cosine>, RwTxn, Database<Cosine>)> {
34+
) -> hannoy::Result<(Writer<Cosine>, RwTxn<'_>, Database<Cosine>)> {
3535
let mut wtxn = env.write_txn().unwrap();
3636

3737
let db: Database<Cosine> = env.create_database(&mut wtxn, None)?;

src/hnsw.rs

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -185,14 +185,13 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>
185185
Ok(()) as Result<(), Error>
186186
})?;
187187

188-
self.maybe_patch_old_links(&lmdb, to_delete, options)?;
188+
self.fill_gaps_from_deleted(&lmdb, to_delete, options)?;
189189

190190
// Single-threaded write to lmdb
191191
options.progress.update(HannoyBuild::WritingTheItems);
192192
let mut cancellation_index = 0;
193193

194-
for lvl in 0..=self.max_level {
195-
let Some(map) = self.layers.get(lvl) else { break };
194+
for (lvl, map) in self.layers.iter().enumerate() {
196195
let map_guard = map.pin();
197196

198197
for (item_id, node_state) in &map_guard {
@@ -332,7 +331,7 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>
332331
/// the end of indexing we need to merge the old and new links and prune ones pointing to
333332
/// deleted items.
334333
/// Algorithm 4 from FreshDiskANN paper.
335-
fn maybe_patch_old_links<P>(
334+
fn fill_gaps_from_deleted<P>(
336335
&mut self,
337336
lmdb: &FrozenReader<D>,
338337
to_delete: &RoaringBitmap,
@@ -387,6 +386,7 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>
387386
}
388387
bitmap |= links;
389388
bitmap -= to_delete;
389+
debug_assert!(bitmap.is_disjoint(to_delete));
390390

391391
// Case 1: Union of [on_disk, current_build, deleted_extension] is small enough
392392
let thresh = if lvl == 0 { M0 } else { M };
@@ -404,10 +404,8 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>
404404
let curr = &lmdb.get_item(id)?;
405405

406406
for other in bitmap {
407-
if let Ok(v) = lmdb.get_item(other) {
408-
let dist = D::distance(curr, &v);
409-
new_links.push((OrderedFloat(dist), other));
410-
}
407+
let dist = D::distance(curr, &lmdb.get_item(other)?);
408+
new_links.push((OrderedFloat(dist), other));
411409
}
412410
let pruned = self.robust_prune(new_links, lvl, self.alpha, lmdb)?;
413411
let _ = map_guard.insert(id, NodeState { links: ArrayVec::from_iter(pruned) });
@@ -451,7 +449,7 @@ impl<'a, D: Distance, const M: usize, const M0: usize> HnswBuilder<'a, D, M, M0>
451449
Some(node_state) => res.extend(node_state.links.iter().map(|(_, i)| *i)),
452450
None => {
453451
// lazily add this entry so he can get updated later
454-
// pinned.insert(item_id, NodeState { links: array_vec![] });
452+
pinned.insert(item_id, NodeState { links: array_vec![] });
455453
}
456454
}
457455

src/tests/fuzz.rs

Lines changed: 85 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,30 @@
1+
use std::{
2+
env::VarError,
3+
time::{Duration, Instant},
4+
};
5+
16
use crate::{
27
distance::Cosine,
8+
key::{KeyCodec, Prefix, PrefixCodec},
9+
node::{Links, Node, NodeCodec},
10+
node_id::NodeMode,
311
tests::{create_database_indices_with_items, DatabaseHandle},
412
Database, Reader, Writer,
513
};
614
use arbitrary::{Arbitrary, Unstructured};
715
use heed::RoTxn;
8-
use rand::{self, rngs::StdRng, Rng, SeedableRng};
16+
use rand::{
17+
self,
18+
distributions::Uniform,
19+
rngs::{StdRng, ThreadRng},
20+
Rng, SeedableRng,
21+
};
922
use roaring::RoaringBitmap;
1023
use tracing::info;
1124

12-
#[derive(Debug)]
13-
struct Item<const M: usize> {
14-
id: u32,
15-
data: [f32; M],
16-
}
17-
18-
impl<'a, const M: usize> Arbitrary<'a> for Item<M> {
19-
fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {
20-
let data: [f32; M] = u.arbitrary()?;
21-
let id: u32 = u.arbitrary()?;
22-
23-
Ok(Item { data, id })
24-
}
25-
}
26-
2725
#[derive(Arbitrary, Debug)]
2826
enum WriteOp<const M: usize> {
29-
Add(Item<M>),
27+
Add(u32),
3028
Del(u32),
3129
}
3230

@@ -38,44 +36,106 @@ fn assert_all_readable<const DIM: usize>(rtxn: &RoTxn, database: Database<Cosine
3836
assert_eq!(&RoaringBitmap::from_iter(found.into_iter().map(|(id, _)| id)), reader.item_ids())
3937
}
4038

39+
fn assert_deleted_items_are_gone(
40+
rtxn: &RoTxn,
41+
database: Database<Cosine>,
42+
deleted: &RoaringBitmap,
43+
) {
44+
// assert the reader cannot find any deleted vectors
45+
let reader = Reader::<Cosine>::open(&rtxn, 0, database).unwrap();
46+
let item_intersection = deleted & reader.item_ids();
47+
assert!(item_intersection.is_empty(), "{:?} should be deleted!", item_intersection);
48+
49+
// now iter over ALL links and assert no connection exists to a deleted item
50+
let mut cursor = database
51+
.remap_types::<PrefixCodec, NodeCodec<Cosine>>()
52+
.prefix_iter(rtxn, &Prefix::links(0))
53+
.unwrap()
54+
.remap_key_type::<KeyCodec>();
55+
56+
while let Some((key, node)) = cursor.next().transpose().unwrap() {
57+
assert!(
58+
!deleted.contains(key.node.item),
59+
"the item and its data should be deleted!\n{:?}",
60+
&key
61+
);
62+
63+
match key.node.mode {
64+
NodeMode::Links => {
65+
if let Node::Links(Links { links: links_bitmap }) = node {
66+
let link_intersection = deleted & links_bitmap.as_ref();
67+
assert!(
68+
link_intersection.is_empty(),
69+
"LINKS VIOLATION: {:?} should be empty",
70+
link_intersection
71+
);
72+
}
73+
}
74+
_ => continue,
75+
}
76+
}
77+
}
78+
4179
#[test]
42-
#[ignore = "if working properly this should run infinitely"]
4380
fn random_read_writes() {
4481
let seed: u64 = rand::random();
4582
let mut rng = StdRng::seed_from_u64(seed);
4683

47-
const DIM: usize = 768;
84+
const DIM: usize = 32;
4885
const NUMEL: usize = 1000;
4986
const M: usize = 16;
50-
const M0: usize = 32;
87+
const M0: usize = 768;
5188

5289
let DatabaseHandle { env, database, tempdir: _ } =
5390
create_database_indices_with_items::<Cosine, DIM, M, M0, _>(0..1, NUMEL, &mut rng);
5491

55-
for _ in 0.. {
92+
let mut deleted = RoaringBitmap::new();
93+
let mut vec_rng = rand::thread_rng();
94+
95+
// util for generating new vectors on the fly
96+
fn gen_vec(rng: &mut ThreadRng) -> [f32; DIM] {
97+
let unif = Uniform::new(-1.0, 1.0);
98+
std::array::from_fn(|_| rng.sample(unif))
99+
}
100+
101+
let duration = match std::env::var("HANNOY_FUZZ_DURATION_SEC") {
102+
Ok(value) => Duration::from_secs(value.parse().expect("valid number of seconds")),
103+
Err(VarError::NotPresent) => Duration::from_secs(20),
104+
Err(VarError::NotUnicode(e)) => panic!("Invalid duration: {e:?}"),
105+
};
106+
107+
let before = Instant::now();
108+
while before.elapsed() < duration {
56109
let rtxn = env.read_txn().unwrap();
57110
assert_all_readable::<DIM>(&rtxn, database);
111+
assert_deleted_items_are_gone(&rtxn, database, &deleted);
112+
deleted.clear();
58113

59114
// get batch of write operations and apply them
60115
info!("WRITING");
61116
let mut data = [0_u8; 1024 * 1024 * 1];
62117
rng.fill(&mut data);
63118
let mut u = Unstructured::new(&data);
64-
let ops: Vec<WriteOp<DIM>> = (0..1000).map(|_| u.arbitrary().unwrap()).collect();
119+
let ops: Vec<WriteOp<DIM>> = (0..100).map(|_| u.arbitrary().unwrap()).collect();
65120

66121
let mut wtxn = env.write_txn().unwrap();
67122
let writer = Writer::new(database, 0, DIM);
68123

69124
for op in ops {
70125
match op {
71-
WriteOp::Add(item) => {
72-
let Item { data, id } = item;
126+
WriteOp::Add(id) => {
73127
let id = id % (NUMEL as u32);
74-
writer.add_item(&mut wtxn, id, &data).unwrap();
128+
let vector = gen_vec(&mut vec_rng);
129+
assert!(vector != [0.0f32; DIM]);
130+
writer.add_item(&mut wtxn, id, &vector).unwrap();
131+
132+
// ensure added random id isn't registered in deleted
133+
let _ = deleted.remove(id);
75134
}
76135
WriteOp::Del(id) => {
77136
let id = id % (NUMEL as u32);
78137
let _ = writer.del_item(&mut wtxn, id).unwrap();
138+
deleted.insert(id);
79139
}
80140
}
81141
}

0 commit comments

Comments
 (0)