Skip to content

Commit 92c1ddd

Browse files
committed
final version of the nb tree guesser
1 parent 906f673 commit 92c1ddd

File tree

2 files changed

+47
-56
lines changed

2 files changed

+47
-56
lines changed

src/tests/writer.rs

Lines changed: 38 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -48,34 +48,34 @@ fn guess_right_number_of_tree_while_growing() {
4848
let quick_target = |dim, bitmap| target_n_trees(&BuildOption::default(), dim, bitmap, &[]);
4949

5050
assert_snapshot!(quick_target(768, &b1), @"1");
51-
assert_snapshot!(quick_target(768, &b10), @"10");
52-
assert_snapshot!(quick_target(768, &b100), @"60");
53-
assert_snapshot!(quick_target(768, &b1000), @"119");
51+
assert_snapshot!(quick_target(768, &b10), @"1");
52+
assert_snapshot!(quick_target(768, &b100), @"2");
53+
assert_snapshot!(quick_target(768, &b1000), @"16");
5454
assert_snapshot!(quick_target(768, &b10_000), @"237");
5555
assert_snapshot!(quick_target(768, &b100_000), @"473");
5656
assert_snapshot!(quick_target(768, &b1_000_000), @"946");
5757
assert_snapshot!(quick_target(768, &b10_000_000), @"1892");
5858
assert_snapshot!(quick_target(768, &b100_000_000), @"3784");
5959

6060
assert_snapshot!(quick_target(1512, &b1), @"1");
61-
assert_snapshot!(quick_target(1512, &b10), @"10");
62-
assert_snapshot!(quick_target(1512, &b100), @"73");
63-
assert_snapshot!(quick_target(1512, &b1000), @"145");
64-
assert_snapshot!(quick_target(1512, &b10_000), @"290");
65-
assert_snapshot!(quick_target(1512, &b100_000), @"580");
66-
assert_snapshot!(quick_target(1512, &b1_000_000), @"1160");
67-
assert_snapshot!(quick_target(1512, &b10_000_000), @"2320");
68-
assert_snapshot!(quick_target(1512, &b100_000_000), @"4639");
61+
assert_snapshot!(quick_target(1512, &b10), @"1");
62+
assert_snapshot!(quick_target(1512, &b100), @"2");
63+
assert_snapshot!(quick_target(1512, &b1000), @"16");
64+
assert_snapshot!(quick_target(1512, &b10_000), @"152");
65+
assert_snapshot!(quick_target(1512, &b100_000), @"304");
66+
assert_snapshot!(quick_target(1512, &b1_000_000), @"608");
67+
assert_snapshot!(quick_target(1512, &b10_000_000), @"1215");
68+
assert_snapshot!(quick_target(1512, &b100_000_000), @"2429");
6969

7070
assert_snapshot!(quick_target(3072, &b1), @"1");
71-
assert_snapshot!(quick_target(3072, &b10), @"10");
72-
assert_snapshot!(quick_target(3072, &b100), @"90");
73-
assert_snapshot!(quick_target(3072, &b1000), @"180");
74-
assert_snapshot!(quick_target(3072, &b10_000), @"359");
75-
assert_snapshot!(quick_target(3072, &b100_000), @"718");
76-
assert_snapshot!(quick_target(3072, &b1_000_000), @"1436");
77-
assert_snapshot!(quick_target(3072, &b10_000_000), @"2872");
78-
assert_snapshot!(quick_target(3072, &b100_000_000), @"5743");
71+
assert_snapshot!(quick_target(3072, &b10), @"1");
72+
assert_snapshot!(quick_target(3072, &b100), @"2");
73+
assert_snapshot!(quick_target(3072, &b1000), @"16");
74+
assert_snapshot!(quick_target(3072, &b10_000), @"180");
75+
assert_snapshot!(quick_target(3072, &b100_000), @"360");
76+
assert_snapshot!(quick_target(3072, &b1_000_000), @"720");
77+
assert_snapshot!(quick_target(3072, &b10_000_000), @"1440");
78+
assert_snapshot!(quick_target(3072, &b100_000_000), @"2879");
7979
}
8080

8181
#[ignore = "strange test"]
@@ -980,23 +980,11 @@ fn delete_extraneous_tree() {
980980
insta::assert_snapshot!(handle, @r#"
981981
==================
982982
Dumping index 0
983-
Root: Metadata { dimensions: 4, items: RoaringBitmap<[0, 1, 2, 3, 4]>, roots: [0, 1, 2, 3, 4], distance: "euclidean" }
983+
Root: Metadata { dimensions: 4, items: RoaringBitmap<[0, 1, 2, 3, 4]>, roots: [0], distance: "euclidean" }
984984
Version: Version { major: 0, minor: 7, patch: 0 }
985-
Tree 0: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 13, right: 14, normal: Leaf { header: NodeHeaderEuclidean { bias: "1.5952" }, vector: [-1.0000, 0.0000, 0.0000, 0.0000] } })
986-
Tree 1: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 11, right: 12, normal: Leaf { header: NodeHeaderEuclidean { bias: "-2.2778" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] } })
987-
Tree 2: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 9, right: 10, normal: Leaf { header: NodeHeaderEuclidean { bias: "-2.3125" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] } })
988-
Tree 3: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 7, right: 8, normal: Leaf { header: NodeHeaderEuclidean { bias: "-1.8857" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] } })
989-
Tree 4: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 5, right: 6, normal: Leaf { header: NodeHeaderEuclidean { bias: "1.7500" }, vector: [-1.0000, 0.0000, 0.0000, 0.0000] } })
990-
Tree 5: Descendants(Descendants { descendants: [2, 3, 4] })
991-
Tree 6: Descendants(Descendants { descendants: [0, 1] })
992-
Tree 7: Descendants(Descendants { descendants: [0, 1] })
993-
Tree 8: Descendants(Descendants { descendants: [2, 3, 4] })
994-
Tree 9: Descendants(Descendants { descendants: [0, 1, 2] })
995-
Tree 10: Descendants(Descendants { descendants: [3, 4] })
996-
Tree 11: Descendants(Descendants { descendants: [0, 1, 2] })
997-
Tree 12: Descendants(Descendants { descendants: [3, 4] })
998-
Tree 13: Descendants(Descendants { descendants: [2, 3, 4] })
999-
Tree 14: Descendants(Descendants { descendants: [0, 1] })
985+
Tree 0: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 1, right: 2, normal: Leaf { header: NodeHeaderEuclidean { bias: "1.5952" }, vector: [-1.0000, 0.0000, 0.0000, 0.0000] } })
986+
Tree 1: Descendants(Descendants { descendants: [2, 3, 4] })
987+
Tree 2: Descendants(Descendants { descendants: [0, 1] })
1000988
Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: "0.0000" }, vector: [0.0000, 0.0000, 0.0000, 0.0000] })
1001989
Item 1: Leaf(Leaf { header: NodeHeaderEuclidean { bias: "0.0000" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] })
1002990
Item 2: Leaf(Leaf { header: NodeHeaderEuclidean { bias: "0.0000" }, vector: [2.0000, 0.0000, 0.0000, 0.0000] })
@@ -1012,14 +1000,16 @@ fn delete_extraneous_tree() {
10121000
insta::assert_snapshot!(handle, @r#"
10131001
==================
10141002
Dumping index 0
1015-
Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1, 2, 3, 4]>, roots: [1, 2], distance: "euclidean" }
1003+
Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1, 2, 3, 4]>, roots: [0, 3], distance: "euclidean" }
10161004
Version: Version { major: 0, minor: 7, patch: 0 }
1017-
Tree 1: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 11, right: 12, normal: Leaf { header: NodeHeaderEuclidean { bias: "-2.2778" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] } })
1018-
Tree 2: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 9, right: 10, normal: Leaf { header: NodeHeaderEuclidean { bias: "-2.3125" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] } })
1019-
Tree 9: Descendants(Descendants { descendants: [0, 1, 2] })
1020-
Tree 10: Descendants(Descendants { descendants: [3, 4] })
1021-
Tree 11: Descendants(Descendants { descendants: [0, 1, 2] })
1022-
Tree 12: Descendants(Descendants { descendants: [3, 4] })
1005+
Tree 0: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 1, right: 2, normal: Leaf { header: NodeHeaderEuclidean { bias: "1.5952" }, vector: [-1.0000, 0.0000, 0.0000, 0.0000] } })
1006+
Tree 1: Descendants(Descendants { descendants: [2, 3, 4] })
1007+
Tree 2: Descendants(Descendants { descendants: [0, 1] })
1008+
Tree 3: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 6, right: 7, normal: Leaf { header: NodeHeaderEuclidean { bias: "-2.1857" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] } })
1009+
Tree 4: Descendants(Descendants { descendants: [0] })
1010+
Tree 5: Descendants(Descendants { descendants: [1, 2] })
1011+
Tree 6: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 4, right: 5, normal: Leaf { header: NodeHeaderEuclidean { bias: "-0.6000" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] } })
1012+
Tree 7: Descendants(Descendants { descendants: [3, 4] })
10231013
Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: "0.0000" }, vector: [0.0000, 0.0000, 0.0000, 0.0000] })
10241014
Item 1: Leaf(Leaf { header: NodeHeaderEuclidean { bias: "0.0000" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] })
10251015
Item 2: Leaf(Leaf { header: NodeHeaderEuclidean { bias: "0.0000" }, vector: [2.0000, 0.0000, 0.0000, 0.0000] })
@@ -1035,11 +1025,13 @@ fn delete_extraneous_tree() {
10351025
insta::assert_snapshot!(handle, @r#"
10361026
==================
10371027
Dumping index 0
1038-
Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1, 2, 3, 4]>, roots: [2], distance: "euclidean" }
1028+
Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1, 2, 3, 4]>, roots: [3], distance: "euclidean" }
10391029
Version: Version { major: 0, minor: 7, patch: 0 }
1040-
Tree 2: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 9, right: 10, normal: Leaf { header: NodeHeaderEuclidean { bias: "-2.3125" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] } })
1041-
Tree 9: Descendants(Descendants { descendants: [0, 1, 2] })
1042-
Tree 10: Descendants(Descendants { descendants: [3, 4] })
1030+
Tree 3: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 6, right: 7, normal: Leaf { header: NodeHeaderEuclidean { bias: "-2.1857" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] } })
1031+
Tree 4: Descendants(Descendants { descendants: [0] })
1032+
Tree 5: Descendants(Descendants { descendants: [1, 2] })
1033+
Tree 6: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 4, right: 5, normal: Leaf { header: NodeHeaderEuclidean { bias: "-0.6000" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] } })
1034+
Tree 7: Descendants(Descendants { descendants: [3, 4] })
10431035
Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: "0.0000" }, vector: [0.0000, 0.0000, 0.0000, 0.0000] })
10441036
Item 1: Leaf(Leaf { header: NodeHeaderEuclidean { bias: "0.0000" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] })
10451037
Item 2: Leaf(Leaf { header: NodeHeaderEuclidean { bias: "0.0000" }, vector: [2.0000, 0.0000, 0.0000, 0.0000] })

src/writer.rs

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1364,16 +1364,15 @@ pub(crate) fn target_n_trees(
13641364
Some(n) => n as u64,
13651365
// If no tree has been built yet, we can roughly guess how many trees we want to build in total.
13661366
None => {
1367-
// We notice that increasing the dataset size by an order of magnitude requires
1368-
// doubling the number of trees to saturate recall
1369-
// That relation looks like: n_trees = 2^{log10(item_indices.len()) + b}, with an adjustment
1370-
// factor b to center the trees.
1371-
//
1372-
// To account for different embedding dimensions we notice that most providers offer
1373-
// embedings on ~O(10^3) and let `b` = log10(dim) + 1
1374-
let exp = (item_indices.len() as f64).log10() + (dimensions as f64).log10() + 1.0;
1375-
let mut nb_trees = 2f64.powf(exp).ceil() as u64;
1376-
nb_trees = nb_trees.min(item_indices.len());
1367+
// See https://github.com/meilisearch/guess-right-number-of-trees for more details on how we got this formula.
1368+
1369+
let nb_vec = item_indices.len() as f64;
1370+
let nb_trees = if nb_vec < 10_000. {
1371+
2.0_f64.powf(nb_vec.log2() - 6.0)
1372+
} else {
1373+
2.0_f64.powf(nb_vec.log10() + (dimensions as f64).log10() + (768.0 / dimensions as f64).powf(4.0))
1374+
};
1375+
let mut nb_trees = nb_trees.ceil() as u64;
13771376

13781377
// We don't want to shrink too quickly when a user removes some documents.
13791378
// We're only going to shrink if we should remove more than 20% of our trees.

0 commit comments

Comments
 (0)