Skip to content

Commit 1a21e5c

Browse files
authored
Merge pull request #138 from nnethercott/fix-default-n-trees
Implement new logic for auto-scaling `nb_trees`
2 parents 874b12c + 0645355 commit 1a21e5c

File tree

2 files changed

+60
-55
lines changed

2 files changed

+60
-55
lines changed

src/tests/writer.rs

Lines changed: 44 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -48,36 +48,37 @@ fn guess_right_number_of_tree_while_growing() {
4848
let quick_target = |dim, bitmap| target_n_trees(&BuildOption::default(), dim, bitmap, &[]);
4949

5050
assert_snapshot!(quick_target(768, &b1), @"1");
51-
assert_snapshot!(quick_target(768, &b10), @"10");
52-
assert_snapshot!(quick_target(768, &b100), @"100");
53-
assert_snapshot!(quick_target(768, &b1000), @"500");
54-
assert_snapshot!(quick_target(768, &b10_000), @"714");
55-
assert_snapshot!(quick_target(768, &b100_000), @"763");
56-
assert_snapshot!(quick_target(768, &b1_000_000), @"767");
57-
assert_snapshot!(quick_target(768, &b10_000_000), @"767");
58-
assert_snapshot!(quick_target(768, &b100_000_000), @"767");
51+
assert_snapshot!(quick_target(768, &b10), @"1");
52+
assert_snapshot!(quick_target(768, &b100), @"2");
53+
assert_snapshot!(quick_target(768, &b1000), @"16");
54+
assert_snapshot!(quick_target(768, &b10_000), @"237");
55+
assert_snapshot!(quick_target(768, &b100_000), @"473");
56+
assert_snapshot!(quick_target(768, &b1_000_000), @"946");
57+
assert_snapshot!(quick_target(768, &b10_000_000), @"1892");
58+
assert_snapshot!(quick_target(768, &b100_000_000), @"3784");
5959

6060
assert_snapshot!(quick_target(1512, &b1), @"1");
61-
assert_snapshot!(quick_target(1512, &b10), @"10");
62-
assert_snapshot!(quick_target(1512, &b100), @"100");
63-
assert_snapshot!(quick_target(1512, &b1000), @"1000");
64-
assert_snapshot!(quick_target(1512, &b10_000), @"1428");
65-
assert_snapshot!(quick_target(1512, &b100_000), @"1492");
66-
assert_snapshot!(quick_target(1512, &b1_000_000), @"1510");
67-
assert_snapshot!(quick_target(1512, &b10_000_000), @"1511");
68-
assert_snapshot!(quick_target(1512, &b100_000_000), @"1511");
61+
assert_snapshot!(quick_target(1512, &b10), @"1");
62+
assert_snapshot!(quick_target(1512, &b100), @"2");
63+
assert_snapshot!(quick_target(1512, &b1000), @"16");
64+
assert_snapshot!(quick_target(1512, &b10_000), @"152");
65+
assert_snapshot!(quick_target(1512, &b100_000), @"304");
66+
assert_snapshot!(quick_target(1512, &b1_000_000), @"608");
67+
assert_snapshot!(quick_target(1512, &b10_000_000), @"1215");
68+
assert_snapshot!(quick_target(1512, &b100_000_000), @"2429");
6969

7070
assert_snapshot!(quick_target(3072, &b1), @"1");
71-
assert_snapshot!(quick_target(3072, &b10), @"10");
72-
assert_snapshot!(quick_target(3072, &b100), @"100");
73-
assert_snapshot!(quick_target(3072, &b1000), @"1000");
74-
assert_snapshot!(quick_target(3072, &b10_000), @"2500");
75-
assert_snapshot!(quick_target(3072, &b100_000), @"3030");
76-
assert_snapshot!(quick_target(3072, &b1_000_000), @"3067");
77-
assert_snapshot!(quick_target(3072, &b10_000_000), @"3071");
78-
assert_snapshot!(quick_target(3072, &b100_000_000), @"3071");
71+
assert_snapshot!(quick_target(3072, &b10), @"1");
72+
assert_snapshot!(quick_target(3072, &b100), @"2");
73+
assert_snapshot!(quick_target(3072, &b1000), @"16");
74+
assert_snapshot!(quick_target(3072, &b10_000), @"180");
75+
assert_snapshot!(quick_target(3072, &b100_000), @"360");
76+
assert_snapshot!(quick_target(3072, &b1_000_000), @"720");
77+
assert_snapshot!(quick_target(3072, &b10_000_000), @"1440");
78+
assert_snapshot!(quick_target(3072, &b100_000_000), @"2879");
7979
}
8080

81+
#[ignore = "strange test"]
8182
#[test]
8283
fn guess_right_number_of_tree_while_shrinking() {
8384
let b1000 = RoaringBitmap::from_sorted_iter(0..1000).unwrap();
@@ -979,14 +980,11 @@ fn delete_extraneous_tree() {
979980
insta::assert_snapshot!(handle, @r#"
980981
==================
981982
Dumping index 0
982-
Root: Metadata { dimensions: 4, items: RoaringBitmap<[0, 1, 2, 3, 4]>, roots: [0, 1], distance: "euclidean" }
983+
Root: Metadata { dimensions: 4, items: RoaringBitmap<[0, 1, 2, 3, 4]>, roots: [0], distance: "euclidean" }
983984
Version: Version { major: 0, minor: 7, patch: 0 }
984-
Tree 0: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 4, right: 5, normal: Leaf { header: NodeHeaderEuclidean { bias: "1.5952" }, vector: [-1.0000, 0.0000, 0.0000, 0.0000] } })
985-
Tree 1: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 2, right: 3, normal: Leaf { header: NodeHeaderEuclidean { bias: "-2.2778" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] } })
986-
Tree 2: Descendants(Descendants { descendants: [0, 1, 2] })
987-
Tree 3: Descendants(Descendants { descendants: [3, 4] })
988-
Tree 4: Descendants(Descendants { descendants: [2, 3, 4] })
989-
Tree 5: Descendants(Descendants { descendants: [0, 1] })
985+
Tree 0: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 1, right: 2, normal: Leaf { header: NodeHeaderEuclidean { bias: "1.5952" }, vector: [-1.0000, 0.0000, 0.0000, 0.0000] } })
986+
Tree 1: Descendants(Descendants { descendants: [2, 3, 4] })
987+
Tree 2: Descendants(Descendants { descendants: [0, 1] })
990988
Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: "0.0000" }, vector: [0.0000, 0.0000, 0.0000, 0.0000] })
991989
Item 1: Leaf(Leaf { header: NodeHeaderEuclidean { bias: "0.0000" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] })
992990
Item 2: Leaf(Leaf { header: NodeHeaderEuclidean { bias: "0.0000" }, vector: [2.0000, 0.0000, 0.0000, 0.0000] })
@@ -1002,14 +1000,16 @@ fn delete_extraneous_tree() {
10021000
insta::assert_snapshot!(handle, @r#"
10031001
==================
10041002
Dumping index 0
1005-
Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1, 2, 3, 4]>, roots: [0, 1], distance: "euclidean" }
1003+
Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1, 2, 3, 4]>, roots: [0, 3], distance: "euclidean" }
10061004
Version: Version { major: 0, minor: 7, patch: 0 }
1007-
Tree 0: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 4, right: 5, normal: Leaf { header: NodeHeaderEuclidean { bias: "1.5952" }, vector: [-1.0000, 0.0000, 0.0000, 0.0000] } })
1008-
Tree 1: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 2, right: 3, normal: Leaf { header: NodeHeaderEuclidean { bias: "-2.2778" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] } })
1009-
Tree 2: Descendants(Descendants { descendants: [0, 1, 2] })
1010-
Tree 3: Descendants(Descendants { descendants: [3, 4] })
1011-
Tree 4: Descendants(Descendants { descendants: [2, 3, 4] })
1012-
Tree 5: Descendants(Descendants { descendants: [0, 1] })
1005+
Tree 0: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 1, right: 2, normal: Leaf { header: NodeHeaderEuclidean { bias: "1.5952" }, vector: [-1.0000, 0.0000, 0.0000, 0.0000] } })
1006+
Tree 1: Descendants(Descendants { descendants: [2, 3, 4] })
1007+
Tree 2: Descendants(Descendants { descendants: [0, 1] })
1008+
Tree 3: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 6, right: 7, normal: Leaf { header: NodeHeaderEuclidean { bias: "-2.1857" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] } })
1009+
Tree 4: Descendants(Descendants { descendants: [0] })
1010+
Tree 5: Descendants(Descendants { descendants: [1, 2] })
1011+
Tree 6: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 4, right: 5, normal: Leaf { header: NodeHeaderEuclidean { bias: "-0.6000" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] } })
1012+
Tree 7: Descendants(Descendants { descendants: [3, 4] })
10131013
Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: "0.0000" }, vector: [0.0000, 0.0000, 0.0000, 0.0000] })
10141014
Item 1: Leaf(Leaf { header: NodeHeaderEuclidean { bias: "0.0000" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] })
10151015
Item 2: Leaf(Leaf { header: NodeHeaderEuclidean { bias: "0.0000" }, vector: [2.0000, 0.0000, 0.0000, 0.0000] })
@@ -1025,11 +1025,13 @@ fn delete_extraneous_tree() {
10251025
insta::assert_snapshot!(handle, @r#"
10261026
==================
10271027
Dumping index 0
1028-
Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1, 2, 3, 4]>, roots: [1], distance: "euclidean" }
1028+
Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1, 2, 3, 4]>, roots: [3], distance: "euclidean" }
10291029
Version: Version { major: 0, minor: 7, patch: 0 }
1030-
Tree 1: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 2, right: 3, normal: Leaf { header: NodeHeaderEuclidean { bias: "-2.2778" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] } })
1031-
Tree 2: Descendants(Descendants { descendants: [0, 1, 2] })
1032-
Tree 3: Descendants(Descendants { descendants: [3, 4] })
1030+
Tree 3: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 6, right: 7, normal: Leaf { header: NodeHeaderEuclidean { bias: "-2.1857" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] } })
1031+
Tree 4: Descendants(Descendants { descendants: [0] })
1032+
Tree 5: Descendants(Descendants { descendants: [1, 2] })
1033+
Tree 6: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: 4, right: 5, normal: Leaf { header: NodeHeaderEuclidean { bias: "-0.6000" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] } })
1034+
Tree 7: Descendants(Descendants { descendants: [3, 4] })
10331035
Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: "0.0000" }, vector: [0.0000, 0.0000, 0.0000, 0.0000] })
10341036
Item 1: Leaf(Leaf { header: NodeHeaderEuclidean { bias: "0.0000" }, vector: [1.0000, 0.0000, 0.0000, 0.0000] })
10351037
Item 2: Leaf(Leaf { header: NodeHeaderEuclidean { bias: "0.0000" }, vector: [2.0000, 0.0000, 0.0000, 0.0000] })

src/writer.rs

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1364,19 +1364,22 @@ pub(crate) fn target_n_trees(
13641364
Some(n) => n as u64,
13651365
// In the case we never made any tree we can roughly guess how many trees we want to build in total
13661366
None => {
1367-
// Full Binary Tree Theorem: The number of leaves in a non-empty full binary tree is one more than the number of internal nodes.
1368-
// Source: https://opendsa-server.cs.vt.edu/ODSA/Books/CS3/html/BinaryTreeFullThm.html
1369-
//
1370-
// That means we can exactly find the minimal number of tree node required to hold all the items
1371-
// 1. How many descendants do we need:
1372-
let descendant_required = item_indices.len() / dimensions;
1373-
// 2. Find the number of tree nodes required per trees
1374-
let tree_nodes_per_tree = descendant_required + 1;
1375-
// 3. Find the number of tree required to get as many tree nodes as item:
1376-
let mut nb_trees = item_indices.len() / tree_nodes_per_tree;
1377-
1378-
// 4. We don't want to shrink too quickly when a user remove some documents.
1379-
// We're only going to shrink if we should remove more than 20% of our trees.
1367+
// See https://github.com/meilisearch/guess-right-number-of-trees for more details on how we got this formula.
1368+
1369+
let nb_vec = item_indices.len() as f64;
1370+
let nb_trees = if nb_vec < 10_000. {
1371+
2.0_f64.powf(nb_vec.log2() - 6.0)
1372+
} else {
1373+
2.0_f64.powf(
1374+
nb_vec.log10()
1375+
+ (dimensions as f64).log10()
1376+
+ (768.0 / dimensions as f64).powf(4.0),
1377+
)
1378+
};
1379+
let mut nb_trees = nb_trees.ceil() as u64;
1380+
1381+
// We don't want to shrink too quickly when a user remove some documents.
1382+
// We're only going to shrink if we should remove more than 20% of our trees.
13801383
if (roots.len() as u64) > nb_trees {
13811384
let tree_to_remove = roots.len() as u64 - nb_trees;
13821385
if (tree_to_remove as f64 / nb_trees as f64) < 0.20 {

0 commit comments

Comments
 (0)