Skip to content

Commit 73a3db9

Browse files
committed
LUCENE-10288: Check BKD tree shape for lucene pre-8.6 1D indexes (#607)
Adds efficient logic to determine whether a BKD tree is balanced or unbalanced for indexes created before Lucene 8.6.
1 parent 00a7d5f commit 73a3db9

File tree

1 file changed

+63
-7
lines changed

1 file changed

+63
-7
lines changed

lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java

Lines changed: 63 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ public class BKDReader extends PointValues {
4444
final long minLeafBlockFP;
4545

4646
final IndexInput packedIndex;
47+
// if true, the tree is a legacy balanced tree
48+
private final boolean isTreeBalanced;
4749

4850
/**
4951
* Caller must pre-seek the provided {@link IndexInput} to the index location that {@link
@@ -105,6 +107,52 @@ public BKDReader(IndexInput metaIn, IndexInput indexIn, IndexInput dataIn) throw
105107
}
106108
this.packedIndex = indexIn.slice("packedIndex", indexStartPointer, numIndexBytes);
107109
this.in = dataIn;
110+
// for only one leaf, balanced and unbalanced trees can be handled the same way
111+
// we set it to unbalanced.
112+
this.isTreeBalanced = numLeaves != 1 && isTreeBalanced();
113+
}
114+
115+
/**
 * Determines whether the tree read by this reader should be treated as a legacy balanced tree.
 *
 * <p>Decision order, as implemented below:
 *
 * <ol>
 *   <li>{@code version >= BKDWriter.VERSION_META_FILE}: never balanced.
 *   <li>{@code config.numDims > 1} (older format): always balanced.
 *   <li>One dimension, older format: unbalanced if {@code 1 << MathUtil.log(numLeaves, 2)} differs
 *       from {@code numLeaves}; otherwise the last leaf is visited and its doc ID count is
 *       compared with {@code pointCount % config.maxPointsInLeafNode}, the count expected for an
 *       unbalanced tree. A mismatch means the tree is balanced.
 * </ol>
 *
 * <p>NOTE(review): this method is invoked while the {@code isTreeBalanced} field is being
 * assigned, so the {@link PointTree} obtained from {@code getPointTree()} here is created with the
 * field's default value ({@code false}). In the visible code that flag is only consulted when
 * computing sizes, not when navigating or visiting doc IDs -- confirm against the full class.
 *
 * @return {@code true} if the tree is a legacy balanced tree, {@code false} otherwise
 * @throws IOException propagated from obtaining, navigating or visiting the point tree
 */
private boolean isTreeBalanced() throws IOException {
116+
if (version >= BKDWriter.VERSION_META_FILE) {
117+
// since Lucene 8.6 all trees are unbalanced.
118+
return false;
119+
}
120+
if (config.numDims > 1) {
121+
// high-dimensional trees in pre-8.6 indices are balanced.
122+
assert 1 << MathUtil.log(numLeaves, 2) == numLeaves;
123+
return true;
124+
}
125+
if (1 << MathUtil.log(numLeaves, 2) != numLeaves) {
126+
// if we don't have enough leaves to fill the last level then the tree is unbalanced
127+
return false;
128+
}
129+
// number of points the last leaf would hold if the tree were unbalanced
130+
final int lastLeafNodePointCount = Math.toIntExact(pointCount % config.maxPointsInLeafNode);
131+
// navigate to the last leaf: at each level move to the last sibling, then descend
132+
PointTree pointTree = getPointTree();
133+
do {
134+
while (pointTree.moveToSibling()) {}
135+
} while (pointTree.moveToChild());
136+
// count the number of doc IDs in that leaf (single-element array so the anonymous class can mutate it)
137+
final int[] count = new int[] {0};
138+
pointTree.visitDocIDs(
139+
new IntersectVisitor() {
140+
@Override
141+
public void visit(int docID) {
142+
count[0]++;
143+
}
144+
145+
@Override
146+
public void visit(int docID, byte[] packedValue) {
147+
throw new AssertionError();
148+
}
149+
150+
@Override
151+
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
152+
throw new AssertionError();
153+
}
154+
});
155+
// a count that differs from the unbalanced expectation means the tree is balanced
return count[0] != lastLeafNodePointCount;
108156
}
109157

110158
@Override
@@ -117,7 +165,8 @@ public PointTree getPointTree() throws IOException {
117165
version,
118166
pointCount,
119167
minPackedValue,
120-
maxPackedValue);
168+
maxPackedValue,
169+
isTreeBalanced);
121170
}
122171

123172
private static class BKDPointTree implements PointTree {
@@ -168,6 +217,8 @@ private static class BKDPointTree implements PointTree {
168217
scratchMaxIndexPackedValue;
169218
private final int[] commonPrefixLengths;
170219
private final BKDReaderDocIDSetIterator scratchIterator;
220+
// if true the tree is balanced, otherwise unbalanced
221+
private final boolean isTreeBalanced;
171222

172223
private BKDPointTree(
173224
IndexInput innerNodes,
@@ -177,7 +228,8 @@ private BKDPointTree(
177228
int version,
178229
long pointCount,
179230
byte[] minPackedValue,
180-
byte[] maxPackedValue)
231+
byte[] maxPackedValue,
232+
boolean isTreeBalanced)
181233
throws IOException {
182234
this(
183235
innerNodes,
@@ -194,7 +246,8 @@ private BKDPointTree(
194246
new byte[config.packedBytesLength],
195247
new byte[config.packedIndexBytesLength],
196248
new byte[config.packedIndexBytesLength],
197-
new int[config.numDims]);
249+
new int[config.numDims],
250+
isTreeBalanced);
198251
// read root node
199252
readNodeData(false);
200253
}
@@ -214,12 +267,14 @@ private BKDPointTree(
214267
byte[] scratchDataPackedValue,
215268
byte[] scratchMinIndexPackedValue,
216269
byte[] scratchMaxIndexPackedValue,
217-
int[] commonPrefixLengths) {
270+
int[] commonPrefixLengths,
271+
boolean isTreeBalanced) {
218272
this.config = config;
219273
this.version = version;
220274
this.nodeID = nodeID;
221275
this.nodeRoot = nodeID;
222276
this.level = level;
277+
this.isTreeBalanced = isTreeBalanced;
223278
leafNodeOffset = numLeaves;
224279
this.innerNodes = innerNodes;
225280
this.leafNodes = leafNodes;
@@ -268,7 +323,8 @@ public PointTree clone() {
268323
scratchDataPackedValue,
269324
scratchMinIndexPackedValue,
270325
scratchMaxIndexPackedValue,
271-
commonPrefixLengths);
326+
commonPrefixLengths,
327+
isTreeBalanced);
272328
index.leafBlockFPStack[index.level] = leafBlockFPStack[level];
273329
if (isLeafNode() == false) {
274330
// copy node data
@@ -452,8 +508,8 @@ public long size() {
452508
numLeaves = rightMostLeafNode - leftMostLeafNode + 1 + leafNodeOffset;
453509
}
454510
assert numLeaves == getNumLeavesSlow(nodeID) : numLeaves + " " + getNumLeavesSlow(nodeID);
455-
if (version < BKDWriter.VERSION_META_FILE && config.numDims > 1) {
456-
// before lucene 8.6, high dimensional trees were constructed as fully balanced trees.
511+
if (isTreeBalanced) {
512+
// before lucene 8.6, trees might have been constructed as fully balanced trees.
457513
return sizeFromBalancedTree(leftMostLeafNode, rightMostLeafNode);
458514
}
459515
// size for an unbalanced tree.

0 commit comments

Comments
 (0)