Skip to content

Commit f6d0f92

Browse files
Optimize TopScoreDocCollector with TernaryLongHeap
1 parent 839425e commit f6d0f92

File tree

5 files changed

+422
-4
lines changed

5 files changed

+422
-4
lines changed

lucene/CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,8 @@ API Changes
130130
instance instead of a Bits instance to identify document IDs to filter.
131131
(Shubham Chaudhary, Adrien Grand)
132132

133+
* GITHUB#15140: Optimize TopScoreDocCollector by replacing the binary LongHeap with a ternary heap (TernaryLongHeap) for improved performance. (Ramakrishna Chilaka)
134+
133135
New Features
134136
---------------------
135137
* GITHUB#15015: MultiIndexMergeScheduler: a production multi-tenant merge scheduler (Shawn Yarbrough)

lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
import java.io.IOException;
2020
import org.apache.lucene.index.LeafReaderContext;
21-
import org.apache.lucene.util.LongHeap;
21+
import org.apache.lucene.util.TernaryLongHeap;
2222

2323
/**
2424
* A {@link Collector} implementation that collects the top-scoring hits, returning them as a {@link
@@ -33,15 +33,15 @@
3333
public class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
3434

3535
private final ScoreDoc after;
36-
private final LongHeap heap;
36+
private final TernaryLongHeap heap;
3737
final int totalHitsThreshold;
3838
final MaxScoreAccumulator minScoreAcc;
3939

4040
// prevents instantiation
4141
TopScoreDocCollector(
4242
int numHits, ScoreDoc after, int totalHitsThreshold, MaxScoreAccumulator minScoreAcc) {
4343
super(null);
44-
this.heap = new LongHeap(numHits, DocScoreEncoder.LEAST_COMPETITIVE_CODE);
44+
this.heap = new TernaryLongHeap(numHits, DocScoreEncoder.LEAST_COMPETITIVE_CODE);
4545
this.after = after;
4646
this.totalHitsThreshold = totalHitsThreshold;
4747
this.minScoreAcc = minScoreAcc;

lucene/core/src/java/org/apache/lucene/util/LongHeap.java

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
* time. Put()'s and pop()'s require log(size). This heap provides unbounded growth via {@link
2525
* #push(long)}, and bounded-size insertion based on its nominal maxSize via {@link
2626
* #insertWithOverflow(long)}. The heap is a min heap, meaning that the top element is the lowest
27-
* value of the heap.
27+
* value of the heap. LongHeap implements a binary (2-ary) heap.
2828
*
2929
* @lucene.internal
3030
*/
@@ -216,4 +216,63 @@ public long get(int i) {
216216
long[] getHeapArray() {
217217
return heap;
218218
}
219+
220+
/**
221+
* Restores heap order by moving an element up the heap until it finds its proper position. Works
222+
* with heaps of any arity (number of children per node).
223+
*
224+
* @param heap the heap array (1-based indexing)
225+
* @param i the index of the element to move up
226+
* @param arity the number of children each node can have
227+
*/
228+
static void upHeap(long[] heap, int i, int arity) {
229+
final long value = heap[i]; // save bottom value
230+
while (i > 1) {
231+
// parent formula for 1-based indexing
232+
final int parent = ((i - 2) / arity) + 1;
233+
final long parentVal = heap[parent];
234+
if (value >= parentVal) break;
235+
heap[i] = parentVal; // shift parent down
236+
i = parent;
237+
}
238+
heap[i] = value; // install saved value
239+
}
240+
241+
/**
242+
* Restores heap order by moving an element down the heap until it finds its proper position.
243+
* Works with heaps of any arity (number of children per node).
244+
*
245+
* @param heap the heap array (1-based indexing)
246+
* @param i the index of the element to move down
247+
* @param size the current size of the heap
248+
* @param arity the number of children each node can have
249+
*/
250+
static void downHeap(long[] heap, int i, int size, int arity) {
251+
long value = heap[i]; // save top value
252+
for (; ; ) {
253+
// first child formula for 1-based indexing
254+
int firstChild = arity * (i - 1) + 2;
255+
if (firstChild > size) break; // i is a leaf
256+
257+
int lastChild = Math.min(firstChild + arity - 1, size);
258+
259+
// find the smallest child in [firstChild, lastChild]
260+
int best = firstChild;
261+
long bestVal = heap[firstChild];
262+
263+
for (int c = firstChild + 1; c <= lastChild; c++) {
264+
final long v = heap[c];
265+
if (v < bestVal) {
266+
bestVal = v;
267+
best = c;
268+
}
269+
}
270+
271+
if (bestVal >= value) break;
272+
273+
heap[i] = bestVal;
274+
i = best;
275+
}
276+
heap[i] = value; // install saved value
277+
}
219278
}
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.util;
18+
19+
import java.util.Arrays;
20+
21+
/**
22+
* A ternary min heap that stores longs; a primitive priority queue that like all priority queues
23+
* maintains a partial ordering of its elements such that the least element can always be found in
24+
* constant time. Put()'s and pop()'s require log_3(size). This heap provides unbounded growth via
25+
* {@link #push(long)}, and bounded-size insertion based on its nominal maxSize via {@link
26+
* #insertWithOverflow(long)}. The heap is a min heap, meaning that the top element is the lowest
27+
* value of the heap. TernaryLongHeap implements 3-ary heap.
28+
*
29+
* @lucene.internal
30+
*/
31+
public final class TernaryLongHeap {
32+
33+
private final int maxSize;
34+
35+
private long[] heap;
36+
private int size = 0;
37+
38+
/**
39+
* Constructs a heap with specified size and initializes all elements with the given value.
40+
*
41+
* @param size the number of elements to initialize in the heap.
42+
* @param initialValue the value to fill the heap with.
43+
*/
44+
public TernaryLongHeap(int size, long initialValue) {
45+
this(size <= 0 ? 1 : size);
46+
Arrays.fill(heap, 1, size + 1, initialValue);
47+
this.size = size;
48+
}
49+
50+
/**
51+
* Create an empty priority queue of the configured initial size.
52+
*
53+
* @param maxSize the maximum size of the heap, or if negative, the initial size of an unbounded
54+
* heap
55+
*/
56+
public TernaryLongHeap(int maxSize) {
57+
if (maxSize < 1 || maxSize >= ArrayUtil.MAX_ARRAY_LENGTH) {
58+
// Throw exception to prevent confusing OOME:
59+
throw new IllegalArgumentException(
60+
"maxSize must be > 0 and < " + (ArrayUtil.MAX_ARRAY_LENGTH - 1) + "; got: " + maxSize);
61+
}
62+
// NOTE: we add +1 because all access to heap is 1-based not 0-based. heap[0] is unused.
63+
final int heapSize = maxSize + 1;
64+
this.maxSize = maxSize;
65+
this.heap = new long[heapSize];
66+
}
67+
68+
/**
69+
* Adds a value in log(size) time. Grows unbounded as needed to accommodate new values.
70+
*
71+
* @return the new 'top' element in the queue.
72+
*/
73+
public long push(long element) {
74+
size++;
75+
if (size == heap.length) {
76+
heap = ArrayUtil.grow(heap, (size * 3 + 1) / 2);
77+
}
78+
heap[size] = element;
79+
LongHeap.upHeap(heap, size, 3);
80+
return heap[1];
81+
}
82+
83+
/**
84+
* Adds a value to an TernaryLongHeap in log(size) time. If the number of values would exceed the
85+
* heap's maxSize, the least value is discarded.
86+
*
87+
* @return whether the value was added (unless the heap is full, or the new value is less than the
88+
* top value)
89+
*/
90+
public boolean insertWithOverflow(long value) {
91+
if (size >= maxSize) {
92+
if (value < heap[1]) {
93+
return false;
94+
}
95+
updateTop(value);
96+
return true;
97+
}
98+
push(value);
99+
return true;
100+
}
101+
102+
/**
103+
* Returns the least element of the TernaryLongHeap in constant time. It is up to the caller to
104+
* verify that the heap is not empty; no checking is done, and if no elements have been added, 0
105+
* is returned.
106+
*/
107+
public long top() {
108+
return heap[1];
109+
}
110+
111+
/**
112+
* Removes and returns the least element of the PriorityQueue in log(size) time.
113+
*
114+
* @throws IllegalStateException if the TernaryLongHeap is empty.
115+
*/
116+
public long pop() {
117+
if (size > 0) {
118+
long result = heap[1]; // save first value
119+
heap[1] = heap[size]; // move last to first
120+
size--;
121+
LongHeap.downHeap(heap, 1, size, 3); // adjust heap
122+
return result;
123+
} else {
124+
throw new IllegalStateException("The heap is empty");
125+
}
126+
}
127+
128+
/**
129+
* Replace the top of the pq with {@code newTop}. Should be called when the top value changes.
130+
* Still log(n) worst case, but it's at least twice as fast to
131+
*
132+
* <pre class="prettyprint">
133+
* pq.updateTop(value);
134+
* </pre>
135+
*
136+
* instead of
137+
*
138+
* <pre class="prettyprint">
139+
* pq.pop();
140+
* pq.push(value);
141+
* </pre>
142+
*
143+
* Calling this method on an empty TernaryLongHeap has no visible effect.
144+
*
145+
* @param value the new element that is less than the current top.
146+
* @return the new 'top' element after shuffling the heap.
147+
*/
148+
public long updateTop(long value) {
149+
heap[1] = value;
150+
LongHeap.downHeap(heap, 1, size, 3);
151+
return heap[1];
152+
}
153+
154+
/** Returns the number of elements currently stored in the PriorityQueue. */
155+
public int size() {
156+
return size;
157+
}
158+
159+
/** Removes all entries from the PriorityQueue. */
160+
public void clear() {
161+
size = 0;
162+
}
163+
164+
public void pushAll(TernaryLongHeap other) {
165+
for (int i = 1; i <= other.size; i++) {
166+
push(other.heap[i]);
167+
}
168+
}
169+
170+
/**
171+
* Return the element at the ith location in the heap array. Use for iterating over elements when
172+
* the order doesn't matter. Note that the valid arguments range from [1, size].
173+
*/
174+
public long get(int i) {
175+
return heap[i];
176+
}
177+
178+
/**
179+
* This method returns the internal heap array.
180+
*
181+
* @lucene.internal
182+
*/
183+
// pkg-private for testing
184+
long[] getHeapArray() {
185+
return heap;
186+
}
187+
}

0 commit comments

Comments
 (0)