Skip to content

Commit ff1ea3e

Browse files
Optimize TopScoreDocCollector with TernaryLongHeap
1 parent f0d3bbf commit ff1ea3e

File tree

5 files changed

+422
-5
lines changed

5 files changed

+422
-5
lines changed

lucene/CHANGES.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ Improvements
122122

123123
Optimizations
124124
---------------------
125-
(No changes)
125+
* GITHUB#15140: Optimize TopScoreDocCollector by replacing the binary LongHeap with a TernaryLongHeap for improved performance. (Ramakrishna Chilaka)
126126

127127
Bug Fixes
128128
---------------------

lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
import java.io.IOException;
2020
import org.apache.lucene.index.LeafReaderContext;
21-
import org.apache.lucene.util.LongHeap;
21+
import org.apache.lucene.util.TernaryLongHeap;
2222

2323
/**
2424
* A {@link Collector} implementation that collects the top-scoring hits, returning them as a {@link
@@ -33,15 +33,15 @@
3333
public class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
3434

3535
private final ScoreDoc after;
36-
private final LongHeap heap;
36+
private final TernaryLongHeap heap;
3737
final int totalHitsThreshold;
3838
final MaxScoreAccumulator minScoreAcc;
3939

4040
// prevents instantiation
4141
TopScoreDocCollector(
4242
int numHits, ScoreDoc after, int totalHitsThreshold, MaxScoreAccumulator minScoreAcc) {
4343
super(null);
44-
this.heap = new LongHeap(numHits, DocScoreEncoder.LEAST_COMPETITIVE_CODE);
44+
this.heap = new TernaryLongHeap(numHits, DocScoreEncoder.LEAST_COMPETITIVE_CODE);
4545
this.after = after;
4646
this.totalHitsThreshold = totalHitsThreshold;
4747
this.minScoreAcc = minScoreAcc;

lucene/core/src/java/org/apache/lucene/util/LongHeap.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
* time. Put()'s and pop()'s require log(size). This heap provides unbounded growth via {@link
2525
* #push(long)}, and bounded-size insertion based on its nominal maxSize via {@link
2626
* #insertWithOverflow(long)}. The heap is a min heap, meaning that the top element is the lowest
27-
* value of the heap.
27+
* value of the heap. LongHeap implements a 2-ary (binary) heap.
2828
*
2929
* @lucene.internal
3030
*/
Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.util;
18+
19+
import java.util.Arrays;
20+
21+
/**
22+
* A ternary min heap that stores longs; a primitive priority queue that like all priority queues
23+
* maintains a partial ordering of its elements such that the least element can always be found in
24+
* constant time. Put()'s and pop()'s require log_3(size). This heap provides unbounded growth via
25+
* {@link #push(long)}, and bounded-size insertion based on its nominal maxSize via {@link
26+
* #insertWithOverflow(long)}. The heap is a min heap, meaning that the top element is the lowest
27+
* value of the heap. TernaryLongHeap implements 3-ary heap.
28+
*
29+
* @lucene.internal
30+
*/
31+
public final class TernaryLongHeap {
32+
33+
private final int maxSize;
34+
35+
private long[] heap;
36+
private int size = 0;
37+
private static final int ARITY = 3;
38+
39+
/**
40+
* Constructs a heap with specified size and initializes all elements with the given value.
41+
*
42+
* @param size the number of elements to initialize in the heap.
43+
* @param initialValue the value to fill the heap with.
44+
*/
45+
public TernaryLongHeap(int size, long initialValue) {
46+
this(size <= 0 ? 1 : size);
47+
Arrays.fill(heap, 1, size + 1, initialValue);
48+
this.size = size;
49+
}
50+
51+
/**
52+
* Create an empty priority queue of the configured initial size.
53+
*
54+
* @param maxSize the maximum size of the heap, or if negative, the initial size of an unbounded
55+
* heap
56+
*/
57+
public TernaryLongHeap(int maxSize) {
58+
if (maxSize < 1 || maxSize >= ArrayUtil.MAX_ARRAY_LENGTH) {
59+
// Throw exception to prevent confusing OOME:
60+
throw new IllegalArgumentException(
61+
"maxSize must be > 0 and < " + (ArrayUtil.MAX_ARRAY_LENGTH - 1) + "; got: " + maxSize);
62+
}
63+
// NOTE: we add +1 because all access to heap is 1-based not 0-based. heap[0] is unused.
64+
final int heapSize = maxSize + 1;
65+
this.maxSize = maxSize;
66+
this.heap = new long[heapSize];
67+
}
68+
69+
/**
70+
* Adds a value in log(size) time. Grows unbounded as needed to accommodate new values.
71+
*
72+
* @return the new 'top' element in the queue.
73+
*/
74+
public long push(long element) {
75+
size++;
76+
if (size == heap.length) {
77+
heap = ArrayUtil.grow(heap, (size * 3 + 1) / 2);
78+
}
79+
heap[size] = element;
80+
TernaryLongHeap.upHeap(heap, size, ARITY);
81+
return heap[1];
82+
}
83+
84+
/**
85+
* Adds a value to an TernaryLongHeap in log(size) time. If the number of values would exceed the
86+
* heap's maxSize, the least value is discarded.
87+
*
88+
* @return whether the value was added (unless the heap is full, or the new value is less than the
89+
* top value)
90+
*/
91+
public boolean insertWithOverflow(long value) {
92+
if (size >= maxSize) {
93+
if (value < heap[1]) {
94+
return false;
95+
}
96+
updateTop(value);
97+
return true;
98+
}
99+
push(value);
100+
return true;
101+
}
102+
103+
/**
104+
* Returns the least element of the TernaryLongHeap in constant time. It is up to the caller to
105+
* verify that the heap is not empty; no checking is done, and if no elements have been added, 0
106+
* is returned.
107+
*/
108+
public long top() {
109+
return heap[1];
110+
}
111+
112+
/**
113+
* Removes and returns the least element of the PriorityQueue in log(size) time.
114+
*
115+
* @throws IllegalStateException if the TernaryLongHeap is empty.
116+
*/
117+
public long pop() {
118+
if (size > 0) {
119+
long result = heap[1]; // save first value
120+
heap[1] = heap[size]; // move last to first
121+
size--;
122+
TernaryLongHeap.downHeap(heap, 1, size, ARITY); // adjust heap
123+
return result;
124+
} else {
125+
throw new IllegalStateException("The heap is empty");
126+
}
127+
}
128+
129+
/**
130+
* Replace the top of the pq with {@code newTop}. Should be called when the top value changes.
131+
* Still log(n) worst case, but it's at least twice as fast to
132+
*
133+
* <pre class="prettyprint">
134+
* pq.updateTop(value);
135+
* </pre>
136+
*
137+
* <p>instead of
138+
*
139+
* <pre class="prettyprint">
140+
* pq.pop();
141+
* pq.push(value);
142+
* </pre>
143+
*
144+
* <p>Calling this method on an empty TernaryLongHeap has no visible effect.
145+
*
146+
* @param value the new element that is less than the current top.
147+
* @return the new 'top' element after shuffling the heap.
148+
*/
149+
public long updateTop(long value) {
150+
heap[1] = value;
151+
TernaryLongHeap.downHeap(heap, 1, size, ARITY);
152+
return heap[1];
153+
}
154+
155+
/** Returns the number of elements currently stored in the PriorityQueue. */
156+
public int size() {
157+
return size;
158+
}
159+
160+
/** Removes all entries from the PriorityQueue. */
161+
public void clear() {
162+
size = 0;
163+
}
164+
165+
public void pushAll(TernaryLongHeap other) {
166+
for (int i = 1; i <= other.size; i++) {
167+
push(other.heap[i]);
168+
}
169+
}
170+
171+
/**
172+
* Return the element at the ith location in the heap array. Use for iterating over elements when
173+
* the order doesn't matter. Note that the valid arguments range from [1, size].
174+
*/
175+
public long get(int i) {
176+
return heap[i];
177+
}
178+
179+
/**
180+
* This method returns the internal heap array.
181+
*
182+
* @lucene.internal
183+
*/
184+
// pkg-private for testing
185+
long[] getHeapArray() {
186+
return heap;
187+
}
188+
189+
/**
190+
* Restores heap order by moving an element up the heap until it finds its proper position. Works
191+
* with heaps of any arity (number of children per node).
192+
*
193+
* @param heap the heap array (1-based indexing)
194+
* @param i the index of the element to move up
195+
* @param arity the number of children each node can have
196+
*/
197+
static void upHeap(long[] heap, int i, int arity) {
198+
final long value = heap[i]; // save bottom value
199+
while (i > 1) {
200+
// parent formula for 1-based indexing
201+
final int parent = ((i - 2) / arity) + 1;
202+
final long parentVal = heap[parent];
203+
if (value >= parentVal) break;
204+
heap[i] = parentVal; // shift parent down
205+
i = parent;
206+
}
207+
heap[i] = value; // install saved value
208+
}
209+
210+
/**
211+
* Restores heap order by moving an element down the heap until it finds its proper position.
212+
* Works with heaps of any arity (number of children per node).
213+
*
214+
* @param heap the heap array (1-based indexing)
215+
* @param i the index of the element to move down
216+
* @param size the current size of the heap
217+
* @param arity the number of children each node can have
218+
*/
219+
static void downHeap(long[] heap, int i, int size, int arity) {
220+
long value = heap[i]; // save top value
221+
for (; ; ) {
222+
// first child formula for 1-based indexing
223+
int firstChild = arity * (i - 1) + 2;
224+
if (firstChild > size) break; // i is a leaf
225+
226+
int lastChild = Math.min(firstChild + arity - 1, size);
227+
228+
// find the smallest child in [firstChild, lastChild]
229+
int best = firstChild;
230+
long bestVal = heap[firstChild];
231+
232+
for (int c = firstChild + 1; c <= lastChild; c++) {
233+
final long v = heap[c];
234+
if (v < bestVal) {
235+
bestVal = v;
236+
best = c;
237+
}
238+
}
239+
240+
if (bestVal >= value) break;
241+
242+
heap[i] = bestVal;
243+
i = best;
244+
}
245+
heap[i] = value; // install saved value
246+
}
247+
}

0 commit comments

Comments
 (0)