Skip to content

Commit 1036c70

Browse files
gautamworah96Gautam Worah
andauthored
LUCENE-9476: Add getBulkPath API to DirectoryTaxonomyReader for faster ordinal -> FacetLabel lookup (#179)
Co-authored-by: Gautam Worah <[email protected]>
1 parent 3423243 commit 1036c70

File tree

9 files changed

+283
-22
lines changed

9 files changed

+283
-22
lines changed

lucene/CHANGES.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,11 @@ Improvements
261261
* LUCENE-9662: Make CheckIndex concurrent by parallelizing index check across segments.
262262
(Zach Chen, Mike McCandless, Dawid Weiss, Robert Muir)
263263

264+
* LUCENE-9476: Add new getBulkPath API to DirectoryTaxonomyReader to more efficiently retrieve FacetLabels for multiple
265+
facet ordinals at once. This API is 2-4% faster than iteratively calling getPath.
266+
The getPath API now throws an IAE instead of returning null if the ordinal is out of bounds.
267+
(Gautam Worah, Mike McCandless)
268+
264269
Bug fixes
265270

266271
* LUCENE-9686: Fix read past EOF handling in DirectIODirectory. (Zach Chen,

lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -146,10 +146,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
146146
}
147147

148148
LabelAndValue[] labelValues = new LabelAndValue[q.size()];
149+
int[] ordinals = new int[labelValues.length];
150+
float[] values = new float[labelValues.length];
151+
149152
for (int i = labelValues.length - 1; i >= 0; i--) {
150153
TopOrdAndFloatQueue.OrdAndValue ordAndValue = q.pop();
151-
FacetLabel child = taxoReader.getPath(ordAndValue.ord);
152-
labelValues[i] = new LabelAndValue(child.components[cp.length], ordAndValue.value);
154+
ordinals[i] = ordAndValue.ord;
155+
values[i] = ordAndValue.value;
156+
}
157+
158+
FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals);
159+
for (int i = 0; i < labelValues.length; i++) {
160+
labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], values[i]);
153161
}
154162

155163
return new FacetResult(dim, path, sumValues, labelValues, childCount);

lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -237,10 +237,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
237237
}
238238

239239
LabelAndValue[] labelValues = new LabelAndValue[q.size()];
240+
int[] ordinals = new int[labelValues.length];
241+
int[] values = new int[labelValues.length];
242+
240243
for (int i = labelValues.length - 1; i >= 0; i--) {
241244
TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
242-
FacetLabel child = taxoReader.getPath(ordAndValue.ord);
243-
labelValues[i] = new LabelAndValue(child.components[cp.length], ordAndValue.value);
245+
ordinals[i] = ordAndValue.ord;
246+
values[i] = ordAndValue.value;
247+
}
248+
249+
FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals);
250+
for (int i = 0; i < labelValues.length; i++) {
251+
labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], values[i]);
244252
}
245253

246254
return new FacetResult(dim, path, totValue, labelValues, childCount);

lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyReader.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,21 @@ public int getOrdinal(String dim, String... path) throws IOException {
212212
/** Returns the path name of the category with the given ordinal. */
213213
public abstract FacetLabel getPath(int ordinal) throws IOException;
214214

215+
/**
216+
* Returns the path names of the list of ordinals associated with different categories.
217+
*
218+
* <p>The implementation in {@link
219+
* org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader} is generally faster than
220+
* the default implementation which iteratively calls {@link #getPath(int)}
221+
*/
222+
public FacetLabel[] getBulkPath(int... ordinals) throws IOException {
223+
FacetLabel[] facetLabels = new FacetLabel[ordinals.length];
224+
for (int i = 0; i < ordinals.length; i++) {
225+
facetLabels[i] = getPath(ordinals[i]);
226+
}
227+
return facetLabels;
228+
}
229+
215230
/** Returns the current refCount for this taxonomy reader. */
216231
public final int getRefCount() {
217232
return refCount.get();

lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java

Lines changed: 140 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@
1818

1919
import java.io.IOException;
2020
import java.util.ArrayList;
21+
import java.util.Arrays;
2122
import java.util.Collection;
2223
import java.util.Collections;
2324
import java.util.List;
2425
import java.util.Map;
26+
import java.util.function.IntUnaryOperator;
2527
import java.util.logging.Level;
2628
import java.util.logging.Logger;
2729
import org.apache.lucene.document.Document;
@@ -35,6 +37,7 @@
3537
import org.apache.lucene.index.DirectoryReader;
3638
import org.apache.lucene.index.IndexWriter;
3739
import org.apache.lucene.index.LeafReader;
40+
import org.apache.lucene.index.LeafReaderContext;
3841
import org.apache.lucene.index.MultiTerms;
3942
import org.apache.lucene.index.PostingsEnum;
4043
import org.apache.lucene.index.ReaderUtil;
@@ -44,6 +47,7 @@
4447
import org.apache.lucene.util.Accountables;
4548
import org.apache.lucene.util.BytesRef;
4649
import org.apache.lucene.util.IOUtils;
50+
import org.apache.lucene.util.InPlaceMergeSorter;
4751
import org.apache.lucene.util.RamUsageEstimator;
4852

4953
/**
@@ -318,23 +322,16 @@ public FacetLabel getPath(int ordinal) throws IOException {
318322
// doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
319323
// instance recognizes. Therefore we do this check up front, before we hit
320324
// the cache.
321-
if (ordinal < 0 || ordinal >= indexReader.maxDoc()) {
322-
return null;
323-
}
325+
checkOrdinalBounds(ordinal);
324326

325-
// TODO: can we use an int-based hash impl, such as IntToObjectMap,
326-
// wrapped as LRU?
327-
Integer catIDInteger = Integer.valueOf(ordinal);
328-
synchronized (categoryCache) {
329-
FacetLabel res = categoryCache.get(catIDInteger);
330-
if (res != null) {
331-
return res;
332-
}
327+
FacetLabel[] ordinalPath = getPathFromCache(ordinal);
328+
329+
if (ordinalPath[0] != null) {
330+
return ordinalPath[0];
333331
}
334332

335333
int readerIndex = ReaderUtil.subIndex(ordinal, indexReader.leaves());
336334
LeafReader leafReader = indexReader.leaves().get(readerIndex).reader();
337-
// TODO: Use LUCENE-9476 to get the bulk lookup API for extracting BinaryDocValues
338335
BinaryDocValues values = leafReader.getBinaryDocValues(Consts.FULL);
339336

340337
FacetLabel ret;
@@ -351,12 +348,142 @@ public FacetLabel getPath(int ordinal) throws IOException {
351348
}
352349

353350
synchronized (categoryCache) {
354-
categoryCache.put(catIDInteger, ret);
351+
categoryCache.put(ordinal, ret);
355352
}
356353

357354
return ret;
358355
}
359356

357+
private FacetLabel[] getPathFromCache(int... ordinals) {
358+
FacetLabel[] facetLabels = new FacetLabel[ordinals.length];
359+
// TODO LUCENE-10068: can we use an int-based hash impl, such as IntToObjectMap,
360+
// wrapped as LRU?
361+
synchronized (categoryCache) {
362+
for (int i = 0; i < ordinals.length; i++) {
363+
facetLabels[i] = categoryCache.get(ordinals[i]);
364+
}
365+
}
366+
return facetLabels;
367+
}
368+
369+
/**
370+
* Checks if the ordinals in the array are >=0 and < {@code
371+
* DirectoryTaxonomyReader#indexReader.maxDoc()}
372+
*
373+
* @param ordinals Integer array of ordinals
374+
* @throws IllegalArgumentException Throw an IllegalArgumentException if one of the ordinals is
375+
* out of bounds
376+
*/
377+
private void checkOrdinalBounds(int... ordinals) throws IllegalArgumentException {
378+
for (int ordinal : ordinals) {
379+
if (ordinal < 0 || ordinal >= indexReader.maxDoc()) {
380+
throw new IllegalArgumentException(
381+
"ordinal "
382+
+ ordinal
383+
+ " is out of the range of the indexReader "
384+
+ indexReader.toString()
385+
+ ". The maximum possible ordinal number is "
386+
+ (indexReader.maxDoc() - 1));
387+
}
388+
}
389+
}
390+
391+
/**
392+
* Returns an array of FacetLabels for a given array of ordinals.
393+
*
394+
* <p>This API is generally faster than iteratively calling {@link #getPath(int)} over an array of
395+
* ordinals. It uses the {@link #getPath(int)} method iteratively when it detects that the index
396+
* was created using StoredFields (with no performance gains) and uses DocValues based iteration
397+
* when the index is based on BinaryDocValues. Lucene switched to BinaryDocValues in version 9.0
398+
*
399+
* @param ordinals Array of ordinals that are assigned to categories inserted into the taxonomy
400+
* index
401+
*/
402+
@Override
403+
public FacetLabel[] getBulkPath(int... ordinals) throws IOException {
404+
ensureOpen();
405+
checkOrdinalBounds(ordinals);
406+
407+
int ordinalsLength = ordinals.length;
408+
FacetLabel[] bulkPath = new FacetLabel[ordinalsLength];
409+
// remember the original positions of ordinals before they are sorted
410+
int[] originalPosition = new int[ordinalsLength];
411+
Arrays.setAll(originalPosition, IntUnaryOperator.identity());
412+
413+
getPathFromCache(ordinals);
414+
415+
/* parallel sort the ordinals and originalPosition array based on the values in the ordinals array */
416+
new InPlaceMergeSorter() {
417+
@Override
418+
protected void swap(int i, int j) {
419+
int x = ordinals[i];
420+
ordinals[i] = ordinals[j];
421+
ordinals[j] = x;
422+
423+
x = originalPosition[i];
424+
originalPosition[i] = originalPosition[j];
425+
originalPosition[j] = x;
426+
}
427+
428+
@Override
429+
public int compare(int i, int j) {
430+
return Integer.compare(ordinals[i], ordinals[j]);
431+
}
432+
}.sort(0, ordinalsLength);
433+
434+
int readerIndex;
435+
int leafReaderMaxDoc = 0;
436+
int leafReaderDocBase = 0;
437+
LeafReader leafReader;
438+
LeafReaderContext leafReaderContext;
439+
BinaryDocValues values = null;
440+
List<Integer> uncachedOrdinalPositions = new ArrayList<>();
441+
442+
for (int i = 0; i < ordinalsLength; i++) {
443+
if (bulkPath[originalPosition[i]] == null) {
444+
/*
445+
If ordinals[i] >= leafReaderDocBase + leafReaderMaxDoc then we find the next leaf that contains our ordinal.
446+
Remember: ordinals[i] operates in the global ordinal space and hence we add leafReaderDocBase to the leafReaderMaxDoc
447+
(which is the maxDoc of the specific leaf)
448+
*/
449+
if (values == null || ordinals[i] >= leafReaderDocBase + leafReaderMaxDoc) {
450+
451+
readerIndex = ReaderUtil.subIndex(ordinals[i], indexReader.leaves());
452+
leafReaderContext = indexReader.leaves().get(readerIndex);
453+
leafReader = leafReaderContext.reader();
454+
leafReaderMaxDoc = leafReader.maxDoc();
455+
leafReaderDocBase = leafReaderContext.docBase;
456+
values = leafReader.getBinaryDocValues(Consts.FULL);
457+
458+
/*
459+
If the index is constructed with the older StoredFields it will not have any BinaryDocValues field and will return null
460+
*/
461+
if (values == null) {
462+
return super.getBulkPath(ordinals);
463+
}
464+
}
465+
// values is leaf specific so you only advance till you reach the target within the leaf
466+
boolean success = values.advanceExact(ordinals[i] - leafReaderDocBase);
467+
assert success;
468+
bulkPath[originalPosition[i]] =
469+
new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString()));
470+
471+
uncachedOrdinalPositions.add(i);
472+
}
473+
}
474+
475+
if (uncachedOrdinalPositions.isEmpty() == false) {
476+
synchronized (categoryCache) {
477+
for (int i : uncachedOrdinalPositions) {
478+
// add the value to the categoryCache after computation
479+
categoryCache.put(ordinals[i], bulkPath[originalPosition[i]]);
480+
}
481+
}
482+
}
483+
484+
return bulkPath;
485+
}
486+
360487
@Override
361488
public int getSize() {
362489
ensureOpen();

lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyCombined.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -366,9 +366,9 @@ public void testReaderBasic() throws Exception {
366366
}
367367
}
368368
// (also test invalid ordinals:)
369-
assertNull(tr.getPath(-1));
370-
assertNull(tr.getPath(tr.getSize()));
371-
assertNull(tr.getPath(TaxonomyReader.INVALID_ORDINAL));
369+
expectThrows(IllegalArgumentException.class, () -> tr.getPath(-1));
370+
expectThrows(IllegalArgumentException.class, () -> tr.getPath(tr.getSize()));
371+
expectThrows(IllegalArgumentException.class, () -> tr.getPath(TaxonomyReader.INVALID_ORDINAL));
372372

373373
// test TaxonomyReader.getOrdinal():
374374
for (int i = 1; i < expectedCategories.length; i++) {

lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,28 @@ private void createNewTaxonomyIndex(String dirName) throws IOException {
8989
dir.close();
9090
}
9191

92+
// Opens up a pre-existing index and tries to run getBulkPath on it
93+
public void testGetBulkPathOnOlderCodec() throws Exception {
94+
Path indexDir = createTempDir(oldTaxonomyIndexName);
95+
TestUtil.unzip(getDataInputStream(oldTaxonomyIndexName + ".zip"), indexDir);
96+
Directory dir = newFSDirectory(indexDir);
97+
98+
DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
99+
DirectoryTaxonomyReader reader = new DirectoryTaxonomyReader(writer);
100+
101+
FacetLabel[] facetLabels = {
102+
new FacetLabel("a"), new FacetLabel("b"),
103+
};
104+
int[] ordinals =
105+
new int[] {reader.getOrdinal(facetLabels[0]), reader.getOrdinal(facetLabels[1])};
106+
// The zip file already contains a category "a" and a category "b" stored with the older
107+
// StoredFields codec
108+
assertArrayEquals(facetLabels, reader.getBulkPath(ordinals));
109+
reader.close();
110+
writer.close();
111+
dir.close();
112+
}
113+
92114
// Used to create a fresh taxonomy index with StoredFields
93115
@Ignore
94116
public void testCreateOldTaxonomy() throws IOException {

0 commit comments

Comments
 (0)