diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 3d1093af666b..ab5db1a6006e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -261,6 +261,11 @@ Improvements * LUCENE-9662: Make CheckIndex concurrent by parallelizing index check across segments. (Zach Chen, Mike McCandless, Dawid Weiss, Robert Muir) +* LUCENE-9476: Add new getBulkPath API to DirectoryTaxonomyReader to more efficiently retrieve FacetLabels for multiple + facet ordinals at once. This API is 2-4% faster than iteratively calling getPath. + The getPath API now throws an IAE instead of returning null if the ordinal is out of bounds. + (Gautam Worah, Mike McCandless) + Bug fixes * LUCENE-9686: Fix read past EOF handling in DirectIODirectory. (Zach Chen, diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java index cde483d87f45..ec7f307b99c8 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java @@ -146,10 +146,18 @@ public FacetResult getTopChildren(int topN, String dim, String... 
path) throws I } LabelAndValue[] labelValues = new LabelAndValue[q.size()]; + int[] ordinals = new int[labelValues.length]; + float[] values = new float[labelValues.length]; + for (int i = labelValues.length - 1; i >= 0; i--) { TopOrdAndFloatQueue.OrdAndValue ordAndValue = q.pop(); - FacetLabel child = taxoReader.getPath(ordAndValue.ord); - labelValues[i] = new LabelAndValue(child.components[cp.length], ordAndValue.value); + ordinals[i] = ordAndValue.ord; + values[i] = ordAndValue.value; + } + + FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals); + for (int i = 0; i < labelValues.length; i++) { + labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], values[i]); } return new FacetResult(dim, path, sumValues, labelValues, childCount); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java index fc7124f686d7..3f1dc17dece7 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java @@ -237,10 +237,18 @@ public FacetResult getTopChildren(int topN, String dim, String... 
path) throws I } LabelAndValue[] labelValues = new LabelAndValue[q.size()]; + int[] ordinals = new int[labelValues.length]; + int[] values = new int[labelValues.length]; + for (int i = labelValues.length - 1; i >= 0; i--) { TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); - FacetLabel child = taxoReader.getPath(ordAndValue.ord); - labelValues[i] = new LabelAndValue(child.components[cp.length], ordAndValue.value); + ordinals[i] = ordAndValue.ord; + values[i] = ordAndValue.value; + } + + FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals); + for (int i = 0; i < labelValues.length; i++) { + labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], values[i]); } return new FacetResult(dim, path, totValue, labelValues, childCount); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyReader.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyReader.java index 4a64a696e032..50c23e72019b 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyReader.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyReader.java @@ -212,6 +212,21 @@ public int getOrdinal(String dim, String... path) throws IOException { /** Returns the path name of the category with the given ordinal. */ public abstract FacetLabel getPath(int ordinal) throws IOException; + /** + * Returns the path names of the list of ordinals associated with different categories. + * + *
The implementation in {@link + * org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader} is generally faster than + * the default implementation which iteratively calls {@link #getPath(int)} + */ + public FacetLabel[] getBulkPath(int... ordinals) throws IOException { + FacetLabel[] facetLabels = new FacetLabel[ordinals.length]; + for (int i = 0; i < ordinals.length; i++) { + facetLabels[i] = getPath(ordinals[i]); + } + return facetLabels; + } + /** Returns the current refCount for this taxonomy reader. */ public final int getRefCount() { return refCount.get(); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java index ea38d8c24d7e..66836536ffa0 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java @@ -18,10 +18,12 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.function.IntUnaryOperator; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.lucene.document.Document; @@ -35,6 +37,7 @@ import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.ReaderUtil; @@ -44,6 +47,7 @@ import org.apache.lucene.util.Accountables; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.InPlaceMergeSorter; import org.apache.lucene.util.RamUsageEstimator; /** @@ -318,23 +322,16 @@ public 
FacetLabel getPath(int ordinal) throws IOException { // doOpenIfChanged, we need to ensure that the ordinal is one that this DTR // instance recognizes. Therefore we do this check up front, before we hit // the cache. - if (ordinal < 0 || ordinal >= indexReader.maxDoc()) { - return null; - } + checkOrdinalBounds(ordinal); - // TODO: can we use an int-based hash impl, such as IntToObjectMap, - // wrapped as LRU? - Integer catIDInteger = Integer.valueOf(ordinal); - synchronized (categoryCache) { - FacetLabel res = categoryCache.get(catIDInteger); - if (res != null) { - return res; - } + FacetLabel[] ordinalPath = getPathFromCache(ordinal); + + if (ordinalPath[0] != null) { + return ordinalPath[0]; } int readerIndex = ReaderUtil.subIndex(ordinal, indexReader.leaves()); LeafReader leafReader = indexReader.leaves().get(readerIndex).reader(); - // TODO: Use LUCENE-9476 to get the bulk lookup API for extracting BinaryDocValues BinaryDocValues values = leafReader.getBinaryDocValues(Consts.FULL); FacetLabel ret; @@ -351,12 +348,142 @@ public FacetLabel getPath(int ordinal) throws IOException { } synchronized (categoryCache) { - categoryCache.put(catIDInteger, ret); + categoryCache.put(ordinal, ret); } return ret; } + private FacetLabel[] getPathFromCache(int... ordinals) { + FacetLabel[] facetLabels = new FacetLabel[ordinals.length]; + // TODO LUCENE-10068: can we use an int-based hash impl, such as IntToObjectMap, + // wrapped as LRU? + synchronized (categoryCache) { + for (int i = 0; i < ordinals.length; i++) { + facetLabels[i] = categoryCache.get(ordinals[i]); + } + } + return facetLabels; + } + + /** + * Checks if the ordinals in the array are >=0 and < {@code + * DirectoryTaxonomyReader#indexReader.maxDoc()} + * + * @param ordinals Integer array of ordinals + * @throws IllegalArgumentException Throw an IllegalArgumentException if one of the ordinals is + * out of bounds + */ + private void checkOrdinalBounds(int... 
ordinals) throws IllegalArgumentException { + for (int ordinal : ordinals) { + if (ordinal < 0 || ordinal >= indexReader.maxDoc()) { + throw new IllegalArgumentException( + "ordinal " + + ordinal + + " is out of the range of the indexReader " + + indexReader.toString() + + ". The maximum possible ordinal number is " + + (indexReader.maxDoc() - 1)); + } + } + } + + /** + * Returns an array of FacetLabels for a given array of ordinals. + * + *
This API is generally faster than iteratively calling {@link #getPath(int)} over an array of
+ * ordinals. It uses the {@link #getPath(int)} method iteratively when it detects that the index
+ * was created using StoredFields (with no performance gains) and uses DocValues based iteration
+ * when the index is based on BinaryDocValues. Lucene switched to BinaryDocValues in version 9.0.
+ *
+ * @param ordinals Array of ordinals that are assigned to categories inserted into the taxonomy
+ * index
+ */
+ @Override
+ public FacetLabel[] getBulkPath(int... ordinals) throws IOException {
+ ensureOpen();
+ checkOrdinalBounds(ordinals);
+
+ int ordinalsLength = ordinals.length;
+ FacetLabel[] bulkPath = new FacetLabel[ordinalsLength];
+ // remember the original positions of ordinals before they are sorted
+ int[] originalPosition = new int[ordinalsLength];
+ Arrays.setAll(originalPosition, IntUnaryOperator.identity());
+
+ getPathFromCache(ordinals);
+
+ /* parallel sort the ordinals and originalPosition array based on the values in the ordinals array */
+ new InPlaceMergeSorter() {
+   @Override
+   public int compare(int i, int j) {
+     // slots are ordered purely by ordinal value
+     final int left = ordinals[i];
+     final int right = ordinals[j];
+     return Integer.compare(left, right);
+   }
+
+   @Override
+   protected void swap(int i, int j) {
+     // exchange both arrays in lockstep so originalPosition[k] keeps describing ordinals[k]
+     final int ordinalAtI = ordinals[i];
+     final int positionAtI = originalPosition[i];
+     ordinals[i] = ordinals[j];
+     originalPosition[i] = originalPosition[j];
+     ordinals[j] = ordinalAtI;
+     originalPosition[j] = positionAtI;
+   }
+ }.sort(0, ordinalsLength);
+
+ int readerIndex;
+ int leafReaderMaxDoc = 0;
+ int leafReaderDocBase = 0;
+ LeafReader leafReader;
+ LeafReaderContext leafReaderContext;
+ BinaryDocValues values = null;
+ List