diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 3d1093af666b..ab5db1a6006e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -261,6 +261,11 @@ Improvements * LUCENE-9662: Make CheckIndex concurrent by parallelizing index check across segments. (Zach Chen, Mike McCandless, Dawid Weiss, Robert Muir) +* LUCENE-9476: Add new getBulkPath API to DirectoryTaxonomyReader to more efficiently retrieve FacetLabels for multiple + facet ordinals at once. This API is 2-4% faster than iteratively calling getPath. + The getPath API now throws an IAE instead of returning null if the ordinal is out of bounds. + (Gautam Worah, Mike McCandless) + Bug fixes * LUCENE-9686: Fix read past EOF handling in DirectIODirectory. (Zach Chen, diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java index cde483d87f45..ec7f307b99c8 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java @@ -146,10 +146,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I } LabelAndValue[] labelValues = new LabelAndValue[q.size()]; + int[] ordinals = new int[labelValues.length]; + float[] values = new float[labelValues.length]; + for (int i = labelValues.length - 1; i >= 0; i--) { TopOrdAndFloatQueue.OrdAndValue ordAndValue = q.pop(); - FacetLabel child = taxoReader.getPath(ordAndValue.ord); - labelValues[i] = new LabelAndValue(child.components[cp.length], ordAndValue.value); + ordinals[i] = ordAndValue.ord; + values[i] = ordAndValue.value; + } + + FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals); + for (int i = 0; i < labelValues.length; i++) { + labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], values[i]); } return new FacetResult(dim, path, sumValues, labelValues, childCount); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java index fc7124f686d7..3f1dc17dece7 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java @@ -237,10 +237,18 @@ public FacetResult getTopChildren(int topN, String dim, String... 
path) throws I } LabelAndValue[] labelValues = new LabelAndValue[q.size()]; + int[] ordinals = new int[labelValues.length]; + int[] values = new int[labelValues.length]; + for (int i = labelValues.length - 1; i >= 0; i--) { TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); - FacetLabel child = taxoReader.getPath(ordAndValue.ord); - labelValues[i] = new LabelAndValue(child.components[cp.length], ordAndValue.value); + ordinals[i] = ordAndValue.ord; + values[i] = ordAndValue.value; + } + + FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals); + for (int i = 0; i < labelValues.length; i++) { + labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], values[i]); } return new FacetResult(dim, path, totValue, labelValues, childCount); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyReader.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyReader.java index 4a64a696e032..50c23e72019b 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyReader.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyReader.java @@ -212,6 +212,21 @@ public int getOrdinal(String dim, String... path) throws IOException { /** Returns the path name of the category with the given ordinal. */ public abstract FacetLabel getPath(int ordinal) throws IOException; + /** + * Returns the path names of the list of ordinals associated with different categories. + * + *
<p>
The implementation in {@link + * org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader} is generally faster than + * the default implementation which iteratively calls {@link #getPath(int)} + */ + public FacetLabel[] getBulkPath(int... ordinals) throws IOException { + FacetLabel[] facetLabels = new FacetLabel[ordinals.length]; + for (int i = 0; i < ordinals.length; i++) { + facetLabels[i] = getPath(ordinals[i]); + } + return facetLabels; + } + /** Returns the current refCount for this taxonomy reader. */ public final int getRefCount() { return refCount.get(); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java index ea38d8c24d7e..66836536ffa0 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java @@ -18,10 +18,12 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.function.IntUnaryOperator; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.lucene.document.Document; @@ -35,6 +37,7 @@ import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.ReaderUtil; @@ -44,6 +47,7 @@ import org.apache.lucene.util.Accountables; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.InPlaceMergeSorter; import org.apache.lucene.util.RamUsageEstimator; /** @@ -318,23 +322,16 @@ public FacetLabel getPath(int ordinal) throws IOException { // doOpenIfChanged, we need to ensure that the ordinal is one that this DTR // instance recognizes. Therefore we do this check up front, before we hit // the cache. - if (ordinal < 0 || ordinal >= indexReader.maxDoc()) { - return null; - } + checkOrdinalBounds(ordinal); - // TODO: can we use an int-based hash impl, such as IntToObjectMap, - // wrapped as LRU? - Integer catIDInteger = Integer.valueOf(ordinal); - synchronized (categoryCache) { - FacetLabel res = categoryCache.get(catIDInteger); - if (res != null) { - return res; - } + FacetLabel[] ordinalPath = getPathFromCache(ordinal); + + if (ordinalPath[0] != null) { + return ordinalPath[0]; } int readerIndex = ReaderUtil.subIndex(ordinal, indexReader.leaves()); LeafReader leafReader = indexReader.leaves().get(readerIndex).reader(); - // TODO: Use LUCENE-9476 to get the bulk lookup API for extracting BinaryDocValues BinaryDocValues values = leafReader.getBinaryDocValues(Consts.FULL); FacetLabel ret; @@ -351,12 +348,142 @@ public FacetLabel getPath(int ordinal) throws IOException { } synchronized (categoryCache) { - categoryCache.put(catIDInteger, ret); + categoryCache.put(ordinal, ret); } return ret; } + private FacetLabel[] getPathFromCache(int... ordinals) { + FacetLabel[] facetLabels = new FacetLabel[ordinals.length]; + // TODO LUCENE-10068: can we use an int-based hash impl, such as IntToObjectMap, + // wrapped as LRU? 
+ synchronized (categoryCache) { + for (int i = 0; i < ordinals.length; i++) { + facetLabels[i] = categoryCache.get(ordinals[i]); + } + } + return facetLabels; + } + + /** + * Checks if the ordinals in the array are >=0 and < {@code + * DirectoryTaxonomyReader#indexReader.maxDoc()} + * + * @param ordinals Integer array of ordinals + * @throws IllegalArgumentException Throw an IllegalArgumentException if one of the ordinals is + * out of bounds + */ + private void checkOrdinalBounds(int... ordinals) throws IllegalArgumentException { + for (int ordinal : ordinals) { + if (ordinal < 0 || ordinal >= indexReader.maxDoc()) { + throw new IllegalArgumentException( + "ordinal " + + ordinal + + " is out of the range of the indexReader " + + indexReader.toString() + + ". The maximum possible ordinal number is " + + (indexReader.maxDoc() - 1)); + } + } + } + + /** + * Returns an array of FacetLabels for a given array of ordinals. + * + *
<p>
This API is generally faster than iteratively calling {@link #getPath(int)} over an array of + * ordinals. It uses the {@link #getPath(int)} method iteratively when it detects that the index + * was created using StoredFields (with no performance gains) and uses DocValues-based iteration + * when the index is based on BinaryDocValues. Lucene switched to BinaryDocValues in version 9.0. + * + * @param ordinals Array of ordinals that are assigned to categories inserted into the taxonomy + * index + */ + @Override + public FacetLabel[] getBulkPath(int... ordinals) throws IOException { + ensureOpen(); + checkOrdinalBounds(ordinals); + + int ordinalsLength = ordinals.length; + // pre-fill from the cache; entries that are still null below are read from the index + FacetLabel[] bulkPath = getPathFromCache(ordinals); + // remember the original positions of ordinals before they are sorted + int[] originalPosition = new int[ordinalsLength]; + Arrays.setAll(originalPosition, IntUnaryOperator.identity()); + + /* parallel sort the ordinals and originalPosition array based on the values in the ordinals array */ + new InPlaceMergeSorter() { + @Override + protected void swap(int i, int j) { + int x = ordinals[i]; + ordinals[i] = ordinals[j]; + ordinals[j] = x; + + x = originalPosition[i]; + originalPosition[i] = originalPosition[j]; + originalPosition[j] = x; + } + + @Override + public int compare(int i, int j) { + return Integer.compare(ordinals[i], ordinals[j]); + } + }.sort(0, ordinalsLength); + + int readerIndex; + int leafReaderMaxDoc = 0; + int leafReaderDocBase = 0; + LeafReader leafReader; + LeafReaderContext leafReaderContext; + BinaryDocValues values = null; + List<Integer> uncachedOrdinalPositions = new ArrayList<>(); + + for (int i = 0; i < ordinalsLength; i++) { + if (bulkPath[originalPosition[i]] == null) { + /* + If ordinals[i] >= leafReaderDocBase + leafReaderMaxDoc then we find the next leaf that contains our ordinal.
+ Remember: ordinals[i] operates in the global ordinal space and hence we add leafReaderDocBase to the leafReaderMaxDoc + (which is the maxDoc of the specific leaf) + */ + if (values == null || ordinals[i] >= leafReaderDocBase + leafReaderMaxDoc) { + + readerIndex = ReaderUtil.subIndex(ordinals[i], indexReader.leaves()); + leafReaderContext = indexReader.leaves().get(readerIndex); + leafReader = leafReaderContext.reader(); + leafReaderMaxDoc = leafReader.maxDoc(); + leafReaderDocBase = leafReaderContext.docBase; + values = leafReader.getBinaryDocValues(Consts.FULL); + + /* + If the index is constructed with the older StoredFields it will not have any BinaryDocValues field and will return null + */ + if (values == null) { + return super.getBulkPath(ordinals); + } + } + // values is leaf specific so you only advance till you reach the target within the leaf + boolean success = values.advanceExact(ordinals[i] - leafReaderDocBase); + assert success; + bulkPath[originalPosition[i]] = + new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString())); + + uncachedOrdinalPositions.add(i); + } + } + + if (uncachedOrdinalPositions.isEmpty() == false) { + synchronized (categoryCache) { + for (int i : uncachedOrdinalPositions) { + // add the value to the categoryCache after computation + categoryCache.put(ordinals[i], bulkPath[originalPosition[i]]); + } + } + } + + return bulkPath; + } + @Override public int getSize() { ensureOpen(); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyCombined.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyCombined.java index 4664b23a24e9..966a0ac6fa76 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyCombined.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyCombined.java @@ -366,9 +366,9 @@ public void testReaderBasic() throws Exception { } } // (also test invalid ordinals:) - assertNull(tr.getPath(-1)); - assertNull(tr.getPath(tr.getSize())); - assertNull(tr.getPath(TaxonomyReader.INVALID_ORDINAL)); + expectThrows(IllegalArgumentException.class, () -> tr.getPath(-1)); + expectThrows(IllegalArgumentException.class, () -> tr.getPath(tr.getSize())); + expectThrows(IllegalArgumentException.class, () -> tr.getPath(TaxonomyReader.INVALID_ORDINAL)); // test TaxonomyReader.getOrdinal(): for (int i = 1; i < expectedCategories.length; i++) { diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java index 0f05d323a968..a702ecec7230 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java @@ -89,6 +89,28 @@ private void createNewTaxonomyIndex(String dirName) throws IOException { dir.close(); } + // Opens up a pre-existing index and tries to run getBulkPath on it + public void testGetBulkPathOnOlderCodec() throws Exception { + Path indexDir = createTempDir(oldTaxonomyIndexName); + TestUtil.unzip(getDataInputStream(oldTaxonomyIndexName + ".zip"), indexDir); + Directory dir = newFSDirectory(indexDir); + + DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir); + DirectoryTaxonomyReader reader = new DirectoryTaxonomyReader(writer); + + FacetLabel[] facetLabels = { + new FacetLabel("a"), new FacetLabel("b"), + }; + int[] ordinals = + new int[] 
{reader.getOrdinal(facetLabels[0]), reader.getOrdinal(facetLabels[1])}; + // The zip file already contains a category "a" and a category "b" stored with the older + // StoredFields codec + assertArrayEquals(facetLabels, reader.getBulkPath(ordinals)); + reader.close(); + writer.close(); + dir.close(); + } + // Used to create a fresh taxonomy index with StoredFields @Ignore public void testCreateOldTaxonomy() throws IOException { diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java index c9338b417944..3da05830e1e7 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.facet.taxonomy.directory; +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; import java.io.IOException; import java.util.Arrays; import java.util.HashSet; @@ -413,7 +414,7 @@ public void testOpenIfChangedReuse() throws Exception { // check that r1 doesn't see cp_b assertEquals(TaxonomyReader.INVALID_ORDINAL, r1.getOrdinal(cp_b)); - assertNull(r1.getPath(2)); + expectThrows(IllegalArgumentException.class, () -> r1.getPath(2)); r1.close(); r2.close(); @@ -567,4 +568,78 @@ public void testAccountable() throws Exception { taxoReader.close(); dir.close(); } + + public void testCallingBulkPathReturnsCorrectResult() throws Exception { + Directory src = newDirectory(); + DirectoryTaxonomyWriter w = new DirectoryTaxonomyWriter(src); + String randomArray[] = new String[random().nextInt(1000)]; + // adding a smaller bound on ints ensures that we will have some duplicate ordinals in random + // test cases + Arrays.setAll(randomArray, i -> Integer.toString(random().nextInt(500))); + + FacetLabel allPaths[] = new FacetLabel[randomArray.length]; + int allOrdinals[] = new int[randomArray.length]; + + for (int i = 0; i < randomArray.length; i++) { + allPaths[i] = new FacetLabel(randomArray[i]); + w.addCategory(allPaths[i]); + // add random commits to create multiple segments in the index + if (random().nextBoolean()) { + w.commit(); + } + } + w.commit(); + w.close(); + + DirectoryTaxonomyReader r1 = new DirectoryTaxonomyReader(src); + + for (int i = 0; i < allPaths.length; i++) { + allOrdinals[i] = r1.getOrdinal(allPaths[i]); + } + + // create multiple threads to check result correctness and thread contention in the cache + Thread[] addThreads = new Thread[RandomNumbers.randomIntBetween(random(), 1, 12)]; + for (int z = 0; z < addThreads.length; z++) { + addThreads[z] = + new Thread() { + @Override + public void run() { + // each thread iterates for numThreadIterations times + int numThreadIterations = random().nextInt(10); + for (int threadIterations = 0; + threadIterations < numThreadIterations; + threadIterations++) { + + // length of the FacetLabel array that we are going to check + int numOfOrdinalsToCheck = random().nextInt(allOrdinals.length); + int[] ordinals = new int[numOfOrdinalsToCheck]; + FacetLabel[] path = new FacetLabel[numOfOrdinalsToCheck]; + + for (int i = 0; i < numOfOrdinalsToCheck; i++) { + // we deliberately allow it to choose repeat indexes as this will exercise the + // cache + int ordinalIndex = random().nextInt(allOrdinals.length); + ordinals[i] = allOrdinals[ordinalIndex]; + path[i] = allPaths[ordinalIndex]; + } + + try { + // main check 
for correctness is done here + assertArrayEquals(path, r1.getBulkPath(ordinals)); + } catch (IOException e) { + // this should ideally never occur, but if it does just rethrow the error to the + // caller + throw new RuntimeException(e); + } + } + } + }; + } + + for (Thread t : addThreads) t.start(); + for (Thread t : addThreads) t.join(); + + r1.close(); + src.close(); + } } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java index dc92ee753c61..6483c52d3204 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.facet.taxonomy.directory; +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; import java.io.IOException; import java.util.HashMap; import java.util.Map; @@ -314,7 +315,7 @@ public void testConcurrency() throws Exception { System.out.println("TEST: use cache=" + cache); } final DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(dir, OpenMode.CREATE, cache); - Thread[] addThreads = new Thread[atLeast(4)]; + Thread[] addThreads = new Thread[RandomNumbers.randomIntBetween(random(), 1, 12)]; for (int z = 0; z < addThreads.length; z++) { addThreads[z] = new Thread() {
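A minimal usage sketch of the new API (the class and method names below, BulkPathExample and printPaths, are illustrative and not part of the patch; taxoDir is assumed to already hold a taxonomy index): instead of looping over getPath(int), callers hand the whole ordinal array to getBulkPath and receive the FacetLabels back in the same order.

import java.io.IOException;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.store.Directory;

public class BulkPathExample {
  /** Resolves many facet ordinals to their paths in one call. */
  static void printPaths(Directory taxoDir, int[] ordinals) throws IOException {
    try (TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir)) {
      // Previously: one getPath(int) call, and one cache or index lookup, per ordinal.
      // Now: a single bulk call; any out-of-range ordinal throws IllegalArgumentException.
      FacetLabel[] labels = taxoReader.getBulkPath(ordinals);
      for (FacetLabel label : labels) {
        System.out.println(String.join("/", label.components));
      }
    }
  }
}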
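A self-contained sketch of the segment arithmetic the bulk lookup relies on; the helper below is hypothetical and takes the full-path field name as a parameter because Consts.FULL is package-private. A global ordinal is mapped to the leaf that contains it with ReaderUtil.subIndex, and the leaf's BinaryDocValues iterator is then advanced to the segment-local document, globalOrdinal minus docBase.

import java.io.IOException;
import java.util.List;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.util.BytesRef;

public class GlobalOrdinalLookupSketch {
  /** Returns the encoded full path for one global ordinal, or null if the doc values field is absent. */
  static BytesRef lookup(IndexReader indexReader, int globalOrdinal, String fullPathField)
      throws IOException {
    List<LeafReaderContext> leaves = indexReader.leaves();
    int readerIndex = ReaderUtil.subIndex(globalOrdinal, leaves); // which leaf holds this ordinal
    LeafReaderContext context = leaves.get(readerIndex);
    BinaryDocValues values = context.reader().getBinaryDocValues(fullPathField);
    if (values == null) {
      return null; // pre-9.0 taxonomy segments keep the path in StoredFields instead
    }
    boolean found = values.advanceExact(globalOrdinal - context.docBase); // leaf-local docID
    return found ? values.binaryValue() : null;
  }
}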
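The implementation sorts the ordinals before the index walk so each segment's doc values are read in increasing docID order, and it carries the original positions along so results can be written back in request order. The same InPlaceMergeSorter idiom, isolated as a sketch (class and method names are illustrative):

import org.apache.lucene.util.InPlaceMergeSorter;

public class ParallelArraySortSketch {
  /** Sorts keys ascending and applies the identical permutation to payload. */
  static void sortTogether(int[] keys, int[] payload) {
    new InPlaceMergeSorter() {
      @Override
      protected void swap(int i, int j) {
        int tmp = keys[i];
        keys[i] = keys[j];
        keys[j] = tmp;
        tmp = payload[i];
        payload[i] = payload[j];
        payload[j] = tmp;
      }

      @Override
      protected int compare(int i, int j) {
        return Integer.compare(keys[i], keys[j]);
      }
    }.sort(0, keys.length);
  }
}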
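For callers migrating across the behavior change: getPath (and getBulkPath) now throw IllegalArgumentException for out-of-range ordinals instead of returning null. A small sketch, assuming the caller wants to keep the old null contract (the helper name is hypothetical); the bounds check against getSize() avoids the exception entirely.

import java.io.IOException;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;

public class SafeGetPathSketch {
  /** Mirrors the pre-change contract: null for an invalid ordinal instead of an exception. */
  static FacetLabel getPathOrNull(TaxonomyReader taxoReader, int ordinal) throws IOException {
    if (ordinal < 0 || ordinal >= taxoReader.getSize()) {
      return null; // getPath would now throw IllegalArgumentException here
    }
    return taxoReader.getPath(ordinal);
  }
}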