LUCENE-9476: Add getBulkPath API to DirectoryTaxonomyReader for faster ordinal -> FacetLabel lookup (#179)

gautamworah96 · Gautam Worah · web-flow · commit 1036c708db74 · 2021-09-02T07:54:31.000-04:00
Co-authored-by: Gautam Worah &lt;gauworah@amazon.com&gt;
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -261,6 +261,11 @@ Improvements
 * LUCENE-9662: Make CheckIndex concurrent by parallelizing index check across segments.
   (Zach Chen, Mike McCandless, Dawid Weiss, Robert Muir)
 
+* LUCENE-9476: Add new getBulkPath API to DirectoryTaxonomyReader to more efficiently retrieve FacetLabels for multiple
+  facet ordinals at once. This API is 2-4% faster than iteratively calling getPath.
+  The getPath API now throws an IAE instead of returning null if the ordinal is out of bounds.
+  (Gautam Worah, Mike McCandless)
+
 Bug fixes
 
 * LUCENE-9686: Fix read past EOF handling in DirectIODirectory. (Zach Chen,
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java
@@ -146,10 +146,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
     }
 
     LabelAndValue[] labelValues = new LabelAndValue[q.size()];
+    int[] ordinals = new int[labelValues.length];
+    float[] values = new float[labelValues.length];
+
     for (int i = labelValues.length - 1; i >= 0; i--) {
       TopOrdAndFloatQueue.OrdAndValue ordAndValue = q.pop();
-      FacetLabel child = taxoReader.getPath(ordAndValue.ord);
-      labelValues[i] = new LabelAndValue(child.components[cp.length], ordAndValue.value);
+      ordinals[i] = ordAndValue.ord;
+      values[i] = ordAndValue.value;
+    }
+
+    FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals);
+    for (int i = 0; i < labelValues.length; i++) {
+      labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], values[i]);
     }
 
     return new FacetResult(dim, path, sumValues, labelValues, childCount);
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java
@@ -237,10 +237,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
     }
 
     LabelAndValue[] labelValues = new LabelAndValue[q.size()];
+    int[] ordinals = new int[labelValues.length];
+    int[] values = new int[labelValues.length];
+
     for (int i = labelValues.length - 1; i >= 0; i--) {
       TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
-      FacetLabel child = taxoReader.getPath(ordAndValue.ord);
-      labelValues[i] = new LabelAndValue(child.components[cp.length], ordAndValue.value);
+      ordinals[i] = ordAndValue.ord;
+      values[i] = ordAndValue.value;
+    }
+
+    FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals);
+    for (int i = 0; i < labelValues.length; i++) {
+      labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], values[i]);
     }
 
     return new FacetResult(dim, path, totValue, labelValues, childCount);
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyReader.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyReader.java
@@ -212,6 +212,21 @@ public int getOrdinal(String dim, String... path) throws IOException {
   /** Returns the path name of the category with the given ordinal. */
   public abstract FacetLabel getPath(int ordinal) throws IOException;
 
+  /**
+   * Returns the path names of the list of ordinals associated with different categories.
+   *
+   * <p>The implementation in {@link
+   * org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader} is generally faster than
+   * the default implementation which iteratively calls {@link #getPath(int)}
+   */
+  public FacetLabel[] getBulkPath(int... ordinals) throws IOException {
+    FacetLabel[] facetLabels = new FacetLabel[ordinals.length];
+    for (int i = 0; i < ordinals.length; i++) {
+      facetLabels[i] = getPath(ordinals[i]);
+    }
+    return facetLabels;
+  }
+
   /** Returns the current refCount for this taxonomy reader. */
   public final int getRefCount() {
     return refCount.get();
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
@@ -18,10 +18,12 @@
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
+import java.util.function.IntUnaryOperator;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 import org.apache.lucene.document.Document;
@@ -35,6 +37,7 @@
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.MultiTerms;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.ReaderUtil;
@@ -44,6 +47,7 @@
 import org.apache.lucene.util.Accountables;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.InPlaceMergeSorter;
 import org.apache.lucene.util.RamUsageEstimator;
 
 /**
@@ -318,23 +322,16 @@ public FacetLabel getPath(int ordinal) throws IOException {
     // doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
     // instance recognizes. Therefore we do this check up front, before we hit
     // the cache.
-    if (ordinal < 0 || ordinal >= indexReader.maxDoc()) {
-      return null;
-    }
+    checkOrdinalBounds(ordinal);
 
-    // TODO: can we use an int-based hash impl, such as IntToObjectMap,
-    // wrapped as LRU?
-    Integer catIDInteger = Integer.valueOf(ordinal);
-    synchronized (categoryCache) {
-      FacetLabel res = categoryCache.get(catIDInteger);
-      if (res != null) {
-        return res;
-      }
+    FacetLabel[] ordinalPath = getPathFromCache(ordinal);
+
+    if (ordinalPath[0] != null) {
+      return ordinalPath[0];
     }
 
     int readerIndex = ReaderUtil.subIndex(ordinal, indexReader.leaves());
     LeafReader leafReader = indexReader.leaves().get(readerIndex).reader();
-    // TODO: Use LUCENE-9476 to get the bulk lookup API for extracting BinaryDocValues
     BinaryDocValues values = leafReader.getBinaryDocValues(Consts.FULL);
 
     FacetLabel ret;
@@ -351,12 +348,142 @@ public FacetLabel getPath(int ordinal) throws IOException {
     }
 
     synchronized (categoryCache) {
-      categoryCache.put(catIDInteger, ret);
+      categoryCache.put(ordinal, ret);
     }
 
     return ret;
   }
 
+  private FacetLabel[] getPathFromCache(int... ordinals) {
+    FacetLabel[] facetLabels = new FacetLabel[ordinals.length];
+    // TODO LUCENE-10068: can we use an int-based hash impl, such as IntToObjectMap,
+    // wrapped as LRU?
+    synchronized (categoryCache) {
+      for (int i = 0; i < ordinals.length; i++) {
+        facetLabels[i] = categoryCache.get(ordinals[i]);
+      }
+    }
+    return facetLabels;
+  }
+
+  /**
+   * Checks if the ordinals in the array are >=0 and < {@code
+   * DirectoryTaxonomyReader#indexReader.maxDoc()}
+   *
+   * @param ordinals Integer array of ordinals
+   * @throws IllegalArgumentException Throw an IllegalArgumentException if one of the ordinals is
+   *     out of bounds
+   */
+  private void checkOrdinalBounds(int... ordinals) throws IllegalArgumentException {
+    for (int ordinal : ordinals) {
+      if (ordinal < 0 || ordinal >= indexReader.maxDoc()) {
+        throw new IllegalArgumentException(
+            "ordinal "
+                + ordinal
+                + " is out of the range of the indexReader "
+                + indexReader.toString()
+                + ". The maximum possible ordinal number is "
+                + (indexReader.maxDoc() - 1));
+      }
+    }
+  }
+
+  /**
+   * Returns an array of FacetLabels for a given array of ordinals.
+   *
+   * <p>This API is generally faster than iteratively calling {@link #getPath(int)} over an array of
+   * ordinals. It uses the {@link #getPath(int)} method iteratively when it detects that the index
+   * was created using StoredFields (with no performance gains) and uses DocValues based iteration
+   * when the index is based on BinaryDocValues. Lucene switched to BinaryDocValues in version 9.0
+   *
+   * @param ordinals Array of ordinals that are assigned to categories inserted into the taxonomy
+   *     index
+   */
+  @Override
+  public FacetLabel[] getBulkPath(int... ordinals) throws IOException {
+    ensureOpen();
+    checkOrdinalBounds(ordinals);
+
+    int ordinalsLength = ordinals.length;
+    FacetLabel[] bulkPath = new FacetLabel[ordinalsLength];
+    // remember the original positions of ordinals before they are sorted
+    int[] originalPosition = new int[ordinalsLength];
+    Arrays.setAll(originalPosition, IntUnaryOperator.identity());
+
+    getPathFromCache(ordinals);
+
+    /* parallel sort the ordinals and originalPosition array based on the values in the ordinals array */
+    new InPlaceMergeSorter() {
+      @Override
+      protected void swap(int i, int j) {
+        int x = ordinals[i];
+        ordinals[i] = ordinals[j];
+        ordinals[j] = x;
+
+        x = originalPosition[i];
+        originalPosition[i] = originalPosition[j];
+        originalPosition[j] = x;
+      }
+
+      @Override
+      public int compare(int i, int j) {
+        return Integer.compare(ordinals[i], ordinals[j]);
+      }
+    }.sort(0, ordinalsLength);
+
+    int readerIndex;
+    int leafReaderMaxDoc = 0;
+    int leafReaderDocBase = 0;
+    LeafReader leafReader;
+    LeafReaderContext leafReaderContext;
+    BinaryDocValues values = null;
+    List<Integer> uncachedOrdinalPositions = new ArrayList<>();
+
+    for (int i = 0; i < ordinalsLength; i++) {
+      if (bulkPath[originalPosition[i]] == null) {
+        /*
+        If ordinals[i] >= leafReaderDocBase + leafReaderMaxDoc then we find the next leaf that contains our ordinal.
+        Remember: ordinals[i] operates in the global ordinal space and hence we add leafReaderDocBase to the leafReaderMaxDoc
+        (which is the maxDoc of the specific leaf)
+         */
+        if (values == null || ordinals[i] >= leafReaderDocBase + leafReaderMaxDoc) {
+
+          readerIndex = ReaderUtil.subIndex(ordinals[i], indexReader.leaves());
+          leafReaderContext = indexReader.leaves().get(readerIndex);
+          leafReader = leafReaderContext.reader();
+          leafReaderMaxDoc = leafReader.maxDoc();
+          leafReaderDocBase = leafReaderContext.docBase;
+          values = leafReader.getBinaryDocValues(Consts.FULL);
+
+          /*
+          If the index is constructed with the older StoredFields it will not have any BinaryDocValues field and will return null
+           */
+          if (values == null) {
+            return super.getBulkPath(ordinals);
+          }
+        }
+        // values is leaf specific so you only advance till you reach the target within the leaf
+        boolean success = values.advanceExact(ordinals[i] - leafReaderDocBase);
+        assert success;
+        bulkPath[originalPosition[i]] =
+            new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString()));
+
+        uncachedOrdinalPositions.add(i);
+      }
+    }
+
+    if (uncachedOrdinalPositions.isEmpty() == false) {
+      synchronized (categoryCache) {
+        for (int i : uncachedOrdinalPositions) {
+          // add the value to the categoryCache after computation
+          categoryCache.put(ordinals[i], bulkPath[originalPosition[i]]);
+        }
+      }
+    }
+
+    return bulkPath;
+  }
+
   @Override
   public int getSize() {
     ensureOpen();
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyCombined.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyCombined.java
@@ -366,9 +366,9 @@ public void testReaderBasic() throws Exception {
       }
     }
     //  (also test invalid ordinals:)
-    assertNull(tr.getPath(-1));
-    assertNull(tr.getPath(tr.getSize()));
-    assertNull(tr.getPath(TaxonomyReader.INVALID_ORDINAL));
+    expectThrows(IllegalArgumentException.class, () -> tr.getPath(-1));
+    expectThrows(IllegalArgumentException.class, () -> tr.getPath(tr.getSize()));
+    expectThrows(IllegalArgumentException.class, () -> tr.getPath(TaxonomyReader.INVALID_ORDINAL));
 
     // test TaxonomyReader.getOrdinal():
     for (int i = 1; i < expectedCategories.length; i++) {
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java
@@ -89,6 +89,28 @@ private void createNewTaxonomyIndex(String dirName) throws IOException {
     dir.close();
   }
 
+  // Opens up a pre-existing index and tries to run getBulkPath on it
+  public void testGetBulkPathOnOlderCodec() throws Exception {
+    Path indexDir = createTempDir(oldTaxonomyIndexName);
+    TestUtil.unzip(getDataInputStream(oldTaxonomyIndexName + ".zip"), indexDir);
+    Directory dir = newFSDirectory(indexDir);
+
+    DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
+    DirectoryTaxonomyReader reader = new DirectoryTaxonomyReader(writer);
+
+    FacetLabel[] facetLabels = {
+      new FacetLabel("a"), new FacetLabel("b"),
+    };
+    int[] ordinals =
+        new int[] {reader.getOrdinal(facetLabels[0]), reader.getOrdinal(facetLabels[1])};
+    // The zip file already contains a category "a" and a category "b" stored with the older
+    // StoredFields codec
+    assertArrayEquals(facetLabels, reader.getBulkPath(ordinals));
+    reader.close();
+    writer.close();
+    dir.close();
+  }
+
   // Used to create a fresh taxonomy index with StoredFields
   @Ignore
   public void testCreateOldTaxonomy() throws IOException {
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java

Original file line number	Diff line number	Diff line change
`@@ -366,9 +366,9 @@ public void testReaderBasic() throws Exception {`
`366`	`366`	`}`
`367`	`367`	`}`
`368`	`368`	`// (also test invalid ordinals:)`
`369`		`- assertNull(tr.getPath(-1));`
`370`		`- assertNull(tr.getPath(tr.getSize()));`
`371`		`- assertNull(tr.getPath(TaxonomyReader.INVALID_ORDINAL));`
	`369`	`+ expectThrows(IllegalArgumentException.class, () -> tr.getPath(-1));`
	`370`	`+ expectThrows(IllegalArgumentException.class, () -> tr.getPath(tr.getSize()));`
	`371`	`+ expectThrows(IllegalArgumentException.class, () -> tr.getPath(TaxonomyReader.INVALID_ORDINAL));`
`372`	`372`
`373`	`373`	`// test TaxonomyReader.getOrdinal():`
`374`	`374`	`for (int i = 1; i < expectedCategories.length; i++) {`