3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
@@ -137,6 +137,9 @@ API Changes

Improvements

* LUCENE-9476: Add new getBulkPath API to DirectoryTaxonomyReader to more efficiently retrieve FacetLabels for multiple
facet ordinals at once (Gautam Worah, Mike McCandless)

* LUCENE-9960: Avoid unnecessary top element replacement for equal elements in PriorityQueue. (Dawid Weiss)

* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java
@@ -23,6 +23,7 @@
import org.apache.lucene.facet.FacetsConfig.DimConfig;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.TopOrdAndFloatQueue;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;

/** Base class for all taxonomy-based facets that aggregate to a per-ords float[]. */
public abstract class FloatTaxonomyFacets extends TaxonomyFacets {
@@ -146,10 +147,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
}

LabelAndValue[] labelValues = new LabelAndValue[q.size()];
int[] ordinals = new int[labelValues.length];
float[] values = new float[labelValues.length];

for (int i = labelValues.length - 1; i >= 0; i--) {
TopOrdAndFloatQueue.OrdAndValue ordAndValue = q.pop();
FacetLabel child = taxoReader.getPath(ordAndValue.ord);
labelValues[i] = new LabelAndValue(child.components[cp.length], ordAndValue.value);
ordinals[i] = ordAndValue.ord;
values[i] = ordAndValue.value;
}

FacetLabel[] bulkPath = ((DirectoryTaxonomyReader) taxoReader).getBulkPath(ordinals);
for (int i = 0; i < labelValues.length; i++) {
labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], values[i]);
}

return new FacetResult(dim, path, sumValues, labelValues, childCount);
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java
@@ -27,6 +27,7 @@
import org.apache.lucene.facet.FacetsConfig.DimConfig;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.TopOrdAndIntQueue;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;

/** Base class for all taxonomy-based facets that aggregate to a per-ords int[]. */
public abstract class IntTaxonomyFacets extends TaxonomyFacets {
@@ -237,10 +238,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
}

LabelAndValue[] labelValues = new LabelAndValue[q.size()];
int[] ordinals = new int[labelValues.length];
int[] values = new int[labelValues.length];

for (int i = labelValues.length - 1; i >= 0; i--) {
TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
FacetLabel child = taxoReader.getPath(ordAndValue.ord);
labelValues[i] = new LabelAndValue(child.components[cp.length], ordAndValue.value);
ordinals[i] = ordAndValue.ord;
values[i] = ordAndValue.value;
}

FacetLabel[] bulkPath = ((DirectoryTaxonomyReader) taxoReader).getBulkPath(ordinals);
for (int i = 0; i < labelValues.length; i++) {
labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], values[i]);
}

return new FacetResult(dim, path, totValue, labelValues, childCount);
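Both hunks above downcast `taxoReader` to `DirectoryTaxonomyReader` before calling `getBulkPath`. That cast holds for the directory-backed reader these classes are used with, but a defensive variant (hypothetical, not part of this change) could fall back to per-ordinal lookups for other `TaxonomyReader` implementations:

```java
// Hypothetical defensive variant of the lookup above, not part of this PR:
// use the bulk API only when the reader really is a DirectoryTaxonomyReader,
// and fall back to one getPath(int) call per ordinal otherwise.
FacetLabel[] bulkPath;
if (taxoReader instanceof DirectoryTaxonomyReader) {
  bulkPath = ((DirectoryTaxonomyReader) taxoReader).getBulkPath(ordinals);
} else {
  bulkPath = new FacetLabel[ordinals.length];
  for (int i = 0; i < ordinals.length; i++) {
    bulkPath[i] = taxoReader.getPath(ordinals[i]);
  }
}
```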
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
@@ -18,10 +18,12 @@

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.function.IntUnaryOperator;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.document.Document;
@@ -35,6 +37,7 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.ReaderUtil;
@@ -44,6 +47,7 @@
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.RamUsageEstimator;

/**
@@ -318,23 +322,16 @@ public FacetLabel getPath(int ordinal) throws IOException {
// doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
// instance recognizes. Therefore we do this check up front, before we hit
// the cache.
if (ordinal < 0 || ordinal >= indexReader.maxDoc()) {
return null;
}
checkOrdinalBounds(ordinal);

// TODO: can we use an int-based hash impl, such as IntToObjectMap,
// wrapped as LRU?
Integer catIDInteger = Integer.valueOf(ordinal);
synchronized (categoryCache) {
FacetLabel res = categoryCache.get(catIDInteger);
if (res != null) {
return res;
}
FacetLabel ordinalPath = getPathFromCache(ordinal);

if (ordinalPath != null) {
return ordinalPath;
}

int readerIndex = ReaderUtil.subIndex(ordinal, indexReader.leaves());
LeafReader leafReader = indexReader.leaves().get(readerIndex).reader();
// TODO: Use LUCENE-9476 to get the bulk lookup API for extracting BinaryDocValues
BinaryDocValues values = leafReader.getBinaryDocValues(Consts.FULL);

FacetLabel ret;
@@ -351,12 +348,140 @@ public FacetLabel getPath(int ordinal) throws IOException {
}

synchronized (categoryCache) {
categoryCache.put(catIDInteger, ret);
categoryCache.put(ordinal, ret);
}

return ret;
}

private FacetLabel getPathFromCache(int ordinal) {
// TODO: can we use an int-based hash impl, such as IntToObjectMap,
// wrapped as LRU?
synchronized (categoryCache) {
return categoryCache.get(ordinal);
}
}

private void checkOrdinalBounds(int ordinal) throws IllegalArgumentException {
if (ordinal < 0 || ordinal >= indexReader.maxDoc()) {
throw new IllegalArgumentException(
"ordinal "
+ ordinal
+ " is out of the range of the indexReader "
+ indexReader.toString()
+ ". The maximum possible ordinal number is "
+ (indexReader.maxDoc() - 1));
}
}

/**
* Returns an array of FacetLabels for a given array of ordinals.
*
* <p>This API is generally faster than iteratively calling {@link #getPath(int)} over an array of
* ordinals. It falls back to calling {@link #getPath(int)} iteratively only when it detects that
* the index was created using StoredFields (where there is no performance gain) and uses
* DocValues-based iteration when the index stores BinaryDocValues. Lucene switched the taxonomy
* index to BinaryDocValues in version 8.7.
*
* @param ordinals Array of ordinals that are assigned to categories inserted into the taxonomy
* index
*/
public FacetLabel[] getBulkPath(int... ordinals) throws IOException {
ensureOpen();

int ordinalsLength = ordinals.length;
FacetLabel[] bulkPath = new FacetLabel[ordinalsLength];
// remember the original positions of ordinals before they are sorted
int[] originalPosition = new int[ordinalsLength];
Arrays.setAll(originalPosition, IntUnaryOperator.identity());

for (int i = 0; i < ordinalsLength; i++) {
// check whether the ordinal is valid before accessing the cache
checkOrdinalBounds(ordinals[i]);
// check the cache before trying to find it in the index
FacetLabel ordinalPath = getPathFromCache(ordinals[i]);
if (ordinalPath != null) {
bulkPath[i] = ordinalPath;
}
}

/* parallel sort the ordinals and originalPosition array based on the values in the ordinals array */
new InPlaceMergeSorter() {
@Override
protected void swap(int i, int j) {
int x = ordinals[i];
ordinals[i] = ordinals[j];
ordinals[j] = x;

x = originalPosition[i];
originalPosition[i] = originalPosition[j];
originalPosition[j] = x;
}

@Override
public int compare(int i, int j) {
return Integer.compare(ordinals[i], ordinals[j]);
}
}.sort(0, ordinalsLength);

int readerIndex;
int leafReaderMaxDoc = 0;
int leafReaderDocBase = 0;
LeafReader leafReader;
LeafReaderContext leafReaderContext;
BinaryDocValues values = null;

for (int i = 0; i < ordinalsLength; i++) {
if (bulkPath[originalPosition[i]] == null) {
/*
If ordinals[i] >= leafReaderDocBase + leafReaderMaxDoc then we find the next leaf that contains our ordinal.
Remember: ordinals[i] operates in the global ordinal space and hence we add leafReaderDocBase to the leafReaderMaxDoc
(which is the maxDoc of the specific leaf)
*/
if (values == null || ordinals[i] >= leafReaderDocBase + leafReaderMaxDoc) {

readerIndex = ReaderUtil.subIndex(ordinals[i], indexReader.leaves());
leafReaderContext = indexReader.leaves().get(readerIndex);
leafReader = leafReaderContext.reader();
leafReaderMaxDoc = leafReader.maxDoc();
leafReaderDocBase = leafReaderContext.docBase;
values = leafReader.getBinaryDocValues(Consts.FULL);

/*
If the index is constructed with the older StoredFields it will not have any BinaryDocValues field and will return null
*/
if (values == null) {
Member commented:

It's sort of weird to do this check per-segment, since we can know up front, based on the index created version, whether it's stored fields or doc values? I.e. this is not a segment-by-segment decision. But let's do it in a follow-on issue. I think this one is ready!

@gautamworah96 (Contributor, Author) replied on Aug 29, 2021:

Sure. I'll create it after this PR is merged. Otherwise, it just feels weird to create an issue for a fix in a piece of code that has not been merged :D
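A rough sketch of that follow-on idea (hypothetical, not part of this PR): rather than the suggested created-version check, an equivalent up-front probe could inspect the taxonomy field's doc-values type in the segment FieldInfos, deciding once whether the whole index needs the StoredFields fallback (imports from org.apache.lucene.index assumed):

```java
// Hypothetical up-front probe, not part of this PR: the StoredFields-vs-
// BinaryDocValues choice is an index-wide property, so it can be decided
// once instead of per segment. Assumes the index has at least one leaf.
private boolean usesBinaryDocValues(IndexReader reader) {
  LeafReader leaf = reader.leaves().get(0).reader();
  FieldInfo info = leaf.getFieldInfos().fieldInfo(Consts.FULL);
  return info != null && info.getDocValuesType() == DocValuesType.BINARY;
}
```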

return getBulkPathForOlderIndexes(ordinals);
}
}
boolean success = values.advanceExact(ordinals[i] - leafReaderDocBase);
assert success;
bulkPath[originalPosition[i]] =
new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString()));

// add the value to the categoryCache after computation
synchronized (categoryCache) {
categoryCache.put(ordinals[i], bulkPath[originalPosition[i]]);
}
}
}

return bulkPath;
}

/**
* This function is only used when the underlying taxonomy index was constructed using an older
* (slower) StoredFields-based codec (< 8.7). The {@link #getBulkPath(int...)} function calls it
* internally when it realizes that the index uses StoredFields.
*/
private FacetLabel[] getBulkPathForOlderIndexes(int... ordinals) throws IOException {
FacetLabel[] bulkPath = new FacetLabel[ordinals.length];
for (int i = 0; i < ordinals.length; i++) {
bulkPath[i] = getPath(ordinals[i]);
}

return bulkPath;
}

@Override
public int getSize() {
ensureOpen();
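For reference, a minimal, self-contained usage sketch of the new API (assumes `dir` already holds a taxonomy index; the class and method names here are illustrative, not from the PR):

```java
import java.io.IOException;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.store.Directory;

class BulkPathExample {
  // Resolves several taxonomy ordinals with one getBulkPath call instead of
  // ordinals.length separate getPath(int) calls. Out-of-range ordinals now
  // throw IllegalArgumentException rather than returning null.
  static FacetLabel[] resolve(Directory dir, int... ordinals) throws IOException {
    try (DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir)) {
      return taxoReader.getBulkPath(ordinals);
    }
  }
}
```

Note that, as a side effect of the InPlaceMergeSorter pass shown above, getBulkPath sorts the caller's ordinals array in place.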
lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyCombined.java
@@ -366,9 +366,9 @@ public void testReaderBasic() throws Exception {
}
}
// (also test invalid ordinals:)
assertNull(tr.getPath(-1));
assertNull(tr.getPath(tr.getSize()));
assertNull(tr.getPath(TaxonomyReader.INVALID_ORDINAL));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(-1));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(tr.getSize()));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(TaxonomyReader.INVALID_ORDINAL));

// test TaxonomyReader.getOrdinal():
for (int i = 1; i < expectedCategories.length; i++) {
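The test changes above track a behavioral change: getPath (and getBulkPath) now fail fast on invalid ordinals instead of returning null. A small before/after sketch for callers that relied on the old null return:

```java
// Before this change: invalid ordinals returned null.
// FacetLabel label = taxoReader.getPath(-1); // used to return null

// After this change: out-of-range ordinals throw IllegalArgumentException.
try {
  taxoReader.getPath(-1);
} catch (IllegalArgumentException expected) {
  // message names the offending ordinal and the maximum valid ordinal
}
```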
lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java
@@ -89,6 +89,28 @@ private void createNewTaxonomyIndex(String dirName) throws IOException {
dir.close();
}

// Opens up a pre-existing index and tries to run getBulkPath on it
public void testGetBulkPathOnOlderCodec() throws Exception {
Path indexDir = createTempDir(oldTaxonomyIndexName);
TestUtil.unzip(getDataInputStream(oldTaxonomyIndexName + ".zip"), indexDir);
Directory dir = newFSDirectory(indexDir);

DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
DirectoryTaxonomyReader reader = new DirectoryTaxonomyReader(writer);

FacetLabel[] facetLabels = {
new FacetLabel("a"), new FacetLabel("b"),
};
int[] ordinals =
new int[] {reader.getOrdinal(facetLabels[0]), reader.getOrdinal(facetLabels[1])};
// The zip file already contains a category "a" and a category "b" stored with the older
// StoredFields codec
assertArrayEquals(facetLabels, reader.getBulkPath(ordinals));
reader.close();
writer.close();
dir.close();
}

// Used to create a fresh taxonomy index with StoredFields
@Ignore
public void testCreateOldTaxonomy() throws IOException {
lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java
@@ -413,7 +413,7 @@ public void testOpenIfChangedReuse() throws Exception {

// check that r1 doesn't see cp_b
assertEquals(TaxonomyReader.INVALID_ORDINAL, r1.getOrdinal(cp_b));
assertNull(r1.getPath(2));
expectThrows(IllegalArgumentException.class, () -> r1.getPath(2));

r1.close();
r2.close();
@@ -567,4 +567,39 @@ public void testAccountable() throws Exception {
taxoReader.close();
dir.close();
}

public void testCallingBulkPathReturnsCorrectResult() throws Exception {
float PROBABILITY_OF_COMMIT = 0.5f;
Directory src = newDirectory();
DirectoryTaxonomyWriter w = new DirectoryTaxonomyWriter(src);
String[] randomArray = new String[random().nextInt(1000)];
// adding a smaller bound on ints ensures that we will have some duplicate ordinals in random
// test cases
Arrays.setAll(randomArray, i -> Integer.toString(random().nextInt(500)));

FacetLabel[] allPaths = new FacetLabel[randomArray.length];
int[] allOrdinals = new int[randomArray.length];

for (int i = 0; i < randomArray.length; i++) {
allPaths[i] = new FacetLabel(randomArray[i]);
w.addCategory(allPaths[i]);
// add random commits to create multiple segments in the index
if (random().nextFloat() < PROBABILITY_OF_COMMIT) {
w.commit();
}
}
w.commit();
w.close();

DirectoryTaxonomyReader r1 = new DirectoryTaxonomyReader(src);

for (int i = 0; i < allPaths.length; i++) {
allOrdinals[i] = r1.getOrdinal(allPaths[i]);
}

FacetLabel[] allBulkPaths = r1.getBulkPath(allOrdinals);
assertArrayEquals(allPaths, allBulkPaths);
r1.close();
src.close();
}
}