Skip to content

LUCENE-9476 Add getBulkPath API for the Taxonomy index #2247

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ New Features

* LUCENE-9004: Approximate nearest vector search via NSW graphs

* LUCENE-9476: DirectoryTaxonomyReader now provides a getBulkPath API (Gautam Worah)

System Requirements

* LUCENE-8738: Move to Java 11 as minimum Java version.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@
*/
package org.apache.lucene.facet.taxonomy.directory;

import com.carrotsearch.hppc.IntIntScatterMap;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
Expand Down Expand Up @@ -320,18 +322,12 @@ public FacetLabel getPath(int ordinal) throws IOException {
// doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
// instance recognizes. Therefore we do this check up front, before we hit
// the cache.
if (ordinal < 0 || ordinal >= indexReader.maxDoc()) {
return null;
}
int indexReaderMaxDoc = indexReader.maxDoc();
isOrdinalInIndexReaderRange(ordinal, indexReaderMaxDoc);

// TODO: can we use an int-based hash impl, such as IntToObjectMap,
// wrapped as LRU?
Integer catIDInteger = Integer.valueOf(ordinal);
synchronized (categoryCache) {
FacetLabel res = categoryCache.get(catIDInteger);
if (res != null) {
return res;
}
FacetLabel ordinalPath = getPathFromCache(ordinal);
if (ordinalPath != null) {
return ordinalPath;
}

int readerIndex = ReaderUtil.subIndex(ordinal, indexReader.leaves());
Expand All @@ -353,12 +349,104 @@ public FacetLabel getPath(int ordinal) throws IOException {
}

synchronized (categoryCache) {
categoryCache.put(catIDInteger, ret);
categoryCache.put(ordinal, ret);
}

return ret;
}

private FacetLabel getPathFromCache(int ordinal) {
// TODO: can we use an int-based hash impl, such as IntToObjectMap,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oooh that is a great idea, and low-hanging fruit, and would greatly reduce the RAM usage for this cache.

I think DirectoryTaxonomyWriter also has such a cache that we could change to a native map.

Could you open a spinoff issue?

// wrapped as LRU?
synchronized (categoryCache) {
return categoryCache.get(ordinal);
}
}

/**
 * Validates that {@code ordinal} addresses a document of this reader, i.e. is in
 * {@code [0, indexReaderMaxDoc)}.
 *
 * @param ordinal candidate taxonomy ordinal
 * @param indexReaderMaxDoc exclusive upper bound, i.e. {@code indexReader.maxDoc()}
 * @throws IllegalArgumentException if the ordinal is negative or not below {@code
 *     indexReaderMaxDoc}
 */
private void isOrdinalInIndexReaderRange(int ordinal, int indexReaderMaxDoc) {
  // Note: IllegalArgumentException is unchecked, so no throws clause is needed; the
  // contract is documented via @throws instead.
  if (ordinal < 0 || ordinal >= indexReaderMaxDoc) {
    throw new IllegalArgumentException(
        "ordinal "
            + ordinal
            + " is out of the range of the indexReader "
            + indexReader.toString());
  }
}

/**
 * Returns an array of FacetLabels for a given array of ordinals. The returned array is aligned
 * with the input: {@code result[i]} is the path of {@code ordinals[i]}. The input array is not
 * modified, and duplicate ordinals are allowed.
 *
 * <p>This API is generally faster than iteratively calling {@link
 * org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader#getPath} over an array of
 * ordinals.
 *
 * <p>This API is only available for Lucene indexes created with 8.7+ codec because it uses
 * BinaryDocValues instead of StoredFields. Use the {@link
 * org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader#getPath} method for indices
 * created with codec version older than 8.7.
 *
 * @param ordinals Array of ordinals that are assigned to categories inserted into the taxonomy
 *     index
 * @throws IOException if the taxonomy index is created using the older StoredFields based codec.
 * @throws IllegalArgumentException if any ordinal is out of this reader's range
 */
public FacetLabel[] getBulkPath(int... ordinals) throws IOException {
  ensureOpen();

  FacetLabel[] bulkPath = new FacetLabel[ordinals.length];
  // remember the original positions of ordinals before they are sorted
  IntIntScatterMap originalPosition = new IntIntScatterMap();
  int indexReaderMaxDoc = indexReader.maxDoc();
  for (int i = 0; i < ordinals.length; i++) {
    // check whether the ordinal is valid before accessing the cache
    isOrdinalInIndexReaderRange(ordinals[i], indexReaderMaxDoc);
    // check the cache before trying to find it in the index
    FacetLabel ordinalPath = getPathFromCache(ordinals[i]);
    if (ordinalPath != null) {
      bulkPath[i] = ordinalPath;
    }
    originalPosition.put(ordinals[i], i);
  }

  // Sort a copy so the caller's array is not mutated. Sorted order lets us advance each
  // segment's doc-values iterator forward-only, one segment at a time.
  int[] sortedOrdinals = ordinals.clone();
  Arrays.sort(sortedOrdinals);

  int leafReaderMaxDoc = 0;
  int leafReaderDocBase = 0;
  BinaryDocValues values = null;

  for (int ord : sortedOrdinals) {
    int position = originalPosition.get(ord);
    if (bulkPath[position] == null) {
      // Switch leaves when ord falls past the current segment. The bound is the segment's
      // global upper doc ID (docBase + maxDoc); comparing against the leaf-local maxDoc
      // alone would read the wrong leaf at segment boundaries.
      if (values == null || ord >= leafReaderDocBase + leafReaderMaxDoc) {
        int readerIndex = ReaderUtil.subIndex(ord, indexReader.leaves());
        LeafReaderContext leafReaderContext = indexReader.leaves().get(readerIndex);
        LeafReader leafReader = leafReaderContext.reader();
        leafReaderMaxDoc = leafReader.maxDoc();
        leafReaderDocBase = leafReaderContext.docBase;
        values = leafReader.getBinaryDocValues(Consts.FULL);

        // A null doc-values instance (or a missing value) means this segment was written
        // with the pre-8.7 StoredFields format; this check is needed once per segment.
        if (values == null || values.advanceExact(ord - leafReaderDocBase) == false) {
          throw new IOException(
              "the taxonomy index is created using the older StoredFields format which uses a Lucene "
                  + "codec older than 8.7. Use the getPath(int ordinal) API iteratively instead.");
        }
      } else {
        boolean success = values.advanceExact(ord - leafReaderDocBase);
        assert success; // every in-range taxonomy ordinal has a full-path value
      }
      bulkPath[position] =
          new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString()));
      synchronized (categoryCache) {
        categoryCache.put(ord, bulkPath[position]);
      }
    }
  }

  // Duplicate ordinals share a single entry in originalPosition, so only one of their
  // positions was filled above; copy the computed label into the remaining slots.
  for (int i = 0; i < ordinals.length; i++) {
    if (bulkPath[i] == null) {
      bulkPath[i] = bulkPath[originalPosition.get(ordinals[i])];
    }
  }

  return bulkPath;
}

@Override
public int getSize() {
ensureOpen();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -366,9 +366,9 @@ public void testReaderBasic() throws Exception {
}
}
// (also test invalid ordinals:)
assertNull(tr.getPath(-1));
assertNull(tr.getPath(tr.getSize()));
assertNull(tr.getPath(TaxonomyReader.INVALID_ORDINAL));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(-1));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(tr.getSize()));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(TaxonomyReader.INVALID_ORDINAL));

// test TaxonomyReader.getOrdinal():
for (int i = 1; i < expectedCategories.length; i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,29 @@ private void createNewTaxonomyIndex(String dirName) throws IOException {
dir.close();
}

// Loads a taxonomy index written with the pre-8.7 StoredFields codec and verifies
// that getBulkPath refuses to read it.
public void testBulkPathFailsOnOlderCodec() throws Exception {
  Path unzippedDir = createTempDir(oldTaxonomyIndexName);
  TestUtil.unzip(getDataInputStream(oldTaxonomyIndexName + ".zip"), unzippedDir);
  Directory taxoDir = newFSDirectory(unzippedDir);

  DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);

  // Add one category with the current codec and merge it into the old segment.
  FacetLabel categoryB = new FacetLabel("b");
  taxoWriter.addCategory(categoryB);
  taxoWriter.getInternalIndexWriter().forceMerge(1);
  taxoWriter.commit();

  DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);

  // Category "a" already exists in the zipped index, stored with the older StoredFields codec.
  int[] ordinals =
      new int[] {taxoReader.getOrdinal(new FacetLabel("a")), taxoReader.getOrdinal(categoryB)};
  expectThrows(IOException.class, () -> taxoReader.getBulkPath(ordinals));

  taxoReader.close();
  taxoWriter.close();
  taxoDir.close();
}

// Used to create a fresh taxonomy index with StoredFields
@Ignore
public void testCreateOldTaxonomy() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ public void testOpenIfChangedReuse() throws Exception {

// check that r1 doesn't see cp_b
assertEquals(TaxonomyReader.INVALID_ORDINAL, r1.getOrdinal(cp_b));
assertNull(r1.getPath(2));
expectThrows(IllegalArgumentException.class, () -> r1.getPath(2));

r1.close();
r2.close();
Expand Down Expand Up @@ -569,4 +569,32 @@ public void testAccountable() throws Exception {
taxoReader.close();
dir.close();
}

/**
 * Indexes a random batch of categories and checks that getBulkPath returns, for every ordinal,
 * the same path that was originally added.
 */
public void testCallingBulkPathReturnsCorrectResult() throws Exception {
  Directory src = newDirectory();
  DirectoryTaxonomyWriter w = new DirectoryTaxonomyWriter(src);

  // Use at least one category so the test never degenerates to an empty no-op run.
  String[] randomArray = new String[1 + random().nextInt(1000)];
  Arrays.setAll(randomArray, i -> Integer.toString(random().nextInt()));

  FacetLabel[] allPaths = new FacetLabel[randomArray.length];
  int[] allOrdinals = new int[randomArray.length];

  for (int i = 0; i < randomArray.length; i++) {
    allPaths[i] = new FacetLabel(randomArray[i]);
    w.addCategory(allPaths[i]);
  }
  w.commit();
  w.close();

  DirectoryTaxonomyReader r1 = new DirectoryTaxonomyReader(src);

  for (int i = 0; i < allPaths.length; i++) {
    allOrdinals[i] = r1.getOrdinal(allPaths[i]);
  }

  FacetLabel[] allBulkPaths = r1.getBulkPath(allOrdinals);
  assertArrayEquals(allPaths, allBulkPaths);
  r1.close();
  src.close();
}
}