Skip to content

LUCENE-9476 Add getBulkPath API for the Taxonomy index #2247

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ New Features

* LUCENE-9004: Approximate nearest vector search via NSW graphs

* LUCENE-9476: DirectoryTaxonomyReader now provides a getBulkPath API (Gautam Worah)

System Requirements

* LUCENE-8738: Move to Java 11 as minimum Java version.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@
*/
package org.apache.lucene.facet.taxonomy.directory;

import com.carrotsearch.hppc.IntIntScatterMap;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
Expand Down Expand Up @@ -320,18 +322,12 @@ public FacetLabel getPath(int ordinal) throws IOException {
// doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
// instance recognizes. Therefore we do this check up front, before we hit
// the cache.
if (ordinal < 0 || ordinal >= indexReader.maxDoc()) {
return null;
}
int indexReaderMaxDoc = indexReader.maxDoc();
isOrdinalInIndexReaderRange(ordinal, indexReaderMaxDoc);

// TODO: can we use an int-based hash impl, such as IntToObjectMap,
// wrapped as LRU?
Integer catIDInteger = Integer.valueOf(ordinal);
synchronized (categoryCache) {
FacetLabel res = categoryCache.get(catIDInteger);
if (res != null) {
return res;
}
FacetLabel ordinalPath = getPathFromCache(ordinal);
if (ordinalPath != null) {
return ordinalPath;
}

int readerIndex = ReaderUtil.subIndex(ordinal, indexReader.leaves());
Expand All @@ -353,12 +349,104 @@ public FacetLabel getPath(int ordinal) throws IOException {
}

synchronized (categoryCache) {
categoryCache.put(catIDInteger, ret);
categoryCache.put(ordinal, ret);
}

return ret;
}

private FacetLabel getPathFromCache(int ordinal) {
// TODO: can we use an int-based hash impl, such as IntToObjectMap,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oooh that is a great idea, and low-hanging fruit, and would greatly reduce the RAM usage for this cache.

I think DirectoryTaxonomyWriter also has such a cache that we could change to a native map.

Could you open a spinoff issue?

// wrapped as LRU?
synchronized (categoryCache) {
return categoryCache.get(ordinal);
}
}

/**
 * Validates that {@code ordinal} addresses a document of this reader, i.e. is in
 * {@code [0, indexReaderMaxDoc)}.
 *
 * @param ordinal candidate taxonomy ordinal
 * @param indexReaderMaxDoc exclusive upper bound, i.e. {@code indexReader.maxDoc()}
 * @throws IllegalArgumentException if the ordinal is negative or not below {@code
 *     indexReaderMaxDoc}
 */
private void isOrdinalInIndexReaderRange(int ordinal, int indexReaderMaxDoc) {
  // Note: IllegalArgumentException is unchecked, so no throws clause is needed; the
  // contract is documented via @throws instead.
  if (ordinal < 0 || ordinal >= indexReaderMaxDoc) {
    throw new IllegalArgumentException(
        "ordinal "
            + ordinal
            + " is out of the range of the indexReader "
            + indexReader.toString());
  }
}

/**
 * Returns an array of FacetLabels for a given array of ordinals. The returned array is aligned
 * with the input: {@code result[i]} is the path of {@code ordinals[i]}. The input array is not
 * modified, and duplicate ordinals are allowed.
 *
 * <p>This API is generally faster than iteratively calling {@link
 * org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader#getPath} over an array of
 * ordinals.
 *
 * <p>This API is only available for Lucene indexes created with 8.7+ codec because it uses
 * BinaryDocValues instead of StoredFields. Use the {@link
 * org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader#getPath} method for indices
 * created with codec version older than 8.7.
 *
 * @param ordinals Array of ordinals that are assigned to categories inserted into the taxonomy
 *     index
 * @throws IOException if the taxonomy index is created using the older StoredFields based codec.
 * @throws IllegalArgumentException if any ordinal is out of this reader's range
 */
public FacetLabel[] getBulkPath(int... ordinals) throws IOException {
  ensureOpen();

  FacetLabel[] bulkPath = new FacetLabel[ordinals.length];
  // remember the original positions of ordinals before they are sorted
  IntIntScatterMap originalPosition = new IntIntScatterMap();
  int indexReaderMaxDoc = indexReader.maxDoc();
  for (int i = 0; i < ordinals.length; i++) {
    // check whether the ordinal is valid before accessing the cache
    isOrdinalInIndexReaderRange(ordinals[i], indexReaderMaxDoc);
    // check the cache before trying to find it in the index
    FacetLabel ordinalPath = getPathFromCache(ordinals[i]);
    if (ordinalPath != null) {
      bulkPath[i] = ordinalPath;
    }
    originalPosition.put(ordinals[i], i);
  }

  // Sort a copy so the caller's array is not mutated. Sorted order lets us advance each
  // segment's doc-values iterator forward-only, one segment at a time.
  int[] sortedOrdinals = ordinals.clone();
  Arrays.sort(sortedOrdinals);

  int leafReaderMaxDoc = 0;
  int leafReaderDocBase = 0;
  BinaryDocValues values = null;

  for (int ord : sortedOrdinals) {
    int position = originalPosition.get(ord);
    if (bulkPath[position] == null) {
      // Switch leaves when ord falls past the current segment. The bound is the segment's
      // global upper doc ID (docBase + maxDoc); comparing against the leaf-local maxDoc
      // alone would read the wrong leaf at segment boundaries.
      if (values == null || ord >= leafReaderDocBase + leafReaderMaxDoc) {
        int readerIndex = ReaderUtil.subIndex(ord, indexReader.leaves());
        LeafReaderContext leafReaderContext = indexReader.leaves().get(readerIndex);
        LeafReader leafReader = leafReaderContext.reader();
        leafReaderMaxDoc = leafReader.maxDoc();
        leafReaderDocBase = leafReaderContext.docBase;
        values = leafReader.getBinaryDocValues(Consts.FULL);

        // A null doc-values instance (or a missing value) means this segment was written
        // with the pre-8.7 StoredFields format; this check is needed once per segment.
        if (values == null || values.advanceExact(ord - leafReaderDocBase) == false) {
          throw new IOException(
              "the taxonomy index is created using the older StoredFields format which uses a Lucene "
                  + "codec older than 8.7. Use the getPath(int ordinal) API iteratively instead.");
        }
      } else {
        boolean success = values.advanceExact(ord - leafReaderDocBase);
        assert success; // every in-range taxonomy ordinal has a full-path value
      }
      bulkPath[position] =
          new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString()));
      synchronized (categoryCache) {
        categoryCache.put(ord, bulkPath[position]);
      }
    }
  }

  // Duplicate ordinals share a single entry in originalPosition, so only one of their
  // positions was filled above; copy the computed label into the remaining slots.
  for (int i = 0; i < ordinals.length; i++) {
    if (bulkPath[i] == null) {
      bulkPath[i] = bulkPath[originalPosition.get(ordinals[i])];
    }
  }

  return bulkPath;
}

@Override
public int getSize() {
ensureOpen();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -366,9 +366,9 @@ public void testReaderBasic() throws Exception {
}
}
// (also test invalid ordinals:)
assertNull(tr.getPath(-1));
assertNull(tr.getPath(tr.getSize()));
assertNull(tr.getPath(TaxonomyReader.INVALID_ORDINAL));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(-1));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(tr.getSize()));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(TaxonomyReader.INVALID_ORDINAL));

// test TaxonomyReader.getOrdinal():
for (int i = 1; i < expectedCategories.length; i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,29 @@ private void createNewTaxonomyIndex(String dirName) throws IOException {
dir.close();
}

// Loads a taxonomy index written with the pre-8.7 StoredFields codec and verifies
// that getBulkPath refuses to read it.
public void testBulkPathFailsOnOlderCodec() throws Exception {
  Path unzippedDir = createTempDir(oldTaxonomyIndexName);
  TestUtil.unzip(getDataInputStream(oldTaxonomyIndexName + ".zip"), unzippedDir);
  Directory taxoDir = newFSDirectory(unzippedDir);

  DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);

  // Add one category with the current codec and merge it into the old segment.
  FacetLabel categoryB = new FacetLabel("b");
  taxoWriter.addCategory(categoryB);
  taxoWriter.getInternalIndexWriter().forceMerge(1);
  taxoWriter.commit();

  DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);

  // Category "a" already exists in the zipped index, stored with the older StoredFields codec.
  int[] ordinals =
      new int[] {taxoReader.getOrdinal(new FacetLabel("a")), taxoReader.getOrdinal(categoryB)};
  expectThrows(IOException.class, () -> taxoReader.getBulkPath(ordinals));

  taxoReader.close();
  taxoWriter.close();
  taxoDir.close();
}

// Used to create a fresh taxonomy index with StoredFields
@Ignore
public void testCreateOldTaxonomy() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ public void testOpenIfChangedReuse() throws Exception {

// check that r1 doesn't see cp_b
assertEquals(TaxonomyReader.INVALID_ORDINAL, r1.getOrdinal(cp_b));
assertNull(r1.getPath(2));
expectThrows(IllegalArgumentException.class, () -> r1.getPath(2));

r1.close();
r2.close();
Expand Down Expand Up @@ -569,4 +569,32 @@ public void testAccountable() throws Exception {
taxoReader.close();
dir.close();
}

/**
 * Indexes a random batch of categories and checks that getBulkPath returns, for every ordinal,
 * the same path that was originally added.
 */
public void testCallingBulkPathReturnsCorrectResult() throws Exception {
  Directory src = newDirectory();
  DirectoryTaxonomyWriter w = new DirectoryTaxonomyWriter(src);

  // Use at least one category so the test never degenerates to an empty no-op run.
  String[] randomArray = new String[1 + random().nextInt(1000)];
  Arrays.setAll(randomArray, i -> Integer.toString(random().nextInt()));

  FacetLabel[] allPaths = new FacetLabel[randomArray.length];
  int[] allOrdinals = new int[randomArray.length];

  for (int i = 0; i < randomArray.length; i++) {
    allPaths[i] = new FacetLabel(randomArray[i]);
    w.addCategory(allPaths[i]);
  }
  w.commit();
  w.close();

  DirectoryTaxonomyReader r1 = new DirectoryTaxonomyReader(src);

  for (int i = 0; i < allPaths.length; i++) {
    allOrdinals[i] = r1.getOrdinal(allPaths[i]);
  }

  FacetLabel[] allBulkPaths = r1.getBulkPath(allOrdinals);
  assertArrayEquals(allPaths, allBulkPaths);
  r1.close();
  src.close();
}
}