Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ New Features

* LUCENE-9004: Approximate nearest vector search via NSW graphs

* LUCENE-9476: DirectoryTaxonomyReader now provides a getBulkPath API (Gautam Worah)

System Requirements

* LUCENE-8738: Move to Java 11 as minimum Java version.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@
*/
package org.apache.lucene.facet.taxonomy.directory;

import com.carrotsearch.hppc.IntIntScatterMap;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
Expand Down Expand Up @@ -320,18 +322,12 @@ public FacetLabel getPath(int ordinal) throws IOException {
// doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
// instance recognizes. Therefore we do this check up front, before we hit
// the cache.
if (ordinal < 0 || ordinal >= indexReader.maxDoc()) {
return null;
}
int indexReaderMaxDoc = indexReader.maxDoc();
isOrdinalInIndexReaderRange(ordinal, indexReaderMaxDoc);

// TODO: can we use an int-based hash impl, such as IntToObjectMap,
// wrapped as LRU?
Integer catIDInteger = Integer.valueOf(ordinal);
synchronized (categoryCache) {
FacetLabel res = categoryCache.get(catIDInteger);
if (res != null) {
return res;
}
FacetLabel ordinalPath = getPathFromCache(ordinal);
if (ordinalPath != null) {
return ordinalPath;
}

int readerIndex = ReaderUtil.subIndex(ordinal, indexReader.leaves());
Expand All @@ -353,12 +349,104 @@ public FacetLabel getPath(int ordinal) throws IOException {
}

synchronized (categoryCache) {
categoryCache.put(catIDInteger, ret);
categoryCache.put(ordinal, ret);
}

return ret;
}

private FacetLabel getPathFromCache(int ordinal) {
// TODO: can we use an int-based hash impl, such as IntToObjectMap,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oooh that is a great idea, and low-hanging fruit, and would greatly reduce the RAM usage for this cache.

I think DirectoryTaxonomyWriter also has such a cache that we could change to a native map.

Could you open a spinoff issue?

// wrapped as LRU?
synchronized (categoryCache) {
return categoryCache.get(ordinal);
}
}

private void isOrdinalInIndexReaderRange(int ordinal, int indexReaderMaxDoc)
throws IllegalArgumentException {
if (ordinal < 0 || ordinal >= indexReaderMaxDoc) {
throw new IllegalArgumentException(
"ordinal "
+ ordinal
+ " is out of the range of the indexReader "
+ indexReader.toString());
}
}

/**
* Returns an array of FacetLabels for a given array of ordinals.
*
* <p>This API is generally faster than iteratively calling {@link
* org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader#getPath} over an array of
* ordinals.
*
* <p>This API is only available for Lucene indexes created with 8.7+ codec because it uses
* BinaryDocValues instead of StoredFields. Use the {@link
* org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader#getPath} method for indices
* created with codec version older than 8.7.
*
* @param ordinals Array of ordinals that are assigned to categories inserted into the taxonomy
* index
* @throws IOException if the taxonomy index is created using the older StoredFields based codec.
*/
public FacetLabel[] getBulkPath(int... ordinals) throws IOException {
ensureOpen();

FacetLabel[] bulkPath = new FacetLabel[ordinals.length];
// remember the original positions of ordinals before they are sorted
IntIntScatterMap originalPosition = new IntIntScatterMap();
int indexReaderMaxDoc = indexReader.maxDoc();
for (int i = 0; i < ordinals.length; i++) {
// check whether the ordinal is valid before accessing the cache
isOrdinalInIndexReaderRange(ordinals[i], indexReaderMaxDoc);
// check the cache before trying to find it in the index
FacetLabel ordinalPath = getPathFromCache(ordinals[i]);
if (ordinalPath != null) {
bulkPath[i] = ordinalPath;
}
originalPosition.put(ordinals[i], i);
}

Arrays.sort(ordinals);
int readerIndex;
int leafReaderMaxDoc = 0;
int leafReaderDocBase = 0;
LeafReader leafReader;
LeafReaderContext leafReaderContext = null;
BinaryDocValues values = null;

for (int ord : ordinals) {
if (bulkPath[originalPosition.get(ord)] == null) {
if (values == null || ord > leafReaderMaxDoc) {

readerIndex = ReaderUtil.subIndex(ord, indexReader.leaves());
leafReaderContext = indexReader.leaves().get(readerIndex);
leafReader = leafReaderContext.reader();
leafReaderMaxDoc = leafReader.maxDoc();
leafReaderDocBase = leafReaderContext.docBase;
values = leafReader.getBinaryDocValues(Consts.FULL);

// this check is only needed once to confirm that the index uses BinaryDocValues
boolean success = values.advanceExact(ord - leafReaderDocBase);
if (success == false) {
throw new IOException(
"the taxonomy index is created using the older StoredFields format which uses a Lucene "
+ "codec older than 8.7. Use the getPath(int ordinal) API iteratively instead.");
}
}
values.advanceExact(ord - leafReaderDocBase);
bulkPath[originalPosition.get(ord)] =
new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString()));
synchronized (categoryCache) {
categoryCache.put(ord, bulkPath[originalPosition.get(ord)]);
}
}
}

return bulkPath;
}

@Override
public int getSize() {
ensureOpen();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -366,9 +366,9 @@ public void testReaderBasic() throws Exception {
}
}
// (also test invalid ordinals:)
assertNull(tr.getPath(-1));
assertNull(tr.getPath(tr.getSize()));
assertNull(tr.getPath(TaxonomyReader.INVALID_ORDINAL));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(-1));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(tr.getSize()));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(TaxonomyReader.INVALID_ORDINAL));

// test TaxonomyReader.getOrdinal():
for (int i = 1; i < expectedCategories.length; i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,29 @@ private void createNewTaxonomyIndex(String dirName) throws IOException {
dir.close();
}

// Opens up a pre-existing index and tries to run getBulkPath on it
public void testBulkPathFailsOnOlderCodec() throws Exception {
Path indexDir = createTempDir(oldTaxonomyIndexName);
TestUtil.unzip(getDataInputStream(oldTaxonomyIndexName + ".zip"), indexDir);
Directory dir = newFSDirectory(indexDir);

DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);

FacetLabel cp_b = new FacetLabel("b");
writer.addCategory(cp_b);
writer.getInternalIndexWriter().forceMerge(1);
writer.commit();

DirectoryTaxonomyReader reader = new DirectoryTaxonomyReader(writer);

int[] ordinals = new int[] {reader.getOrdinal(new FacetLabel("a")), reader.getOrdinal(cp_b)};
// The zip file already contains a category "a" stored with the older StoredFields codec
expectThrows(IOException.class, () -> reader.getBulkPath(ordinals));
reader.close();
writer.close();
dir.close();
}

// Used to create a fresh taxonomy index with StoredFields
@Ignore
public void testCreateOldTaxonomy() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ public void testOpenIfChangedReuse() throws Exception {

// check that r1 doesn't see cp_b
assertEquals(TaxonomyReader.INVALID_ORDINAL, r1.getOrdinal(cp_b));
assertNull(r1.getPath(2));
expectThrows(IllegalArgumentException.class, () -> r1.getPath(2));

r1.close();
r2.close();
Expand Down Expand Up @@ -569,4 +569,32 @@ public void testAccountable() throws Exception {
taxoReader.close();
dir.close();
}

public void testCallingBulkPathReturnsCorrectResult() throws Exception {
Directory src = newDirectory();
DirectoryTaxonomyWriter w = new DirectoryTaxonomyWriter(src);
String randomArray[] = new String[random().nextInt(1000)];
Arrays.setAll(randomArray, i -> Integer.toString(random().nextInt()));

FacetLabel allPaths[] = new FacetLabel[randomArray.length];
int allOrdinals[] = new int[randomArray.length];

for (int i = 0; i < randomArray.length; i++) {
allPaths[i] = new FacetLabel(randomArray[i]);
w.addCategory(allPaths[i]);
}
w.commit();
w.close();

DirectoryTaxonomyReader r1 = new DirectoryTaxonomyReader(src);

for (int i = 0; i < allPaths.length; i++) {
allOrdinals[i] = r1.getOrdinal(allPaths[i]);
}

FacetLabel allBulkPaths[] = r1.getBulkPath(allOrdinals);
assertArrayEquals(allPaths, allBulkPaths);
r1.close();
src.close();
}
}