3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
@@ -137,6 +137,9 @@ API Changes

Improvements

* LUCENE-9476: Add new getBulkPath API to DirectoryTaxonomyReader to more efficiently retrieve FacetLabels for multiple
facet ordinals at once (Gautam Worah, Mike McCandless)

* LUCENE-9960: Avoid unnecessary top element replacement for equal elements in PriorityQueue. (Dawid Weiss)

* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java
@@ -23,6 +23,7 @@
import org.apache.lucene.facet.FacetsConfig.DimConfig;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.TopOrdAndFloatQueue;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;

/** Base class for all taxonomy-based facets that aggregate to a per-ords float[]. */
public abstract class FloatTaxonomyFacets extends TaxonomyFacets {
@@ -146,10 +147,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
}

LabelAndValue[] labelValues = new LabelAndValue[q.size()];
int[] ordinals = new int[labelValues.length];
float[] values = new float[labelValues.length];

for (int i = labelValues.length - 1; i >= 0; i--) {
TopOrdAndFloatQueue.OrdAndValue ordAndValue = q.pop();
FacetLabel child = taxoReader.getPath(ordAndValue.ord);
labelValues[i] = new LabelAndValue(child.components[cp.length], ordAndValue.value);
ordinals[i] = ordAndValue.ord;
values[i] = ordAndValue.value;
}

FacetLabel[] bulkPath = ((DirectoryTaxonomyReader) taxoReader).getBulkPath(ordinals);
for (int i = 0; i < labelValues.length; i++) {
labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], values[i]);
}

return new FacetResult(dim, path, sumValues, labelValues, childCount);
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java
@@ -27,6 +27,7 @@
import org.apache.lucene.facet.FacetsConfig.DimConfig;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.TopOrdAndIntQueue;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;

/** Base class for all taxonomy-based facets that aggregate to a per-ords int[]. */
public abstract class IntTaxonomyFacets extends TaxonomyFacets {
@@ -237,10 +238,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
}

LabelAndValue[] labelValues = new LabelAndValue[q.size()];
int[] ordinals = new int[labelValues.length];
int[] values = new int[labelValues.length];

for (int i = labelValues.length - 1; i >= 0; i--) {
TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
FacetLabel child = taxoReader.getPath(ordAndValue.ord);
labelValues[i] = new LabelAndValue(child.components[cp.length], ordAndValue.value);
ordinals[i] = ordAndValue.ord;
values[i] = ordAndValue.value;
}

FacetLabel[] bulkPath = ((DirectoryTaxonomyReader) taxoReader).getBulkPath(ordinals);
for (int i = 0; i < labelValues.length; i++) {
labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], values[i]);
}

return new FacetResult(dim, path, totValue, labelValues, childCount);
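Both hunks above downcast `taxoReader` to `DirectoryTaxonomyReader` before calling `getBulkPath`. That cast holds for the directory-backed reader these classes are used with, but a defensive variant (hypothetical, not part of this change) could fall back to per-ordinal lookups for other `TaxonomyReader` implementations:

```java
// Hypothetical defensive variant of the lookup above, not part of this PR:
// use the bulk API only when the reader really is a DirectoryTaxonomyReader,
// and fall back to one getPath(int) call per ordinal otherwise.
FacetLabel[] bulkPath;
if (taxoReader instanceof DirectoryTaxonomyReader) {
  bulkPath = ((DirectoryTaxonomyReader) taxoReader).getBulkPath(ordinals);
} else {
  bulkPath = new FacetLabel[ordinals.length];
  for (int i = 0; i < ordinals.length; i++) {
    bulkPath[i] = taxoReader.getPath(ordinals[i]);
  }
}
```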
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
@@ -18,10 +18,12 @@

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.function.IntUnaryOperator;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.document.Document;
@@ -35,6 +37,7 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.ReaderUtil;
@@ -44,6 +47,7 @@
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.RamUsageEstimator;

/**
@@ -318,23 +322,16 @@ public FacetLabel getPath(int ordinal) throws IOException {
// doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
// instance recognizes. Therefore we do this check up front, before we hit
// the cache.
if (ordinal < 0 || ordinal >= indexReader.maxDoc()) {
return null;
}
checkOrdinalBounds(ordinal);

// TODO: can we use an int-based hash impl, such as IntToObjectMap,
// wrapped as LRU?
Integer catIDInteger = Integer.valueOf(ordinal);
synchronized (categoryCache) {
FacetLabel res = categoryCache.get(catIDInteger);
if (res != null) {
return res;
}
FacetLabel ordinalPath = getPathFromCache(ordinal);

if (ordinalPath != null) {
return ordinalPath;
}

int readerIndex = ReaderUtil.subIndex(ordinal, indexReader.leaves());
LeafReader leafReader = indexReader.leaves().get(readerIndex).reader();
// TODO: Use LUCENE-9476 to get the bulk lookup API for extracting BinaryDocValues
BinaryDocValues values = leafReader.getBinaryDocValues(Consts.FULL);

FacetLabel ret;
@@ -351,12 +348,140 @@ public FacetLabel getPath(int ordinal) throws IOException {
}

synchronized (categoryCache) {
categoryCache.put(catIDInteger, ret);
categoryCache.put(ordinal, ret);
}

return ret;
}

private FacetLabel getPathFromCache(int ordinal) {
// TODO: can we use an int-based hash impl, such as IntToObjectMap,
// wrapped as LRU?
synchronized (categoryCache) {
return categoryCache.get(ordinal);
}
}

private void checkOrdinalBounds(int ordinal) throws IllegalArgumentException {
if (ordinal < 0 || ordinal >= indexReader.maxDoc()) {
throw new IllegalArgumentException(
"ordinal "
+ ordinal
+ " is out of the range of the indexReader "
+ indexReader.toString()
+ ". The maximum possible ordinal number is "
+ (indexReader.maxDoc() - 1));
}
}

/**
* Returns an array of FacetLabels for a given array of ordinals.
*
* <p>This API is generally faster than iteratively calling {@link #getPath(int)} over an array of
* ordinals. It falls back to calling {@link #getPath(int)} iteratively only when it detects that
* the index was created using StoredFields (where there is no performance gain) and uses
* DocValues-based iteration when the index stores BinaryDocValues. Lucene switched the taxonomy
* index to BinaryDocValues in version 8.7.
*
* @param ordinals Array of ordinals that are assigned to categories inserted into the taxonomy
* index
*/
public FacetLabel[] getBulkPath(int... ordinals) throws IOException {
ensureOpen();

int ordinalsLength = ordinals.length;
FacetLabel[] bulkPath = new FacetLabel[ordinalsLength];
// remember the original positions of ordinals before they are sorted
int[] originalPosition = new int[ordinalsLength];
Arrays.setAll(originalPosition, IntUnaryOperator.identity());

for (int i = 0; i < ordinalsLength; i++) {
// check whether the ordinal is valid before accessing the cache
checkOrdinalBounds(ordinals[i]);
// check the cache before trying to find it in the index
FacetLabel ordinalPath = getPathFromCache(ordinals[i]);
if (ordinalPath != null) {
bulkPath[i] = ordinalPath;
}
}

/* parallel sort the ordinals and originalPosition array based on the values in the ordinals array */
new InPlaceMergeSorter() {
@Override
protected void swap(int i, int j) {
int x = ordinals[i];
ordinals[i] = ordinals[j];
ordinals[j] = x;

x = originalPosition[i];
originalPosition[i] = originalPosition[j];
originalPosition[j] = x;
}

@Override
public int compare(int i, int j) {
return Integer.compare(ordinals[i], ordinals[j]);
}
}.sort(0, ordinalsLength);

int readerIndex;
int leafReaderMaxDoc = 0;
int leafReaderDocBase = 0;
LeafReader leafReader;
LeafReaderContext leafReaderContext;
BinaryDocValues values = null;

for (int i = 0; i < ordinalsLength; i++) {
if (bulkPath[originalPosition[i]] == null) {
/*
If ordinals[i] >= leafReaderDocBase + leafReaderMaxDoc then we find the next leaf that contains our ordinal.
Remember: ordinals[i] operates in the global ordinal space and hence we add leafReaderDocBase to the leafReaderMaxDoc
(which is the maxDoc of the specific leaf)
*/
if (values == null || ordinals[i] >= leafReaderDocBase + leafReaderMaxDoc) {

readerIndex = ReaderUtil.subIndex(ordinals[i], indexReader.leaves());
leafReaderContext = indexReader.leaves().get(readerIndex);
leafReader = leafReaderContext.reader();
leafReaderMaxDoc = leafReader.maxDoc();
leafReaderDocBase = leafReaderContext.docBase;
values = leafReader.getBinaryDocValues(Consts.FULL);

/*
If the index is constructed with the older StoredFields it will not have any BinaryDocValues field and will return null
*/
if (values == null) {
Member commented:

It's sort of weird to do this check per-segment, since we can know up front, based on the index created version, whether it's stored fields or doc values? I.e. this is not a segment-by-segment decision. But let's do it in a follow-on issue. I think this one is ready!

@gautamworah96 (Contributor, Author) replied on Aug 29, 2021:

Sure. I'll create it after this PR is merged. Otherwise, it just feels weird to create an issue for a fix in a piece of code that has not been merged :D
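A rough sketch of that follow-on idea (hypothetical, not part of this PR): rather than the suggested created-version check, an equivalent up-front probe could inspect the taxonomy field's doc-values type in the segment FieldInfos, deciding once whether the whole index needs the StoredFields fallback (imports from org.apache.lucene.index assumed):

```java
// Hypothetical up-front probe, not part of this PR: the StoredFields-vs-
// BinaryDocValues choice is an index-wide property, so it can be decided
// once instead of per segment. Assumes the index has at least one leaf.
private boolean usesBinaryDocValues(IndexReader reader) {
  LeafReader leaf = reader.leaves().get(0).reader();
  FieldInfo info = leaf.getFieldInfos().fieldInfo(Consts.FULL);
  return info != null && info.getDocValuesType() == DocValuesType.BINARY;
}
```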

return getBulkPathForOlderIndexes(ordinals);
}
}
boolean success = values.advanceExact(ordinals[i] - leafReaderDocBase);
assert success;
bulkPath[originalPosition[i]] =
new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString()));

// add the value to the categoryCache after computation
synchronized (categoryCache) {
categoryCache.put(ordinals[i], bulkPath[originalPosition[i]]);
}
}
}

return bulkPath;
}

/**
* This function is only used when the underlying taxonomy index was constructed using an older
* (slower) StoredFields-based codec (< 8.7). The {@link #getBulkPath(int...)} function calls it
* internally when it realizes that the index uses StoredFields.
*/
private FacetLabel[] getBulkPathForOlderIndexes(int... ordinals) throws IOException {
FacetLabel[] bulkPath = new FacetLabel[ordinals.length];
for (int i = 0; i < ordinals.length; i++) {
bulkPath[i] = getPath(ordinals[i]);
}

return bulkPath;
}

@Override
public int getSize() {
ensureOpen();
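For reference, a minimal, self-contained usage sketch of the new API (assumes `dir` already holds a taxonomy index; the class and method names here are illustrative, not from the PR):

```java
import java.io.IOException;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.store.Directory;

class BulkPathExample {
  // Resolves several taxonomy ordinals with one getBulkPath call instead of
  // ordinals.length separate getPath(int) calls. Out-of-range ordinals now
  // throw IllegalArgumentException rather than returning null.
  static FacetLabel[] resolve(Directory dir, int... ordinals) throws IOException {
    try (DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir)) {
      return taxoReader.getBulkPath(ordinals);
    }
  }
}
```

Note that, as a side effect of the InPlaceMergeSorter pass shown above, getBulkPath sorts the caller's ordinals array in place.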
lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyCombined.java
@@ -366,9 +366,9 @@ public void testReaderBasic() throws Exception {
}
}
// (also test invalid ordinals:)
assertNull(tr.getPath(-1));
assertNull(tr.getPath(tr.getSize()));
assertNull(tr.getPath(TaxonomyReader.INVALID_ORDINAL));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(-1));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(tr.getSize()));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(TaxonomyReader.INVALID_ORDINAL));

// test TaxonomyReader.getOrdinal():
for (int i = 1; i < expectedCategories.length; i++) {
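The test changes above track a behavioral change: getPath (and getBulkPath) now fail fast on invalid ordinals instead of returning null. A small before/after sketch for callers that relied on the old null return:

```java
// Before this change: invalid ordinals returned null.
// FacetLabel label = taxoReader.getPath(-1); // used to return null

// After this change: out-of-range ordinals throw IllegalArgumentException.
try {
  taxoReader.getPath(-1);
} catch (IllegalArgumentException expected) {
  // message names the offending ordinal and the maximum valid ordinal
}
```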
lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java
@@ -89,6 +89,28 @@ private void createNewTaxonomyIndex(String dirName) throws IOException {
dir.close();
}

// Opens up a pre-existing index and tries to run getBulkPath on it
public void testGetBulkPathOnOlderCodec() throws Exception {
Path indexDir = createTempDir(oldTaxonomyIndexName);
TestUtil.unzip(getDataInputStream(oldTaxonomyIndexName + ".zip"), indexDir);
Directory dir = newFSDirectory(indexDir);

DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
DirectoryTaxonomyReader reader = new DirectoryTaxonomyReader(writer);

FacetLabel[] facetLabels = {
new FacetLabel("a"), new FacetLabel("b"),
};
int[] ordinals =
new int[] {reader.getOrdinal(facetLabels[0]), reader.getOrdinal(facetLabels[1])};
// The zip file already contains a category "a" and a category "b" stored with the older
// StoredFields codec
assertArrayEquals(facetLabels, reader.getBulkPath(ordinals));
reader.close();
writer.close();
dir.close();
}

// Used to create a fresh taxonomy index with StoredFields
@Ignore
public void testCreateOldTaxonomy() throws IOException {
lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java
@@ -413,7 +413,7 @@ public void testOpenIfChangedReuse() throws Exception {

// check that r1 doesn't see cp_b
assertEquals(TaxonomyReader.INVALID_ORDINAL, r1.getOrdinal(cp_b));
assertNull(r1.getPath(2));
expectThrows(IllegalArgumentException.class, () -> r1.getPath(2));

r1.close();
r2.close();
@@ -567,4 +567,39 @@ public void testAccountable() throws Exception {
taxoReader.close();
dir.close();
}

public void testCallingBulkPathReturnsCorrectResult() throws Exception {
float PROBABILITY_OF_COMMIT = 0.5f;
Directory src = newDirectory();
DirectoryTaxonomyWriter w = new DirectoryTaxonomyWriter(src);
String[] randomArray = new String[random().nextInt(1000)];
// adding a smaller bound on ints ensures that we will have some duplicate ordinals in random
// test cases
Arrays.setAll(randomArray, i -> Integer.toString(random().nextInt(500)));

FacetLabel[] allPaths = new FacetLabel[randomArray.length];
int[] allOrdinals = new int[randomArray.length];

for (int i = 0; i < randomArray.length; i++) {
allPaths[i] = new FacetLabel(randomArray[i]);
w.addCategory(allPaths[i]);
// add random commits to create multiple segments in the index
if (random().nextFloat() < PROBABILITY_OF_COMMIT) {
w.commit();
}
}
w.commit();
w.close();

DirectoryTaxonomyReader r1 = new DirectoryTaxonomyReader(src);

for (int i = 0; i < allPaths.length; i++) {
allOrdinals[i] = r1.getOrdinal(allPaths[i]);
}

FacetLabel[] allBulkPaths = r1.getBulkPath(allOrdinals);
assertArrayEquals(allPaths, allBulkPaths);
r1.close();
src.close();
}
}