-
Notifications
You must be signed in to change notification settings - Fork 2.6k
LUCENE-9476 Add getBulkPath API for the Taxonomy index #2247
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
8a820f1
93bbe5b
fd73d7b
f8425e4
0c53c3b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,10 +18,12 @@ | |
|
||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.Collection; | ||
import java.util.Collections; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.function.IntUnaryOperator; | ||
import java.util.logging.Level; | ||
import java.util.logging.Logger; | ||
import org.apache.lucene.document.Document; | ||
|
@@ -31,7 +33,7 @@ | |
import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays; | ||
import org.apache.lucene.facet.taxonomy.TaxonomyReader; | ||
import org.apache.lucene.index.BinaryDocValues; | ||
import org.apache.lucene.index.CorruptIndexException; // javadocs | ||
import org.apache.lucene.index.CorruptIndexException; | ||
import org.apache.lucene.index.DirectoryReader; | ||
import org.apache.lucene.index.IndexWriter; | ||
import org.apache.lucene.index.LeafReader; | ||
|
@@ -46,6 +48,7 @@ | |
import org.apache.lucene.util.Accountables; | ||
import org.apache.lucene.util.BytesRef; | ||
import org.apache.lucene.util.IOUtils; | ||
import org.apache.lucene.util.InPlaceMergeSorter; | ||
import org.apache.lucene.util.RamUsageEstimator; | ||
|
||
/** | ||
|
@@ -320,23 +323,16 @@ public FacetLabel getPath(int ordinal) throws IOException { | |
// doOpenIfChanged, we need to ensure that the ordinal is one that this DTR | ||
// instance recognizes. Therefore we do this check up front, before we hit | ||
// the cache. | ||
if (ordinal < 0 || ordinal >= indexReader.maxDoc()) { | ||
return null; | ||
} | ||
int indexReaderMaxDoc = indexReader.maxDoc(); | ||
checkOrdinalBounds(ordinal, indexReaderMaxDoc); | ||
|
||
// TODO: can we use an int-based hash impl, such as IntToObjectMap, | ||
// wrapped as LRU? | ||
Integer catIDInteger = Integer.valueOf(ordinal); | ||
synchronized (categoryCache) { | ||
FacetLabel res = categoryCache.get(catIDInteger); | ||
if (res != null) { | ||
return res; | ||
} | ||
FacetLabel ordinalPath = getPathFromCache(ordinal); | ||
gautamworah96 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if (ordinalPath != null) { | ||
return ordinalPath; | ||
} | ||
|
||
int readerIndex = ReaderUtil.subIndex(ordinal, indexReader.leaves()); | ||
LeafReader leafReader = indexReader.leaves().get(readerIndex).reader(); | ||
// TODO: Use LUCENE-9476 to get the bulk lookup API for extracting BinaryDocValues | ||
BinaryDocValues values = leafReader.getBinaryDocValues(Consts.FULL); | ||
|
||
FacetLabel ret; | ||
|
@@ -353,12 +349,137 @@ public FacetLabel getPath(int ordinal) throws IOException { | |
} | ||
|
||
synchronized (categoryCache) { | ||
categoryCache.put(catIDInteger, ret); | ||
categoryCache.put(ordinal, ret); | ||
} | ||
|
||
return ret; | ||
} | ||
|
||
private FacetLabel getPathFromCache(int ordinal) { | ||
// TODO: can we use an int-based hash impl, such as IntToObjectMap, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oooh that is a great idea, and low-hanging fruit, and would greatly reduce the RAM usage for this cache. I think Could you open a spinoff issue? |
||
// wrapped as LRU? | ||
synchronized (categoryCache) { | ||
return categoryCache.get(ordinal); | ||
} | ||
} | ||
|
||
private void checkOrdinalBounds(int ordinal, int indexReaderMaxDoc) | ||
throws IllegalArgumentException { | ||
if (ordinal < 0 || ordinal >= indexReaderMaxDoc) { | ||
throw new IllegalArgumentException( | ||
"ordinal " | ||
+ ordinal | ||
+ " is out of the range of the indexReader " | ||
+ indexReader.toString()); | ||
} | ||
} | ||
|
||
/** | ||
* Returns an array of FacetLabels for a given array of ordinals. | ||
* | ||
* <p>This API is generally faster than iteratively calling {@link #getPath(int)} over an array of | ||
* ordinals. It uses the {@link #getPath(int)} method iteratively when it detects that the index | ||
* was created using StoredFields (with no performance gains) and uses DocValues based iteration | ||
* when the index is based on DocValues. | ||
* | ||
* @param ordinals Array of ordinals that are assigned to categories inserted into the taxonomy | ||
* index | ||
*/ | ||
public FacetLabel[] getBulkPath(int... ordinals) throws IOException { | ||
ensureOpen(); | ||
|
||
int ordinalsLength = ordinals.length; | ||
FacetLabel[] bulkPath = new FacetLabel[ordinalsLength]; | ||
// remember the original positions of ordinals before they are sorted | ||
int originalPosition[] = new int[ordinalsLength]; | ||
Arrays.setAll(originalPosition, IntUnaryOperator.identity()); | ||
int indexReaderMaxDoc = indexReader.maxDoc(); | ||
|
||
for (int i = 0; i < ordinalsLength; i++) { | ||
// check whether the ordinal is valid before accessing the cache | ||
checkOrdinalBounds(ordinals[i], indexReaderMaxDoc); | ||
// check the cache before trying to find it in the index | ||
FacetLabel ordinalPath = getPathFromCache(ordinals[i]); | ||
if (ordinalPath != null) { | ||
bulkPath[i] = ordinalPath; | ||
} | ||
} | ||
|
||
// parallel sort the ordinals and originalPosition array based on the values in the ordinals | ||
// array | ||
new InPlaceMergeSorter() { | ||
gautamworah96 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
@Override | ||
protected void swap(int i, int j) { | ||
int x = ordinals[i]; | ||
ordinals[i] = ordinals[j]; | ||
ordinals[j] = x; | ||
|
||
x = originalPosition[i]; | ||
originalPosition[i] = originalPosition[j]; | ||
originalPosition[j] = x; | ||
} | ||
; | ||
gautamworah96 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
@Override | ||
public int compare(int i, int j) { | ||
return Integer.compare(ordinals[i], ordinals[j]); | ||
} | ||
}.sort(0, ordinalsLength); | ||
|
||
int readerIndex; | ||
int leafReaderMaxDoc = 0; | ||
int leafReaderDocBase = 0; | ||
LeafReader leafReader; | ||
LeafReaderContext leafReaderContext; | ||
BinaryDocValues values = null; | ||
|
||
for (int i = 0; i < ordinalsLength; i++) { | ||
if (bulkPath[originalPosition[i]] == null) { | ||
if (values == null || ordinals[i] >= leafReaderMaxDoc) { | ||
|
||
readerIndex = ReaderUtil.subIndex(ordinals[i], indexReader.leaves()); | ||
leafReaderContext = indexReader.leaves().get(readerIndex); | ||
leafReader = leafReaderContext.reader(); | ||
leafReaderMaxDoc = leafReader.maxDoc(); | ||
leafReaderDocBase = leafReaderContext.docBase; | ||
values = leafReader.getBinaryDocValues(Consts.FULL); | ||
|
||
// this check is only needed once to confirm that the index uses BinaryDocValues | ||
boolean success = values.advanceExact(ordinals[i] - leafReaderDocBase); | ||
if (success == false) { | ||
return getBulkPathForOlderIndexes(ordinals); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, I'm confused -- wouldn't an older index have no This code should hit |
||
} | ||
} | ||
boolean success = values.advanceExact(ordinals[i] - leafReaderDocBase); | ||
assert success; | ||
bulkPath[originalPosition[i]] = | ||
new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString())); | ||
} | ||
} | ||
|
||
for (int i = 0; i < ordinalsLength; i++) { | ||
synchronized (categoryCache) { | ||
gautamworah96 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
categoryCache.put(ordinals[i], bulkPath[originalPosition[i]]); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We will sometimes put ordinals back into the cache that were already there at the start of this method right? I guess that's harmless. Or, maybe we should move this up above? Then we can do it only for those ordinals that were not already cached? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think intuitively adding the ordinals back into the cache would not be a problem. This should also (theoretically) be faster than trying to get the lock again and again in a loop? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Hmm, I'm confused: this code is already getting the lock inside a |
||
} | ||
} | ||
|
||
return bulkPath; | ||
} | ||
|
||
/** | ||
* This function is only used when the underlying taxonomy index was constructed using an older | ||
* (slower) StoredFields based codec (< 8.7). The {@link #getBulkPath(int...)} function calls it | ||
* internally when it realizes that the index uses StoredFields. | ||
*/ | ||
private FacetLabel[] getBulkPathForOlderIndexes(int... ordinals) throws IOException { | ||
FacetLabel[] bulkPath = new FacetLabel[ordinals.length]; | ||
for (int i = 0; i < ordinals.length; i++) { | ||
bulkPath[i] = getPath(ordinals[i]); | ||
} | ||
|
||
return bulkPath; | ||
} | ||
|
||
@Override | ||
public int getSize() { | ||
ensureOpen(); | ||
|
Uh oh!
There was an error while loading. Please reload this page.