Skip to content

Commit a2db4f3

Browse files
authored
Allow TermsEnum to express a preference for seekExact to FrozenBufferedUpdates (#15636)
For Terms implementations that have an approximate membership filter, the current scheme of repeated seekCeil() calls when evaluating update/deletion terms can cause the membership filter to be skipped: on a miss, seekCeil() must still position the enum on the next term lexically after the target, so it cannot return early based on the filter alone. Allow TermsEnum to express a preference for seekExact() so that approximate membership filters still work in this case. Implement preferSeekExact() in BloomFilteringPostingsFormat, where there is an obvious benefit.
1 parent dbf6b89 commit a2db4f3

File tree

5 files changed

+60
-5
lines changed

5 files changed

+60
-5
lines changed

lucene/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,9 @@ API Changes
181181
* GITHUB#15663: Allow subclasses of NumericComparator to implement their own
182182
CompetitiveDISIBuilder subtypes. (Alan Woodward)
183183

184+
* GITHUB#15636: Introduce TermsEnum.preferSeekExact(). Update and delete processing obeys this setting,
185+
allowing bloom or other approximate membership filters to apply in these paths. (Trevor McCulloch)
186+
184187
New Features
185188
---------------------
186189
(No changes)

lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,13 @@ public ImpactsEnum impacts(int flags) throws IOException {
373373
return delegate().impacts(flags);
374374
}
375375

376+
@Override
377+
public boolean preferSeekExact() {
378+
// Prefer seekExact() to seekCeil() when processing updates and deletes,
379+
// since seekExact() passes through the bloom filter.
380+
return true;
381+
}
382+
376383
@Override
377384
public String toString() {
378385
return getClass().getSimpleName() + "(filter=" + filter.toString() + ")";

lucene/core/src/java/org/apache/lucene/index/FrozenBufferedUpdates.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -583,7 +583,7 @@ private void setField(String field) throws IOException {
583583
DocIdSetIterator nextTerm(String field, BytesRef term) throws IOException {
584584
setField(field);
585585
if (termsEnum != null) {
586-
if (sortedTerms) {
586+
if (sortedTerms && !termsEnum.preferSeekExact()) {
587587
assert assertSorted(term);
588588
// in the sorted case we can take advantage of the "seeking forward" property
589589
// this allows us depending on the term dict impl to reuse data-structures internally

lucene/core/src/java/org/apache/lucene/index/TermsEnum.java

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,17 @@ public final PostingsEnum postings(PostingsEnum reuse) throws IOException {
188188
*/
189189
public abstract TermState termState() throws IOException;
190190

191+
/**
192+
* Expert: prefer using seekExact() to seekCeil().
193+
*
194+
* <p>This is consumed within Lucene when performing key lookups for update and delete operations.
195+
* Override this for implementations that use an approximate membership filter to ensure that the
196+
* membership filter is consulted.
197+
*/
198+
public boolean preferSeekExact() {
199+
return false;
200+
}
201+
191202
/**
192203
* An empty TermsEnum for quickly returning an empty instance e.g. in {@link
193204
* org.apache.lucene.search.MultiTermQuery}
@@ -196,11 +207,10 @@ public final PostingsEnum postings(PostingsEnum reuse) throws IOException {
196207
* Attributes to it. This should not be a problem, as the enum is always empty and the existence
197208
* of unused Attributes does not matter.
198209
*/
199-
// Avoid refactoring that results in a dependency on a subclass, like BaseTermsEnum.
200-
// See: https://github.com/apache/lucene/issues/15317
210+
// Avoid refactoring that results in a dependency on a subclass, like
211+
// BaseTermsEnum. See: https://github.com/apache/lucene/issues/15317
201212
public static final TermsEnum EMPTY =
202213
new TermsEnum() {
203-
204214
private AttributeSource atts;
205215

206216
@Override

lucene/core/src/test/org/apache/lucene/index/TestFrozenBufferedUpdates.java

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,36 @@
3939
import org.apache.lucene.util.FixedBitSet;
4040

4141
public class TestFrozenBufferedUpdates extends LuceneTestCase {
42+
private static class PreferSeekExactLeafReader extends FilterLeafReader {
43+
public PreferSeekExactLeafReader(LeafReader in) {
44+
super(in);
45+
}
46+
47+
@Override
48+
public Terms terms(String field) throws IOException {
49+
return new FilterTerms(in.terms(field)) {
50+
@Override
51+
public TermsEnum iterator() throws IOException {
52+
return new FilterTermsEnum(in.iterator()) {
53+
@Override
54+
public boolean preferSeekExact() {
55+
return true;
56+
}
57+
};
58+
}
59+
};
60+
}
61+
62+
@Override
63+
public IndexReader.CacheHelper getCoreCacheHelper() {
64+
return this.in.getCoreCacheHelper();
65+
}
66+
67+
@Override
68+
public IndexReader.CacheHelper getReaderCacheHelper() {
69+
return this.in.getReaderCacheHelper();
70+
}
71+
}
4272

4373
public void testTermDocsIterator() throws IOException {
4474
for (int j = 0; j < 5; j++) {
@@ -76,10 +106,15 @@ public void testTermDocsIterator() throws IOException {
76106
writer.commit();
77107
try (DirectoryReader reader = DirectoryReader.open(dir)) {
78108
boolean sorted = random().nextBoolean();
109+
boolean preferSeekExact = random().nextBoolean();
79110
BytesRefIterator values =
80111
sorted ? array.iterator(Comparator.naturalOrder()) : array.iterator();
81112
assertEquals(1, reader.leaves().size());
82-
TermDocsIterator iterator = new TermDocsIterator(reader.leaves().get(0).reader(), sorted);
113+
LeafReader leafReader = reader.leaves().get(0).reader();
114+
if (preferSeekExact) {
115+
leafReader = new PreferSeekExactLeafReader(leafReader);
116+
}
117+
TermDocsIterator iterator = new TermDocsIterator(leafReader, sorted);
83118
FixedBitSet bitSet = new FixedBitSet(reader.maxDoc());
84119
BytesRef ref;
85120
while ((ref = values.next()) != null) {

0 commit comments

Comments
 (0)