Skip to content

Commit 51e6fbb

Browse files
Optimize FieldExistsQuery to leverage index statistic in DocValuesSkipper (#14830)
1 parent 2b47cd3 commit 51e6fbb

File tree

3 files changed

+66
-9
lines changed

3 files changed

+66
-9
lines changed

lucene/CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,8 @@ Optimizations
177177
* GITHUB#14836: Sometimes use FixedBitSet when doing HNSW searches. This slightly improves HNSW
178178
search performance on smaller graphs. (Ben Trent)
179179

180+
* GITHUB#14830: Optimize FieldExistsQuery to leverage index statistic in DocValuesSkipper. (Pan Guixin)
181+
180182
Bug Fixes
181183
---------------------
182184
* GITHUB#14654: ValueSource.fromDoubleValuesSource(dvs).getSortField() would throw errors when

lucene/core/src/java/org/apache/lucene/search/FieldExistsQuery.java

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
import org.apache.lucene.document.KnnFloatVectorField;
2222
import org.apache.lucene.index.ByteVectorValues;
2323
import org.apache.lucene.index.DocValues;
24+
import org.apache.lucene.index.DocValuesSkipIndexType;
25+
import org.apache.lucene.index.DocValuesSkipper;
2426
import org.apache.lucene.index.DocValuesType;
2527
import org.apache.lucene.index.FieldInfo;
2628
import org.apache.lucene.index.FieldInfos;
@@ -138,17 +140,14 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException {
138140
!= DocValuesType.NONE) { // the field indexes doc values or points
139141

140142
// This optimization is possible due to LUCENE-9334 enforcing a field to always use the
141-
// same data structures (all or nothing). Since there's no index statistic to detect when
142-
// all documents have doc values for a specific field, FieldExistsQuery can only be
143-
// rewritten to MatchAllDocsQuery for doc values field, when that same field also indexes
144-
// terms or point values which do have index statistics, and those statistics confirm that
145-
// all documents in this segment have values terms or point values.
146-
147-
Terms terms = leaf.terms(field);
148-
PointValues pointValues = leaf.getPointValues(field);
143+
// same data structures (all or nothing).
144+
final Terms terms = leaf.terms(field);
145+
final PointValues pointValues = leaf.getPointValues(field);
146+
final DocValuesSkipper docValuesSkipper = leaf.getDocValuesSkipper(field);
149147

150148
if ((terms == null || terms.getDocCount() != leaf.maxDoc())
151-
&& (pointValues == null || pointValues.getDocCount() != leaf.maxDoc())) {
149+
&& (pointValues == null || pointValues.getDocCount() != leaf.maxDoc())
150+
&& (docValuesSkipper == null || docValuesSkipper.docCount() != leaf.maxDoc())) {
152151
allReadersRewritable = false;
153152
break;
154153
}
@@ -248,6 +247,9 @@ public int count(LeafReaderContext context) throws IOException {
248247
} else if (fieldInfo.getIndexOptions() != IndexOptions.NONE) {
249248
Terms terms = reader.terms(field);
250249
return terms == null ? 0 : terms.getDocCount();
250+
} else if (fieldInfo.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) {
251+
DocValuesSkipper docValuesSkipper = reader.getDocValuesSkipper(field);
252+
return docValuesSkipper == null ? 0 : docValuesSkipper.docCount();
251253
}
252254
}
253255

lucene/core/src/test/org/apache/lucene/search/TestFieldExistsQuery.java

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,24 @@ public void testDocValuesRewriteWithPointValuesPresent() throws IOException {
8888
dir.close();
8989
}
9090

91+
// Checks that FieldExistsQuery on "dim" rewrites to MatchAllDocsQuery when every indexed
// document carries the field, and the field is added only via
// NumericDocValuesField.indexedField (no terms or points are indexed here).
// NOTE(review): indexedField presumably enables the doc-values skip index that exposes the
// per-field doc count — confirm against NumericDocValuesField.
public void testDocValuesRewriteWithDocValuesSkipperPresent() throws IOException {
92+
Directory dir = newDirectory();
93+
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
94+
final int numDocs = atLeast(100);
95+
// Every document gets the same "dim" value, so no document is missing the field.
for (int i = 0; i < numDocs; ++i) {
96+
Document doc = new Document();
97+
doc.add(NumericDocValuesField.indexedField("dim", 2));
98+
iw.addDocument(doc);
99+
}
100+
iw.commit();
101+
final IndexReader reader = iw.getReader();
102+
iw.close();
103+
104+
// The rewritten query must be equal to a plain MatchAllDocsQuery.
assertEquals(new MatchAllDocsQuery(), new FieldExistsQuery("dim").rewrite(newSearcher(reader)));
105+
reader.close();
106+
dir.close();
107+
}
108+
91109
public void testDocValuesNoRewrite() throws IOException {
92110
Directory dir = newDirectory();
93111
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
@@ -741,6 +759,41 @@ private float[] randomVector(int dim) {
741759
return v;
742760
}
743761

762+
// Checks that searcher.count(new FieldExistsQuery("num")) equals the number of documents
// that were indexed with a "num" value and were not subsequently deleted. Whether all
// documents get a value, and whether any deletions happen at all, is chosen at random.
public void testDeleteDocValues() throws IOException {
763+
try (Directory dir = newDirectory();
764+
RandomIndexWriter iw = new RandomIndexWriter(random(), dir)) {
765+
final int numDocs = atLeast(100);
766+
767+
boolean allDocsHaveValue = random().nextBoolean();
768+
// Expected result: bit i is set while doc i has a "num" value and has not been deleted.
BitSet docWithValue = new FixedBitSet(numDocs);
769+
for (int i = 0; i < numDocs; ++i) {
770+
Document doc = new Document();
771+
if (allDocsHaveValue || random().nextBoolean()) {
772+
doc.add(NumericDocValuesField.indexedField("num", i));
773+
docWithValue.set(i);
774+
}
775+
// Every document gets an "id" term so it can be targeted by deleteDocuments below.
doc.add(new StringField("id", Integer.toString(i), Store.NO));
776+
iw.addDocument(doc);
777+
}
778+
779+
if (random().nextBoolean()) {
780+
final int numDeleted = random().nextInt(numDocs) + 1;
781+
// Ids are drawn independently, so the same id may be picked more than once; clearing
// an already-cleared bit leaves the expected set unchanged.
for (int i = 0; i < numDeleted; ++i) {
782+
int id = random().nextInt(numDocs);
783+
iw.deleteDocuments(new Term("id", Integer.toString(id)));
784+
docWithValue.clear(id);
785+
}
786+
}
787+
788+
try (IndexReader reader = iw.getReader()) {
789+
final IndexSearcher searcher = newSearcher(reader);
790+
791+
final int count = searcher.count(new FieldExistsQuery("num"));
792+
// The count must match the surviving documents that carry the field.
assertEquals(docWithValue.cardinality(), count);
793+
}
794+
}
795+
}
796+
744797
public void testDeleteAllPointDocs() throws Exception {
745798
try (Directory dir = newDirectory();
746799
RandomIndexWriter iw = new RandomIndexWriter(random(), dir)) {

0 commit comments

Comments
 (0)