Skip to content

Commit 755de5a

Browse files
authored
Performance improvements to MatchHighlighter and MatchRegionRetriever (apache#12881)
1 parent b699981 commit 755de5a

File tree

10 files changed

+434
-162
lines changed

10 files changed

+434
-162
lines changed

lucene/CHANGES.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,12 @@ API Changes
7373
* GITHUB#12243: Remove TermInSetQuery ctors taking varargs param. SortedSetDocValuesField#newSlowSetQuery,
7474
SortedDocValuesField#newSlowSetQuery, KeywordField#newSetQuery, KeywordField#newSetQuery now take a collection. (Jakub Slowinski)
7575

76+
* GITHUB#12881: Performance improvements to MatchHighlighter and MatchRegionRetriever. MatchRegionRetriever can be
77+
configured to not load matches (or content) of certain fields and to force-load other fields so that stored fields
78+
of a document are accessed once. A configurable limit of field matches placed in the priority queue was added
79+
(allows handling long fields with lots of hits more gracefully). MatchRegionRetriever utilizes IndexSearcher's
80+
executor to extract hit offsets concurrently. (Dawid Weiss)
81+
7682
New Features
7783
---------------------
7884

lucene/MIGRATE.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@
1919

2020
## Migration from Lucene 9.x to Lucene 10.0
2121

22+
### Minor API changes in MatchHighlighter and MatchRegionRetriever. (GITHUB#12881)
23+
24+
The API of interfaces for accepting highlights has changed to allow performance improvements. Look at the issue and the PR diff to get
25+
a sense of what's changed (changes are minor).
26+
2227
### Removed deprecated IndexSearcher.doc, IndexReader.document, IndexReader.getTermVectors (GITHUB#11998)
2328

2429
The deprecated Stored Fields and Term Vectors apis relied upon threadlocal storage and have been removed.

lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/FieldValueHighlighters.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ public static MatchHighlighter.FieldValueHighlighter maxLeadingCharacters(
6161
@Override
6262
public List<String> format(
6363
String field,
64-
String[] values,
64+
List<String> values,
6565
String contiguousValue,
6666
List<OffsetRange> valueRanges,
6767
List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
@@ -99,7 +99,7 @@ public static MatchHighlighter.FieldValueHighlighter highlighted(
9999
@Override
100100
public List<String> format(
101101
String field,
102-
String[] values,
102+
List<String> values,
103103
String contiguousValue,
104104
List<OffsetRange> valueRanges,
105105
List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
@@ -128,11 +128,11 @@ public Collection<String> alwaysFetchedFields() {
128128
@Override
129129
public List<String> format(
130130
String field,
131-
String[] values,
131+
List<String> values,
132132
String contiguousValue,
133133
List<OffsetRange> valueRanges,
134134
List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
135-
return Arrays.asList(values);
135+
return values;
136136
}
137137
};
138138
}
@@ -146,7 +146,7 @@ public static MatchHighlighter.FieldValueHighlighter skipRemaining() {
146146
@Override
147147
public List<String> format(
148148
String field,
149-
String[] values,
149+
List<String> values,
150150
String contiguousValue,
151151
List<OffsetRange> valueRanges,
152152
List<MatchHighlighter.QueryOffsetRange> matchOffsets) {

lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchHighlighter.java

Lines changed: 43 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
package org.apache.lucene.search.matchhighlight;
1818

1919
import java.io.IOException;
20-
import java.io.UncheckedIOException;
2120
import java.util.ArrayList;
2221
import java.util.Collection;
2322
import java.util.Collections;
@@ -29,23 +28,19 @@
2928
import java.util.function.Predicate;
3029
import java.util.stream.Stream;
3130
import org.apache.lucene.analysis.Analyzer;
32-
import org.apache.lucene.document.Document;
33-
import org.apache.lucene.document.DocumentStoredFieldVisitor;
34-
import org.apache.lucene.index.FieldInfo;
35-
import org.apache.lucene.index.IndexableField;
3631
import org.apache.lucene.index.LeafReader;
3732
import org.apache.lucene.search.IndexSearcher;
3833
import org.apache.lucene.search.Query;
3934
import org.apache.lucene.search.ScoreDoc;
4035
import org.apache.lucene.search.TopDocs;
4136

4237
/**
43-
* An example highlighter that combines several lower-level highlighting utilities in this package
44-
* into a fully featured, ready-to-use component.
38+
* An example highlighter that combines several lower-level utility classes in this package into a
39+
* fully featured, ready-to-use component.
4540
*
46-
* <p>Note that if you need to customize or tweak the details of highlighting, it is better to
47-
* assemble your own highlighter using those low-level building blocks, rather than extend or modify
48-
* this one.
41+
* <p>Note: if you need to customize or tweak the details of highlighting, it is better to assemble
42+
* your own highlighter using those low-level building blocks, rather than extend or modify this
43+
* one.
4944
*/
5045
public class MatchHighlighter {
5146
private final IndexSearcher searcher;
@@ -71,16 +66,16 @@ public interface FieldValueHighlighter {
7166
*/
7267
boolean isApplicable(String field, boolean hasMatches);
7368

74-
/** Do format field values appropriately. */
69+
/** Format field values into a list of final "highlights". */
7570
List<String> format(
7671
String field,
77-
String[] values,
72+
List<String> values,
7873
String contiguousValue,
7974
List<OffsetRange> valueRanges,
8075
List<QueryOffsetRange> matchOffsets);
8176

8277
/**
83-
* @return Returns a set of fields that must be fetched for each document, regardless of whether
78+
* @return Returns a set of fields that must be loaded from each document, regardless of whether
8479
* they had matches or not. This is useful to load and return certain fields that should
8580
* always be included (identifiers, document titles, etc.).
8681
*/
@@ -106,7 +101,7 @@ public boolean isApplicable(String field, boolean hasMatches) {
106101
@Override
107102
public List<String> format(
108103
String field,
109-
String[] values,
104+
List<String> values,
110105
String contiguousValue,
111106
List<OffsetRange> valueRanges,
112107
List<QueryOffsetRange> matchOffsets) {
@@ -169,14 +164,14 @@ public QueryOffsetRange slice(int from, int to) {
169164

170165
private static class DocHit {
171166
final int docId;
172-
private final LeafReader leafReader;
173-
private final int leafDocId;
174167
private final LinkedHashMap<String, List<QueryOffsetRange>> matchRanges = new LinkedHashMap<>();
168+
private final LinkedHashMap<String, List<String>> fieldValues = new LinkedHashMap<>();
175169

176-
DocHit(int docId, LeafReader leafReader, int leafDocId) {
170+
DocHit(int docId, MatchRegionRetriever.FieldValueProvider fieldValueProvider) {
177171
this.docId = docId;
178-
this.leafReader = leafReader;
179-
this.leafDocId = leafDocId;
172+
for (var fieldName : fieldValueProvider) {
173+
fieldValues.put(fieldName, fieldValueProvider.getValues(fieldName));
174+
}
180175
}
181176

182177
void addMatches(Query query, Map<String, List<OffsetRange>> hits) {
@@ -187,22 +182,6 @@ void addMatches(Query query, Map<String, List<OffsetRange>> hits) {
187182
offsets.forEach(o -> target.add(new QueryOffsetRange(query, o.from, o.to)));
188183
});
189184
}
190-
191-
Document document(Predicate<String> needsField) throws IOException {
192-
// Only load the fields that have a chance to be highlighted.
193-
DocumentStoredFieldVisitor visitor =
194-
new DocumentStoredFieldVisitor() {
195-
@Override
196-
public Status needsField(FieldInfo fieldInfo) {
197-
return (matchRanges.containsKey(fieldInfo.name) || needsField.test(fieldInfo.name))
198-
? Status.YES
199-
: Status.NO;
200-
}
201-
};
202-
203-
leafReader.storedFields().document(leafDocId, visitor);
204-
return visitor.getDocument();
205-
}
206185
}
207186

208187
public MatchHighlighter(IndexSearcher searcher, Analyzer analyzer) {
@@ -223,25 +202,44 @@ public MatchHighlighter(
223202

224203
public Stream<DocHighlights> highlight(TopDocs topDocs, Query... queries) throws IOException {
225204
// We want to preserve topDocs document ordering and MatchRegionRetriever is optimized
226-
// for streaming, so we'll just prepopulate the map in proper order.
205+
// for streaming, so we'll just populate the map in proper order.
227206
LinkedHashMap<Integer, DocHit> docHits = new LinkedHashMap<>();
228207
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
229208
docHits.put(scoreDoc.doc, null);
230209
}
231210

211+
Predicate<String> fieldsToLoadUnconditionally = fieldsAlwaysReturned::contains;
212+
Predicate<String> fieldsToLoadIfWithHits =
213+
fieldName -> {
214+
// We're interested in any fields for which existing highlighters are applicable (with or
215+
// without hits).
216+
return fieldHighlighters.stream()
217+
.anyMatch(
218+
highlighter ->
219+
highlighter.isApplicable(fieldName, true)
220+
|| highlighter.isApplicable(fieldName, false));
221+
};
222+
232223
// Collect match ranges for each query and associate each range to the origin query.
233224
for (Query q : queries) {
234225
MatchRegionRetriever highlighter =
235-
new MatchRegionRetriever(searcher, searcher.rewrite(q), offsetsRetrievalStrategies);
226+
new MatchRegionRetriever(
227+
searcher,
228+
searcher.rewrite(q),
229+
offsetsRetrievalStrategies,
230+
fieldsToLoadUnconditionally,
231+
fieldsToLoadIfWithHits);
232+
236233
highlighter.highlightDocuments(
237234
topDocs,
238235
(int docId,
239236
LeafReader leafReader,
240237
int leafDocId,
238+
MatchRegionRetriever.FieldValueProvider fieldValueProvider,
241239
Map<String, List<OffsetRange>> hits) -> {
242240
DocHit docHit = docHits.get(docId);
243241
if (docHit == null) {
244-
docHit = new DocHit(docId, leafReader, leafDocId);
242+
docHit = new DocHit(docId, fieldValueProvider);
245243
docHits.put(docId, docHit);
246244
}
247245
docHit.addMatches(q, hits);
@@ -254,23 +252,11 @@ public Stream<DocHighlights> highlight(TopDocs topDocs, Query... queries) throws
254252
}
255253

256254
private DocHighlights computeDocFieldValues(DocHit docHit) {
257-
Document doc;
258-
try {
259-
doc = docHit.document(fieldsAlwaysReturned::contains);
260-
} catch (IOException e) {
261-
throw new UncheckedIOException(e);
262-
}
263-
264255
DocHighlights docHighlights = new DocHighlights(docHit.docId);
265256

266-
HashSet<String> unique = new HashSet<>();
267-
for (IndexableField indexableField : doc) {
268-
String field = indexableField.name();
269-
if (!unique.add(field)) {
270-
continue;
271-
}
272-
273-
String[] values = doc.getValues(field);
257+
for (var e : docHit.fieldValues.entrySet()) {
258+
String field = e.getKey();
259+
List<String> values = e.getValue();
274260
String contiguousValue = contiguousFieldValue(field, values);
275261
List<OffsetRange> valueRanges = computeValueRanges(field, values);
276262
List<QueryOffsetRange> offsets = docHit.matchRanges.get(field);
@@ -287,7 +273,7 @@ private DocHighlights computeDocFieldValues(DocHit docHit) {
287273
return docHighlights;
288274
}
289275

290-
private List<OffsetRange> computeValueRanges(String field, String[] values) {
276+
private List<OffsetRange> computeValueRanges(String field, List<String> values) {
291277
ArrayList<OffsetRange> valueRanges = new ArrayList<>();
292278
int offset = 0;
293279
for (CharSequence v : values) {
@@ -298,10 +284,10 @@ private List<OffsetRange> computeValueRanges(String field, String[] values) {
298284
return valueRanges;
299285
}
300286

301-
private String contiguousFieldValue(String field, String[] values) {
287+
private String contiguousFieldValue(String field, List<String> values) {
302288
String value;
303-
if (values.length == 1) {
304-
value = values[0];
289+
if (values.size() == 1) {
290+
value = values.get(0);
305291
} else {
306292
// TODO: This can be inefficient if offset gap is large but the logic
307293
// of applying offsets would get much more complicated so leaving for now

0 commit comments

Comments
 (0)