dungba88
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
Lines changed: 6 additions & 0 deletions
diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md
Lines changed: 5 additions & 0 deletions
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/FieldValueHighlighters.java b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/FieldValueHighlighters.java
Lines changed: 5 additions & 5 deletions
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchHighlighter.java
Lines changed: 43 additions & 57 deletions
@@ -73,6 +73,12 @@ API Changes
 * GITHUB#12243: Remove TermInSetQuery ctors taking varargs param. SortedSetDocValuesField#newSlowSetQuery,
   SortedDocValuesField#newSlowSetQuery, KeywordField#newSetQuery, KeywordField#newSetQuery now take a collection. (Jakub Slowinski)
 
+* GITHUB#12881: Performance improvements to MatchHighlighter and MatchRegionRetriever. MatchRegionRetriever can be
+  configured to not load matches (or content) of certain fields and to force-load other fields so that stored fields
+  of a document are accessed once. A configurable limit of field matches placed in the priority queue was added
+  (allows handling long fields with lots of hits more gracefully). MatchRegionRetriever utilizes IndexSearcher's
+  executor to extract hit offsets concurrently. (Dawid Weiss)
+
 New Features
 ---------------------
 
 
@@ -19,6 +19,11 @@
 
 ## Migration from Lucene 9.x to Lucene 10.0
 
+### Minor API changes in MatchHighlighter and MatchRegionRetriever. (GITHUB#12881)
+
+The API of interfaces for accepting highlights has changed to allow performance improvements (for example, `FieldValueHighlighter#format`
+now takes field values as a `List<String>` instead of a `String[]`). See the issue and the PR diff for the full, minor set of changes.
+
 ### Removed deprecated IndexSearcher.doc, IndexReader.document, IndexReader.getTermVectors (GITHUB#11998)
 
 The deprecated Stored Fields and Term Vectors apis relied upon threadlocal storage and have been removed.
 
@@ -61,7 +61,7 @@ public static MatchHighlighter.FieldValueHighlighter maxLeadingCharacters(
       @Override
       public List<String> format(
           String field,
-          String[] values,
+          List<String> values,
           String contiguousValue,
           List<OffsetRange> valueRanges,
           List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
@@ -99,7 +99,7 @@ public static MatchHighlighter.FieldValueHighlighter highlighted(
       @Override
       public List<String> format(
           String field,
-          String[] values,
+          List<String> values,
           String contiguousValue,
           List<OffsetRange> valueRanges,
           List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
@@ -128,11 +128,11 @@ public Collection<String> alwaysFetchedFields() {
       @Override
       public List<String> format(
           String field,
-          String[] values,
+          List<String> values,
           String contiguousValue,
           List<OffsetRange> valueRanges,
           List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
-        return Arrays.asList(values);
+        return values;
       }
     };
   }
@@ -146,7 +146,7 @@ public static MatchHighlighter.FieldValueHighlighter skipRemaining() {
       @Override
       public List<String> format(
           String field,
-          String[] values,
+          List<String> values,
           String contiguousValue,
           List<OffsetRange> valueRanges,
           List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
 
@@ -17,7 +17,6 @@
 package org.apache.lucene.search.matchhighlight;
 
 import java.io.IOException;
-import java.io.UncheckedIOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
@@ -29,23 +28,19 @@
 import java.util.function.Predicate;
 import java.util.stream.Stream;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.DocumentStoredFieldVisitor;
-import org.apache.lucene.index.FieldInfo;
-import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopDocs;
 
 /**
- * An example highlighter that combines several lower-level highlighting utilities in this package
- * into a fully featured, ready-to-use component.
+ * An example highlighter that combines several lower-level utility classes in this package into a
+ * fully featured, ready-to-use component.
  *
- * <p>Note that if you need to customize or tweak the details of highlighting, it is better to
- * assemble your own highlighter using those low-level building blocks, rather than extend or modify
- * this one.
+ * <p>Note: if you need to customize or tweak the details of highlighting, it is better to assemble
+ * your own highlighter using those low-level building blocks, rather than extend or modify this
+ * one.
  */
 public class MatchHighlighter {
   private final IndexSearcher searcher;
@@ -71,16 +66,16 @@ public interface FieldValueHighlighter {
      */
     boolean isApplicable(String field, boolean hasMatches);
 
-    /** Do format field values appropriately. */
+    /** Format field values into a list of final "highlights". */
     List<String> format(
         String field,
-        String[] values,
+        List<String> values,
         String contiguousValue,
         List<OffsetRange> valueRanges,
         List<QueryOffsetRange> matchOffsets);
 
     /**
-     * @return Returns a set of fields that must be fetched for each document, regardless of whether
+     * @return Returns a set of fields that must be loaded from each document, regardless of whether
      *     they had matches or not. This is useful to load and return certain fields that should
      *     always be included (identifiers, document titles, etc.).
      */
@@ -106,7 +101,7 @@ public boolean isApplicable(String field, boolean hasMatches) {
         @Override
         public List<String> format(
             String field,
-            String[] values,
+            List<String> values,
             String contiguousValue,
             List<OffsetRange> valueRanges,
             List<QueryOffsetRange> matchOffsets) {
@@ -169,14 +164,14 @@ public QueryOffsetRange slice(int from, int to) {
 
   private static class DocHit {
     final int docId;
-    private final LeafReader leafReader;
-    private final int leafDocId;
     private final LinkedHashMap<String, List<QueryOffsetRange>> matchRanges = new LinkedHashMap<>();
+    private final LinkedHashMap<String, List<String>> fieldValues = new LinkedHashMap<>();
 
-    DocHit(int docId, LeafReader leafReader, int leafDocId) {
+    DocHit(int docId, MatchRegionRetriever.FieldValueProvider fieldValueProvider) {
       this.docId = docId;
-      this.leafReader = leafReader;
-      this.leafDocId = leafDocId;
+      for (var fieldName : fieldValueProvider) {
+        fieldValues.put(fieldName, fieldValueProvider.getValues(fieldName));
+      }
     }
 
     void addMatches(Query query, Map<String, List<OffsetRange>> hits) {
@@ -187,22 +182,6 @@ void addMatches(Query query, Map<String, List<OffsetRange>> hits) {
             offsets.forEach(o -> target.add(new QueryOffsetRange(query, o.from, o.to)));
           });
     }
-
-    Document document(Predicate<String> needsField) throws IOException {
-      // Only load the fields that have a chance to be highlighted.
-      DocumentStoredFieldVisitor visitor =
-          new DocumentStoredFieldVisitor() {
-            @Override
-            public Status needsField(FieldInfo fieldInfo) {
-              return (matchRanges.containsKey(fieldInfo.name) || needsField.test(fieldInfo.name))
-                  ? Status.YES
-                  : Status.NO;
-            }
-          };
-
-      leafReader.storedFields().document(leafDocId, visitor);
-      return visitor.getDocument();
-    }
   }
 
   public MatchHighlighter(IndexSearcher searcher, Analyzer analyzer) {
@@ -223,25 +202,44 @@ public MatchHighlighter(
 
   public Stream<DocHighlights> highlight(TopDocs topDocs, Query... queries) throws IOException {
     // We want to preserve topDocs document ordering and MatchRegionRetriever is optimized
-    // for streaming, so we'll just prepopulate the map in proper order.
+    // for streaming, so we'll just populate the map in proper order.
     LinkedHashMap<Integer, DocHit> docHits = new LinkedHashMap<>();
     for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
       docHits.put(scoreDoc.doc, null);
     }
 
+    Predicate<String> fieldsToLoadUnconditionally = fieldsAlwaysReturned::contains;
+    Predicate<String> fieldsToLoadIfWithHits =
+        fieldName -> {
+          // We're interested in any fields for which existing highlighters are applicable (with or
+          // without hits).
+          return fieldHighlighters.stream()
+              .anyMatch(
+                  highlighter ->
+                      highlighter.isApplicable(fieldName, true)
+                          || highlighter.isApplicable(fieldName, false));
+        };
+
     // Collect match ranges for each query and associate each range to the origin query.
     for (Query q : queries) {
       MatchRegionRetriever highlighter =
-          new MatchRegionRetriever(searcher, searcher.rewrite(q), offsetsRetrievalStrategies);
+          new MatchRegionRetriever(
+              searcher,
+              searcher.rewrite(q),
+              offsetsRetrievalStrategies,
+              fieldsToLoadUnconditionally,
+              fieldsToLoadIfWithHits);
+
       highlighter.highlightDocuments(
           topDocs,
           (int docId,
               LeafReader leafReader,
               int leafDocId,
+              MatchRegionRetriever.FieldValueProvider fieldValueProvider,
               Map<String, List<OffsetRange>> hits) -> {
             DocHit docHit = docHits.get(docId);
             if (docHit == null) {
-              docHit = new DocHit(docId, leafReader, leafDocId);
+              docHit = new DocHit(docId, fieldValueProvider);
               docHits.put(docId, docHit);
             }
             docHit.addMatches(q, hits);
@@ -254,23 +252,11 @@ public Stream<DocHighlights> highlight(TopDocs topDocs, Query... queries) throws
   }
 
   private DocHighlights computeDocFieldValues(DocHit docHit) {
-    Document doc;
-    try {
-      doc = docHit.document(fieldsAlwaysReturned::contains);
-    } catch (IOException e) {
-      throw new UncheckedIOException(e);
-    }
-
     DocHighlights docHighlights = new DocHighlights(docHit.docId);
 
-    HashSet<String> unique = new HashSet<>();
-    for (IndexableField indexableField : doc) {
-      String field = indexableField.name();
-      if (!unique.add(field)) {
-        continue;
-      }
-
-      String[] values = doc.getValues(field);
+    for (var e : docHit.fieldValues.entrySet()) {
+      String field = e.getKey();
+      List<String> values = e.getValue();
       String contiguousValue = contiguousFieldValue(field, values);
       List<OffsetRange> valueRanges = computeValueRanges(field, values);
       List<QueryOffsetRange> offsets = docHit.matchRanges.get(field);
@@ -287,7 +273,7 @@ private DocHighlights computeDocFieldValues(DocHit docHit) {
     return docHighlights;
   }
 
-  private List<OffsetRange> computeValueRanges(String field, String[] values) {
+  private List<OffsetRange> computeValueRanges(String field, List<String> values) {
     ArrayList<OffsetRange> valueRanges = new ArrayList<>();
     int offset = 0;
     for (CharSequence v : values) {
@@ -298,10 +284,10 @@ private List<OffsetRange> computeValueRanges(String field, String[] values) {
     return valueRanges;
   }
 
-  private String contiguousFieldValue(String field, String[] values) {
+  private String contiguousFieldValue(String field, List<String> values) {
     String value;
-    if (values.length == 1) {
-      value = values[0];
+    if (values.size() == 1) {
+      value = values.get(0);
     } else {
       // TODO: This can be inefficient if offset gap is large but the logic
       // of applying offsets would get much more complicated so leaving for now