more tests; javadoc; tidy up

romseygeek · romseygeek · commit 3e402323a3d2 · 2022-12-02T14:26:40.000Z
diff --git a/server/src/internalClusterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/server/src/internalClusterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java
@@ -64,6 +64,7 @@
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Objects;
 
 import static java.util.Collections.singletonList;
 import static java.util.Collections.singletonMap;
@@ -106,7 +107,7 @@
 
 public class HighlighterSearchIT extends ESIntegTestCase {
     // TODO as we move analyzers out of the core we need to move some of these into HighlighterWithAnalyzersTests
-    private static final String[] ALL_TYPES = new String[] { "plain", "fvh", "unified" };
+    private static final String[] ALL_TYPES = new String[] { "plain", "fvh", "unified", "matches" };
 
     @Override
     protected Collection<Class<? extends Plugin>> nodePlugins() {
@@ -3508,6 +3509,9 @@ public void testWithNestedQuery() throws Exception {
         // but we highlight the root text field since nested documents cannot be highlighted with postings nor term vectors
         // directly.
         for (String type : ALL_TYPES) {
+            if (Objects.equals("matches", type)) {
+                continue; // matches highlighter doesn't support nested fields
+            }
             SearchResponse searchResponse = client().prepareSearch()
                 .setQuery(nestedQuery("foo", prefixQuery("foo.text", "bro"), ScoreMode.None))
                 .highlighter(new HighlightBuilder().field(new Field("text").highlighterType(type).requireFieldMatch(false)))
diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightPhase.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightPhase.java
@@ -160,10 +160,7 @@ private FieldContext contextBuilders(
                     )
                 );
             }
-            // TODO in future we can load the storedFields in advance here and make use of them,
-            // but for now they are loaded separately in HighlightUtils so we only return whether
-            // or not we need source.
-            storedFieldsSpec = storedFieldsSpec.merge(new StoredFieldsSpec(sourceRequired, false, Set.of()));
+            storedFieldsSpec = storedFieldsSpec.merge(new StoredFieldsSpec(sourceRequired, false, storedFields));
         }
         return new FieldContext(storedFieldsSpec, builders);
     }
diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java
@@ -43,11 +43,8 @@ public static List<Object> loadFieldValues(
         FetchSubPhase.HitContext hitContext,
         boolean forceSource
     ) throws IOException {
-        if (forceSource == false && fieldType.isStored()) {
-            CustomFieldsVisitor fieldVisitor = new CustomFieldsVisitor(singleton(fieldType.name()), false);
-            hitContext.reader().document(hitContext.docId(), fieldVisitor);
-            List<Object> textsToHighlight = fieldVisitor.fields().get(fieldType.name());
-            return Objects.requireNonNullElse(textsToHighlight, Collections.emptyList());
+        if (forceSource == false && hitContext.loadedFields().containsKey(fieldType.name())) {
+            return hitContext.loadedFields().get(fieldType.name());
         }
         ValueFetcher fetcher = fieldType.valueFetcher(searchContext, null);
         fetcher.setNextReader(hitContext.readerContext());
diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesFieldHighlighter.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesFieldHighlighter.java
@@ -9,61 +9,68 @@
 package org.elasticsearch.search.fetch.subphase.highlight;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.AnalyzerWrapper;
 import org.apache.lucene.search.FilterMatchesIterator;
 import org.apache.lucene.search.Matches;
 import org.apache.lucene.search.MatchesIterator;
-import org.apache.lucene.search.highlight.OffsetLimitTokenFilter;
+import org.apache.lucene.search.MatchesUtils;
+import org.apache.lucene.search.matchhighlight.MatchRegionRetriever;
 import org.apache.lucene.search.matchhighlight.OffsetRange;
 import org.apache.lucene.search.matchhighlight.OffsetsFromTokens;
 import org.apache.lucene.search.matchhighlight.OffsetsRetrievalStrategy;
 import org.apache.lucene.search.matchhighlight.Passage;
 import org.apache.lucene.search.matchhighlight.PassageFormatter;
-import org.apache.lucene.search.matchhighlight.PassageSelector;
-import org.apache.lucene.search.uhighlight.PassageScorer;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.index.mapper.TextSearchInfo;
 
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Comparator;
 import java.util.List;
+import java.util.Set;
 
-public class MatchesFieldHighlighter {
+/**
+ * Highlights individual fields using components from lucene's match highlighter
+ */
+class MatchesFieldHighlighter {
 
     private final FieldHighlightContext context;
     private final Matches matches;
     private final Analyzer analyzer;
     private final String field;
 
-    public MatchesFieldHighlighter(FieldHighlightContext context, MatchesHighlighterState state) throws IOException {
+    MatchesFieldHighlighter(FieldHighlightContext context, MatchesHighlighterState state) throws IOException {
         this.context = context;
         // TODO term vectors and require_field_match=false should intercept things here
-        this.matches = state.getMatches(context.query, context.hitContext.docId());
+        this.matches = state.getMatches(context.query, context.hitContext.readerContext(), context.hitContext.docId());
         this.analyzer = context.context.getSearchExecutionContext().getIndexAnalyzer(s -> Lucene.STANDARD_ANALYZER);
         this.field = context.fieldType.name();
     }
 
-    public MatchesIterator getMatchesIterator() throws IOException {
+    /**
+     * @return a MatchesIterator for this field, based on the field highlighter configuration
+     */
+    MatchesIterator getMatchesIterator() throws IOException {
         if (this.matches == null) {
             return null;
         }
-        MatchesIterator it = this.matches.getMatches(field);
-        if (it == null || context.field.fieldOptions().maxAnalyzedOffset() == null) {
-            return it;
+
+        Set<String> matchFields = context.field.fieldOptions().matchedFields();
+        if (matchFields == null || matchFields.isEmpty()) {
+            matchFields = Set.of(field);
         }
-        int positionCutOff = context.field.fieldOptions().maxAnalyzedOffset() / 5;
-        return new FilterMatchesIterator(it) {
-            @Override
-            public boolean next() throws IOException {
-                if (it.next() == false) {
-                    return false;
-                }
-                return it.startPosition() <= positionCutOff;
+
+        List<MatchesIterator> fieldIterators = new ArrayList<>();
+        for (String field : matchFields) {
+            MatchesIterator it = this.matches.getMatches(field);
+            if (it != null) {
+                fieldIterators.add(it);
             }
-        };
+        }
+        return MatchesUtils.disjunction(fieldIterators);
     }
 
+    /**
+     * Uses a MatchesIterator to highlight a list of source inputs
+     */
     public List<String> buildHighlights(MatchesIterator it, List<CharSequence> sourceValues) throws IOException {
         String contiguousSourceText = buildContiguousSourceText(sourceValues);
         OffsetsRetrievalStrategy offsetsStrategy = getOffsetStrategy();
@@ -93,23 +100,31 @@ private OffsetsRetrievalStrategy getOffsetStrategy() {
                 field,
                 new XOffsetsFromPositions(field, analyzer)
             );
-            case DOCS_AND_FREQS_AND_POSITIONS -> new XOffsetsFromPositions(field, analyzer);
+            case DOCS_AND_FREQS_AND_POSITIONS -> limitOffsets(new XOffsetsFromPositions(field, analyzer));
             case DOCS_AND_FREQS, DOCS ->
-                // By default retrieve offsets from individual tokens
-                // retrieved by the analyzer (possibly narrowed down to
-                // only those terms that the query hinted at when passed
-                // a QueryVisitor.
-                //
-                // Alternative strategies are also possible and may make sense
-                // depending on the use case (OffsetsFromValues, for example).
                 new OffsetsFromTokens(field, analyzer);
-            case NONE -> (matchesIterator, doc) -> {
-                throw new IOException(
-                    "Field is indexed without positions and/or offsets: "
-                        + field
-                        + ", "
-                        + tsi.luceneFieldType().indexOptions());
+            // This should be unreachable because we won't get a MatchesIterator from an unindexed field
+            case NONE -> (matchesIterator, doc) -> { throw new IllegalStateException("Field [ " + field + "] is not indexed"); };
+        };
+    }
+
+    // TODO might be more sensible to push this back into OffsetsFromPositions
+    private OffsetsRetrievalStrategy limitOffsets(OffsetsRetrievalStrategy in) {
+        if (context.field.fieldOptions().maxAnalyzedOffset() == null) {
+            return in;
+        }
+        return (matchesIterator, doc) -> {
+            int positionCutOff = context.field.fieldOptions().maxAnalyzedOffset() / 5;
+            MatchesIterator wrapped = new FilterMatchesIterator(matchesIterator) {
+                @Override
+                public boolean next() throws IOException {
+                    if (matchesIterator.next() == false) {
+                        return false;
+                    }
+                    return matchesIterator.startPosition() <= positionCutOff;
+                }
             };
+            return in.get(wrapped, doc);
         };
     }
 
diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighter.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighter.java
@@ -14,6 +14,9 @@
 import java.io.IOException;
 import java.util.List;
 
+/**
+ * A highlighter that uses the output of a query's Matches to highlight tokens
+ */
 public class MatchesHighlighter implements Highlighter {
 
     private static final String MATCHES_HIGHLIGHTER_CONFIG_KEY = "matches_highlighter_config_key";
@@ -28,7 +31,7 @@ public HighlightField highlight(FieldHighlightContext fieldContext) throws IOExc
 
         MatchesHighlighterState state = (MatchesHighlighterState) fieldContext.cache.computeIfAbsent(
             MATCHES_HIGHLIGHTER_CONFIG_KEY,
-            k -> new MatchesHighlighterState(fieldContext)
+            k -> new MatchesHighlighterState(fieldContext.context.searcher().getIndexReader())
         );
 
         MatchesFieldHighlighter fieldHighlighter = new MatchesFieldHighlighter(fieldContext, state);
diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterState.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterState.java
@@ -8,6 +8,8 @@
 
 package org.elasticsearch.search.fetch.subphase.highlight;
 
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Matches;
 import org.apache.lucene.search.MatchesIterator;
@@ -22,7 +24,14 @@
 import java.util.Iterator;
 import java.util.Map;
 
-public class MatchesHighlighterState {
+/**
+ * Shared state for the matches highlighter
+ *
+ * This holds two caches, one for the query's Weight which is global across all documents,
+ * and one for Matches for each query, which will be cached per document.  This avoids having
+ * to regenerate the weight and matches for each field being highlighted.
+ */
+class MatchesHighlighterState {
 
     private static final Matches NO_MATCHES = new Matches() {
         @Override
@@ -41,22 +50,23 @@ public Iterator<String> iterator() {
         }
     };
 
-    private final FieldHighlightContext context;
     private final IndexSearcher searcher;
     private final Map<Query, Weight> weightCache = new HashMap<>();
     private final Map<Query, Matches> matchesCache = new HashMap<>();
 
     private int currentDoc = -1;
+    private int currentLeafOrd = -1;
 
-    public MatchesHighlighterState(FieldHighlightContext context) {
-        this.context = context;
-        this.searcher = context.context.searcher();
+    MatchesHighlighterState(IndexReader reader) {
+        this.searcher = new IndexSearcher(reader);
+        this.searcher.setQueryCache(null);  // disable caching
     }
 
-    public Matches getMatches(Query query, int doc) throws IOException {
-        if (currentDoc != doc) {
+    Matches getMatches(Query query, LeafReaderContext ctx, int doc) throws IOException {
+        if (currentDoc != doc || currentLeafOrd != ctx.ord) {
             matchesCache.clear();
             currentDoc = doc;
+            currentLeafOrd = ctx.ord;
         }
         Weight w = weightCache.get(query);
         if (w == null) {
@@ -65,7 +75,7 @@ public Matches getMatches(Query query, int doc) throws IOException {
         }
         Matches m = matchesCache.get(query);
         if (m == null) {
-            m = w.matches(context.hitContext.readerContext(), doc);
+            m = w.matches(ctx, doc);
             if (m == null) {
                 m = NO_MATCHES;
             }
@@ -76,8 +86,4 @@ public Matches getMatches(Query query, int doc) throws IOException {
         }
         return m;
     }
-
-    public MatchesFieldHighlighter getMatchesFieldHighlighter(FieldHighlightContext fieldContext) throws IOException {
-        return new MatchesFieldHighlighter(fieldContext, this);
-    }
 }
diff --git a/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterTests.java b/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterTests.java
@@ -38,6 +38,30 @@ public void testSimpleTermHighlighting() throws IOException {
         assertHighlights(highlights, "field", "this is <em>some</em> text");
     }
 
+    public void testMultipleFieldHighlighting() throws IOException {
+        MapperService mapperService = createMapperService("""
+            { "_doc" : { "properties" : {
+                "title" : { "type" : "text" },
+                "description" : { "type" : "text" },
+                "category" : { "type" : "keyword" }
+            }}}
+            """);
+
+        ParsedDocument doc = mapperService.documentMapper().parse(source("""
+            { "title" : "A tale of two cities",
+              "description" : "It's a story about two cities",
+              "category" : [ "fiction", "dickens" ] }
+            """));
+
+        SearchSourceBuilder search = new SearchSourceBuilder().query(
+            QueryBuilders.queryStringQuery("dickens OR cities").field("title").field("description").field("category"))
+            .highlighter(new HighlightBuilder().highlighterType("matches").field("title").field("category"));
+
+        Map<String, HighlightField> highlights = highlight(mapperService, doc, search);
+        assertHighlights(highlights, "title", "A tale of two <em>cities</em>");
+        assertHighlights(highlights, "category", "<em>dickens</em>");
+    }
+
     public void testScoring() throws Exception {
 
         MapperService mapperService = createMapperService("""
@@ -105,7 +129,38 @@ public void testAnalyzedOffsetLimit() throws IOException {
         );
     }
 
-    // multiple fields
-    // analyzed offset limit
     // matched_fields - use matches from a set of different fields to highlight this one
+    public void testMatchedFields() throws IOException {
+
+        // note that this doesn't actually use a different analyzer for the subfield,
+        // given restrictions on analyzers in unit tests
+        MapperService mapperService = createMapperService("""
+            { "_doc" : { "properties" : {
+                "description" : {
+                  "type" : "text",
+                  "fields" : {
+                    "stemmed" : { "type" : "text" }
+                  }
+                }
+            }}}
+            """);
+
+        ParsedDocument doc = mapperService.documentMapper().parse(source("""
+            { "description" : "Here is some text" }
+            """));
+
+        HighlightBuilder highlight = new HighlightBuilder()
+            .field(new HighlightBuilder.Field("description").matchedFields("description", "description.stemmed"))
+            .highlighterType("matches");
+        SearchSourceBuilder search = new SearchSourceBuilder().query(QueryBuilders.termQuery("description.stemmed", "some"))
+            .highlighter(highlight);
+
+        Map<String, HighlightField> highlights = highlight(mapperService, doc, search);
+        assertHighlights(
+            highlights,
+            "description",
+            "Here is <em>some</em> text"
+        );
+
+    }
 }

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -160,10 +160,7 @@ private FieldContext contextBuilders(` |
| `160` | `160` | `)` |
| `161` | `161` | `);` |
| `162` | `162` | `}` |
| `163` |  | `- // TODO in future we can load the storedFields in advance here and make use of them,` |
| `164` |  | `- // but for now they are loaded separately in HighlightUtils so we only return whether` |
| `165` |  | `- // or not we need source.` |
| `166` |  | `- storedFieldsSpec = storedFieldsSpec.merge(new StoredFieldsSpec(sourceRequired, false, Set.of()));` |
|  | `163` | `+ storedFieldsSpec = storedFieldsSpec.merge(new StoredFieldsSpec(sourceRequired, false, storedFields));` |
| `167` | `164` | `}` |
| `168` | `165` | `return new FieldContext(storedFieldsSpec, builders);` |
| `169` | `166` | `}` |