diff --git a/docs/changelog/92068.yaml b/docs/changelog/92068.yaml new file mode 100644 index 0000000000000..cee8e23c6c4e2 --- /dev/null +++ b/docs/changelog/92068.yaml @@ -0,0 +1,5 @@ +pr: 92068 +summary: Add a new highlighter based on lucene's match highlighter +area: Highlighting +type: feature +issues: [] diff --git a/server/src/internalClusterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/server/src/internalClusterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java index c5fdbc5a3973e..7dbb0379f3534 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java @@ -64,6 +64,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Objects; import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; @@ -106,7 +107,7 @@ public class HighlighterSearchIT extends ESIntegTestCase { // TODO as we move analyzers out of the core we need to move some of these into HighlighterWithAnalyzersTests - private static final String[] ALL_TYPES = new String[] { "plain", "fvh", "unified" }; + private static final String[] ALL_TYPES = new String[] { "plain", "fvh", "unified", "matches" }; @Override protected Collection> nodePlugins() { @@ -3508,6 +3509,9 @@ public void testWithNestedQuery() throws Exception { // but we highlight the root text field since nested documents cannot be highlighted with postings nor term vectors // directly. for (String type : ALL_TYPES) { + if (Objects.equals("matches", type)) { + continue; // matches highlighter doesn't support nested fields + } SearchResponse searchResponse = client().prepareSearch() .setQuery(nestedQuery("foo", prefixQuery("foo.text", "bro"), ScoreMode.None)) .highlighter(new HighlightBuilder().field(new Field("text").highlighterType(type).requireFieldMatch(false))) diff --git a/server/src/main/java/org/elasticsearch/search/SearchModule.java b/server/src/main/java/org/elasticsearch/search/SearchModule.java index fa2699a30b897..95ab45e324186 100644 --- a/server/src/main/java/org/elasticsearch/search/SearchModule.java +++ b/server/src/main/java/org/elasticsearch/search/SearchModule.java @@ -219,6 +219,7 @@ import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter; import org.elasticsearch.search.fetch.subphase.highlight.HighlightPhase; import org.elasticsearch.search.fetch.subphase.highlight.Highlighter; +import org.elasticsearch.search.fetch.subphase.highlight.MatchesHighlighter; import org.elasticsearch.search.fetch.subphase.highlight.PlainHighlighter; import org.elasticsearch.search.fetch.subphase.highlight.UnifiedHighlighter; import org.elasticsearch.search.internal.ShardSearchRequest; @@ -863,6 +864,7 @@ private static Map setupHighlighters(Settings settings, Lis highlighters.register("fvh", new FastVectorHighlighter(settings)); highlighters.register("plain", new PlainHighlighter()); highlighters.register("unified", new UnifiedHighlighter()); + highlighters.register("matches", new MatchesHighlighter()); highlighters.extractAndRegister(plugins, SearchPlugin::getHighlighters); return unmodifiableMap(highlighters.getRegistry()); diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightField.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightField.java index d4b5234f4e0b2..df1be7092da9b 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightField.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightField.java @@ -54,6 +54,10 @@ public HighlightField(String name, Text[] fragments) { this.fragments = fragments; } + public HighlightField(String name, List fragments) { + this(name, fragments.stream().map(Text::new).toArray(Text[]::new)); + } + /** * The name of the field highlighted. */ diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightPhase.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightPhase.java index 4b60d32d3065f..91846c4b609ef 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightPhase.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightPhase.java @@ -40,7 +40,6 @@ public FetchSubPhaseProcessor getProcessor(FetchContext context) { if (context.highlight() == null) { return null; } - return getProcessor(context, context.highlight(), context.parsedQuery().query()); } @@ -161,10 +160,7 @@ private FieldContext contextBuilders( ) ); } - // TODO in future we can load the storedFields in advance here and make use of them, - // but for now they are loaded separately in HighlightUtils so we only return whether - // or not we need source. - storedFieldsSpec = storedFieldsSpec.merge(new StoredFieldsSpec(sourceRequired, false, Set.of())); + storedFieldsSpec = storedFieldsSpec.merge(new StoredFieldsSpec(sourceRequired, false, storedFields)); } return new FieldContext(storedFieldsSpec, builders); } diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java index 1bf127df3d403..f8166077859cc 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java @@ -10,7 +10,6 @@ import org.apache.lucene.search.highlight.DefaultEncoder; import org.apache.lucene.search.highlight.Encoder; import org.apache.lucene.search.highlight.SimpleHTMLEncoder; -import org.elasticsearch.index.fieldvisitor.CustomFieldsVisitor; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.ValueFetcher; import org.elasticsearch.index.query.SearchExecutionContext; @@ -18,11 +17,7 @@ import java.io.IOException; import java.util.ArrayList; -import java.util.Collections; import java.util.List; -import java.util.Objects; - -import static java.util.Collections.singleton; public final class HighlightUtils { @@ -43,11 +38,8 @@ public static List loadFieldValues( FetchSubPhase.HitContext hitContext, boolean forceSource ) throws IOException { - if (forceSource == false && fieldType.isStored()) { - CustomFieldsVisitor fieldVisitor = new CustomFieldsVisitor(singleton(fieldType.name()), false); - hitContext.reader().document(hitContext.docId(), fieldVisitor); - List textsToHighlight = fieldVisitor.fields().get(fieldType.name()); - return Objects.requireNonNullElse(textsToHighlight, Collections.emptyList()); + if (forceSource == false && hitContext.loadedFields().containsKey(fieldType.name())) { + return hitContext.loadedFields().get(fieldType.name()); } ValueFetcher fetcher = fieldType.valueFetcher(searchContext, null); fetcher.setNextReader(hitContext.readerContext()); diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesFieldHighlighter.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesFieldHighlighter.java new file mode 100644 index 0000000000000..c80da441de3f1 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesFieldHighlighter.java @@ -0,0 +1,153 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.search.fetch.subphase.highlight; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.search.FilterMatchesIterator; +import org.apache.lucene.search.Matches; +import org.apache.lucene.search.MatchesIterator; +import org.apache.lucene.search.MatchesUtils; +import org.apache.lucene.search.matchhighlight.OffsetRange; +import org.apache.lucene.search.matchhighlight.OffsetsFromTokens; +import org.apache.lucene.search.matchhighlight.OffsetsRetrievalStrategy; +import org.apache.lucene.search.matchhighlight.Passage; +import org.apache.lucene.search.matchhighlight.PassageFormatter; +import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.index.mapper.TextSearchInfo; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; + +/** + * Highlights individual fields using components from lucene's match highlighter + */ +class MatchesFieldHighlighter { + + private final FieldHighlightContext context; + private final Matches matches; + private final Analyzer analyzer; + private final String field; + + MatchesFieldHighlighter(FieldHighlightContext context, MatchesHighlighterState state) throws IOException { + this.context = context; + // TODO term vectors and require_field_match=false should intercept things here + this.matches = state.getMatches(context.query, context.hitContext.readerContext(), context.hitContext.docId()); + this.analyzer = context.context.getSearchExecutionContext().getIndexAnalyzer(s -> Lucene.STANDARD_ANALYZER); + this.field = context.fieldType.name(); + } + + /** + * @return a MatchesIterator for this field, based on the field highlighter configuration + */ + MatchesIterator getMatchesIterator() throws IOException { + if (this.matches == null) { + return null; + } + + Set matchFields = context.field.fieldOptions().matchedFields(); + if (matchFields == null || matchFields.isEmpty()) { + matchFields = Set.of(field); + } + + List fieldIterators = new ArrayList<>(); + for (String field : matchFields) { + MatchesIterator it = this.matches.getMatches(field); + if (it != null) { + fieldIterators.add(it); + } + } + return MatchesUtils.disjunction(fieldIterators); + } + + /** + * Uses a MatchesIterator to highlight a list of source inputs + */ + public List buildHighlights(MatchesIterator it, List sourceValues) throws IOException { + String contiguousSourceText = buildContiguousSourceText(sourceValues); + OffsetsRetrievalStrategy offsetsStrategy = getOffsetStrategy(); + List matchRanges = offsetsStrategy.get(it, f -> sourceValues); + List sourceRanges = computeValueRanges(sourceValues); + XPassageSelector passageSelector = new XPassageSelector(); // TODO word break stuff goes here + PassageFormatter formatter = new PassageFormatter( + "...", + context.field.fieldOptions().preTags()[0], + context.field.fieldOptions().postTags()[0] + ); // TODO multiple field markers a la FVH + List passages = passageSelector.pickBest( + contiguousSourceText, + matchRanges, + context.field.fieldOptions().fragmentCharSize(), + context.field.fieldOptions().numberOfFragments(), + sourceRanges + ); + return formatter.format(contiguousSourceText, passages, sourceRanges); + } + + private OffsetsRetrievalStrategy getOffsetStrategy() { + TextSearchInfo tsi = context.fieldType.getTextSearchInfo(); + // TODO termvectors + return switch (tsi.luceneFieldType().indexOptions()) { + case DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS -> new XOffsetsFromMatchIterator( + field, + new XOffsetsFromPositions(field, analyzer) + ); + case DOCS_AND_FREQS_AND_POSITIONS -> limitOffsets(new XOffsetsFromPositions(field, analyzer)); + case DOCS_AND_FREQS, DOCS -> new OffsetsFromTokens(field, analyzer); + // This should be unreachable because we won't get a MatchesIterator from an unindexed field + case NONE -> (matchesIterator, doc) -> { throw new IllegalStateException("Field [ " + field + "] is not indexed"); }; + }; + } + + // TODO might be more sensible to push this back into OffsetsFromPositions + private OffsetsRetrievalStrategy limitOffsets(OffsetsRetrievalStrategy in) { + if (context.field.fieldOptions().maxAnalyzedOffset() == null) { + return in; + } + return (matchesIterator, doc) -> { + int positionCutOff = context.field.fieldOptions().maxAnalyzedOffset() / 5; + MatchesIterator wrapped = new FilterMatchesIterator(matchesIterator) { + @Override + public boolean next() throws IOException { + if (matchesIterator.next() == false) { + return false; + } + return matchesIterator.startPosition() <= positionCutOff; + } + }; + return in.get(wrapped, doc); + }; + } + + private String buildContiguousSourceText(List values) { + String value; + if (values.size() == 1) { + value = values.get(0).toString(); + } else { + // TODO: This can be inefficient if offset gap is large but the logic + // of applying offsets would get much more complicated so leaving for now + // (would have to recalculate all offsets to omit gaps). + String fieldGapPadding = " ".repeat(analyzer.getOffsetGap(field)); + value = String.join(fieldGapPadding, values); + } + return value; + } + + private List computeValueRanges(List values) { + ArrayList valueRanges = new ArrayList<>(); + int offset = 0; + for (CharSequence v : values) { + valueRanges.add(new OffsetRange(offset, offset + v.length())); + offset += v.length(); + offset += analyzer.getOffsetGap(field); + } + return valueRanges; + } +} diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighter.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighter.java new file mode 100644 index 0000000000000..a03ed7710fb69 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighter.java @@ -0,0 +1,54 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.search.fetch.subphase.highlight; + +import org.apache.lucene.search.MatchesIterator; +import org.elasticsearch.index.mapper.MappedFieldType; + +import java.io.IOException; +import java.util.List; + +/** + * A highlighter that uses the output of a query's Matches to highlight tokens + */ +public class MatchesHighlighter implements Highlighter { + + private static final String MATCHES_HIGHLIGHTER_CONFIG_KEY = "matches_highlighter_config_key"; + + @Override + public boolean canHighlight(MappedFieldType fieldType) { + return true; + } + + @Override + public HighlightField highlight(FieldHighlightContext fieldContext) throws IOException { + + MatchesHighlighterState state = (MatchesHighlighterState) fieldContext.cache.computeIfAbsent( + MATCHES_HIGHLIGHTER_CONFIG_KEY, + k -> new MatchesHighlighterState(fieldContext.context.searcher().getIndexReader()) + ); + + MatchesFieldHighlighter fieldHighlighter = new MatchesFieldHighlighter(fieldContext, state); + + MatchesIterator it = fieldHighlighter.getMatchesIterator(); + if (it == null) { + return null; + } + + List sourceValues = HighlightUtils.loadFieldValues( + fieldContext.fieldType, + fieldContext.context.getSearchExecutionContext(), + fieldContext.hitContext, + fieldContext.forceSource + ).stream().map(v -> (CharSequence) v.toString()).toList(); + + List highlights = fieldHighlighter.buildHighlights(it, sourceValues); + return new HighlightField(fieldContext.fieldName, highlights); + } +} diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterState.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterState.java new file mode 100644 index 0000000000000..3a5ddf6de69dd --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterState.java @@ -0,0 +1,89 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.search.fetch.subphase.highlight; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Matches; +import org.apache.lucene.search.MatchesIterator; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.Weight; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + +/** + * Shared state for the matches highlighter + * + * This holds two caches, one for the query's Weight which is global across all documents, + * and one for Matches for each query, which will be cached per document. This avoids having + * to regenerate the weight and matches for each field being highlighted. + */ +class MatchesHighlighterState { + + private static final Matches NO_MATCHES = new Matches() { + @Override + public MatchesIterator getMatches(String field) { + return null; + } + + @Override + public Collection getSubMatches() { + return Collections.emptyList(); + } + + @Override + public Iterator iterator() { + return Collections.emptyIterator(); + } + }; + + private final IndexSearcher searcher; + private final Map weightCache = new HashMap<>(); + private final Map matchesCache = new HashMap<>(); + + private int currentDoc = -1; + private int currentLeafOrd = -1; + + MatchesHighlighterState(IndexReader reader) { + this.searcher = new IndexSearcher(reader); + this.searcher.setQueryCache(null); // disable caching + } + + Matches getMatches(Query query, LeafReaderContext ctx, int doc) throws IOException { + if (currentDoc != doc || currentLeafOrd != ctx.ord) { + matchesCache.clear(); + currentDoc = doc; + currentLeafOrd = ctx.ord; + } + Weight w = weightCache.get(query); + if (w == null) { + w = searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE_NO_SCORES, 1); + weightCache.put(query, w); + } + Matches m = matchesCache.get(query); + if (m == null) { + m = w.matches(ctx, doc); + if (m == null) { + m = NO_MATCHES; + } + matchesCache.put(query, m); + } + if (m == NO_MATCHES) { + return null; + } + return m; + } +} diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java new file mode 100644 index 0000000000000..74b1e49939ddd --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java @@ -0,0 +1,79 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.elasticsearch.search.fetch.subphase.highlight; + +import org.apache.lucene.search.MatchesIterator; +import org.apache.lucene.search.matchhighlight.MatchRegionRetriever; +import org.apache.lucene.search.matchhighlight.OffsetRange; +import org.apache.lucene.search.matchhighlight.OffsetsFromPositions; +import org.apache.lucene.search.matchhighlight.OffsetsRetrievalStrategy; +import org.elasticsearch.Version; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +/** + * This strategy retrieves offsets directly from {@link MatchesIterator}, if they are available, + * otherwise it falls back to using {@link OffsetsFromPositions}. + */ +public final class XOffsetsFromMatchIterator implements OffsetsRetrievalStrategy { + private final String field; + private final XOffsetsFromPositions noOffsetsFallback; + + XOffsetsFromMatchIterator(String field, XOffsetsFromPositions noOffsetsFallback) { + this.field = field; + this.noOffsetsFallback = Objects.requireNonNull(noOffsetsFallback); + // https://github.com/apache/lucene/pull/11983 + assert Version.CURRENT.luceneVersion.onOrAfter(org.apache.lucene.util.Version.fromBits(9, 5, 0)) == false; + } + + @Override + public List get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc) throws IOException { + ArrayList positionRanges = new ArrayList<>(); + ArrayList offsetRanges = new ArrayList<>(); + while (matchesIterator.next()) { + int fromPosition = matchesIterator.startPosition(); + int toPosition = matchesIterator.endPosition(); + if (fromPosition < 0 || toPosition < 0) { + throw new IOException("Matches API returned negative positions for field: " + field); + } + positionRanges.add(new OffsetRange(fromPosition, toPosition)); + + if (offsetRanges != null) { + int from = matchesIterator.startOffset(); + int to = matchesIterator.endOffset(); + if (from < 0 || to < 0) { + // At least one offset isn't available. Fallback to just positions. + offsetRanges = null; + } else { + offsetRanges.add(new OffsetRange(from, to)); + } + } + } + + // Use the fallback conversion from positions if not all offsets were available. + if (offsetRanges == null) { + return noOffsetsFallback.convertPositionsToOffsets(positionRanges, doc.getValues(field)); + } else { + return offsetRanges; + } + } +} diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java new file mode 100644 index 0000000000000..40821ae5cad1c --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java @@ -0,0 +1,158 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.elasticsearch.search.fetch.subphase.highlight; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.search.MatchesIterator; +import org.apache.lucene.search.matchhighlight.MatchRegionRetriever; +import org.apache.lucene.search.matchhighlight.OffsetRange; +import org.apache.lucene.search.matchhighlight.OffsetsRetrievalStrategy; +import org.elasticsearch.Version; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public final class XOffsetsFromPositions implements OffsetsRetrievalStrategy { + private final String field; + private final Analyzer analyzer; + + public XOffsetsFromPositions(String field, Analyzer analyzer) { + this.field = field; + this.analyzer = analyzer; + // https://github.com/apache/lucene/pull/11983 + assert Version.CURRENT.luceneVersion.onOrAfter(org.apache.lucene.util.Version.fromBits(9, 5, 0)) == false; + } + + @Override + public List get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc) throws IOException { + ArrayList positionRanges = new ArrayList<>(); + while (matchesIterator.next()) { + int from = matchesIterator.startPosition(); + int to = matchesIterator.endPosition(); + if (from < 0 || to < 0) { + throw new IOException("Matches API returned negative positions for field: " + field); + } + positionRanges.add(new OffsetRange(from, to)); + } + + // Convert from positions to offsets. + return convertPositionsToOffsets(positionRanges, doc.getValues(field)); + } + + List convertPositionsToOffsets(ArrayList positionRanges, List values) throws IOException { + + if (positionRanges.isEmpty()) { + return positionRanges; + } + + class PositionSpan extends OffsetRange { + int leftOffset = Integer.MAX_VALUE; + int rightOffset = Integer.MIN_VALUE; + + PositionSpan(int from, int to) { + super(from, to); + } + + @Override + public String toString() { + return "[from=" + from + ", to=" + to + ", L: " + leftOffset + ", R: " + rightOffset + ']'; + } + } + + List spans = new ArrayList<>(); + int minPosition = Integer.MAX_VALUE; + int maxPosition = Integer.MIN_VALUE; + for (OffsetRange range : positionRanges) { + spans.add(new PositionSpan(range.from, range.to)); + minPosition = Math.min(minPosition, range.from); + maxPosition = Math.max(maxPosition, range.to); + } + + PositionSpan[] spansTable = spans.toArray(PositionSpan[]::new); + int spanCount = spansTable.length; + int position = -1; + int valueOffset = 0; + int convertedMatchCount = 0; + for (int valueIndex = 0, max = values.size(); valueIndex < max; valueIndex++) { + final String value = values.get(valueIndex).toString(); + final boolean lastValue = valueIndex + 1 == max; + + TokenStream ts = analyzer.tokenStream(field, value); + OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class); + PositionIncrementAttribute posAttr = ts.getAttribute(PositionIncrementAttribute.class); + ts.reset(); + while (ts.incrementToken()) { + position += posAttr.getPositionIncrement(); + + if (position >= minPosition) { + // Correct left and right offsets for each span this position applies to. + int startOffset = valueOffset + offsetAttr.startOffset(); + int endOffset = valueOffset + offsetAttr.endOffset(); + + int j = 0; + for (int i = 0; i < spanCount; i++) { + PositionSpan span = spansTable[j] = spansTable[i]; + if (position >= span.from) { + if (position <= span.to) { + span.leftOffset = Math.min(span.leftOffset, startOffset); + span.rightOffset = Math.max(span.rightOffset, endOffset); + convertedMatchCount++; + } else { + // this span can't intersect with any following position + // so omit it by skipping j++. + continue; + } + } + j++; + } + spanCount = j; + + // Only short-circuit if we're on the last value (which should be the common + // case since most fields would only have a single value anyway). We need + // to make sure of this because otherwise offsetAttr would have incorrect value. + if (position > maxPosition && lastValue) { + break; + } + } + } + ts.end(); + position += posAttr.getPositionIncrement() + analyzer.getPositionIncrementGap(field); + valueOffset += offsetAttr.endOffset() + analyzer.getOffsetGap(field); + ts.close(); + } + + if (convertedMatchCount == 0) { + return Collections.emptyList(); + } + spans = spans.subList(0, convertedMatchCount); // remove any matches beyond the max analyzed offset limit + ArrayList converted = new ArrayList<>(spans.size()); + for (PositionSpan span : spans) { + if (span.leftOffset == Integer.MAX_VALUE || span.rightOffset == Integer.MIN_VALUE) { + throw new RuntimeException("One of the offsets missing for position range: " + span); + } + converted.add(new OffsetRange(span.leftOffset, span.rightOffset)); + } + return converted; + } +} diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java new file mode 100644 index 0000000000000..b76337cbeb354 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java @@ -0,0 +1,313 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.elasticsearch.search.fetch.subphase.highlight; + +import org.apache.lucene.search.matchhighlight.OffsetRange; +import org.apache.lucene.search.matchhighlight.Passage; +import org.apache.lucene.search.matchhighlight.PassageAdjuster; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.PriorityQueue; +import org.elasticsearch.Version; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.RandomAccess; + +public class XPassageSelector { + + public static final Comparator DEFAULT_SCORER = (a, b) -> { + // Compare the number of highlights first. + int v; + v = Integer.compare(a.markers.size(), b.markers.size()); + if (v != 0) { + return v; + } + + // Total number of characters covered by the highlights. + int len1 = 0, len2 = 0; + for (OffsetRange o : a.markers) { + len1 += o.length(); + } + for (OffsetRange o : b.markers) { + len2 += o.length(); + } + if (len1 != len2) { + return Integer.compare(len1, len2); + } + + return Integer.compare(b.from, a.from); + }; + + private final Comparator passageScorer; + private final PassageAdjuster passageAdjuster; + + public XPassageSelector() { + this(DEFAULT_SCORER, null); + } + + public XPassageSelector(Comparator passageScorer, PassageAdjuster passageAdjuster) { + this.passageScorer = passageScorer; + this.passageAdjuster = passageAdjuster; + // https://github.com/apache/lucene/pull/11990 + assert Version.CURRENT.luceneVersion.onOrAfter(org.apache.lucene.util.Version.fromBits(9, 5, 0)) == false; + } + + public List pickBest(CharSequence value, List markers, int maxPassageWindow, int maxPassages) { + return pickBest(value, markers, maxPassageWindow, maxPassages, List.of(new OffsetRange(0, value.length()))); + } + + public List pickBest( + CharSequence value, + List markers, + int maxPassageWindow, + int maxPassages, + List permittedPassageRanges + ) { + assert markers instanceof RandomAccess; + assert permittedPassageRanges instanceof RandomAccess; + assert sortedAndNonOverlapping(permittedPassageRanges); + + // Handle odd special cases early. + if (value.length() == 0 || maxPassageWindow == 0) { + return Collections.emptyList(); + } + + // Best passages so far. + PriorityQueue pq = new PriorityQueue<>(markers.size()) { + @Override + protected boolean lessThan(Passage a, Passage b) { + return passageScorer.compare(a, b) < 0; + } + }; + + markers = splitOrTruncateToWindows(markers, maxPassageWindow, permittedPassageRanges); + + // Sort markers by their start offset, shortest first. + markers.sort(Comparator.comparingInt(r -> r.from).thenComparingInt(r -> r.to)); + + final int max = markers.size(); + int markerIndex = 0; + nextRange: for (OffsetRange range : permittedPassageRanges) { + final int rangeTo = Math.min(range.to, value.length()); + + // Skip ranges outside of the value window anyway. + if (range.from >= rangeTo) { + continue; + } + + while (markerIndex < max) { + OffsetRange m = markers.get(markerIndex); + + // Markers are sorted so if the current marker's start is past the range, + // we can advance, but we need to check the same marker against the new range. + if (m.from >= rangeTo) { + continue nextRange; + } + + // Check if current marker falls within the range and is smaller than the largest allowed + // passage window. + if (m.from >= range.from && m.to <= rangeTo && m.length() <= maxPassageWindow) { + + // Adjust the window range to center the highlight marker. + int from = Math.toIntExact(((long) m.from + m.to - maxPassageWindow) / 2); + int to = Math.toIntExact(((long) m.from + m.to + maxPassageWindow) / 2); + if (from < range.from) { + to += range.from - from; + from = range.from; + } + if (to > rangeTo) { + from -= to - rangeTo; + to = rangeTo; + if (from < range.from) { + from = range.from; + } + } + + if (from < to && to <= value.length()) { + // Find other markers that are completely inside the passage window. + ArrayList inside = new ArrayList<>(); + int i = markerIndex; + while (i > 0 && markers.get(i - 1).from >= from) { + i--; + } + + OffsetRange c; + for (; i < max && (c = markers.get(i)).from < to; i++) { + if (c.to <= to) { + inside.add(c); + } + } + + if (inside.isEmpty() == false) { + pq.insertWithOverflow(new Passage(from, to, inside)); + } + } + } + + // Advance to the next marker. + markerIndex++; + } + } + + // Collect from the priority queue (reverse the order so that highest-scoring are first). + Passage[] passages; + if (pq.size() > 0) { + passages = new Passage[pq.size()]; + for (int i = pq.size(); --i >= 0;) { + passages[i] = pq.pop(); + } + } else { + // Handle the default, no highlighting markers case. + passages = pickDefaultPassage(value, maxPassageWindow, maxPassages, permittedPassageRanges); + } + + // Correct passage boundaries from maxExclusive window. Typically shrink boundaries until we're + // on a proper word/sentence boundary. + if (passageAdjuster != null) { + passageAdjuster.currentValue(value); + for (int x = 0; x < passages.length; x++) { + Passage p = passages[x]; + OffsetRange newRange = passageAdjuster.adjust(p); + if (newRange.from != p.from || newRange.to != p.to) { + assert newRange.from >= p.from && newRange.to <= p.to + : "Adjusters must not expand the passage's range: was " + p + " => changed to " + newRange; + passages[x] = new Passage(newRange.from, newRange.to, p.markers); + } + } + } + + // Ensure there are no overlaps on passages. In case of conflicts, better score wins. + int last = 0; + for (int i = 0; i < passages.length; i++) { + Passage a = passages[i]; + if (a != null && a.length() > 0) { + passages[last++] = a; + for (int j = i + 1; j < passages.length; j++) { + Passage b = passages[j]; + if (b != null) { + if (adjecentOrOverlapping(a, b)) { + passages[j] = null; + } + } + } + } + } + + // Remove nullified slots. + last = Math.min(last, maxPassages); + if (passages.length != last) { + passages = ArrayUtil.copyOfSubArray(passages, 0, last); + } + + // Sort in the offset order again. + Arrays.sort(passages, Comparator.comparingInt(a -> a.from)); + + return Arrays.asList(passages); + } + + /** Truncate or split highlight markers that cross permitted value boundaries. */ + private List splitOrTruncateToWindows( + List markers, + int maxPassageWindow, + List permittedPassageRanges + ) { + // Process markers overlapping with each permitted window. + ArrayList processedMarkers = new ArrayList<>(markers.size()); + for (OffsetRange marker : markers) { + for (OffsetRange permitted : permittedPassageRanges) { + boolean newSlice = false; + + int from = marker.from; + if (from < permitted.from) { + from = permitted.from; + newSlice = true; + } + + int to = marker.to; + if (to > permitted.to) { + to = permitted.to; + newSlice = true; + } + + if (from >= to) { + continue; + } + + if (to - from > maxPassageWindow) { + to = from + maxPassageWindow; + newSlice = true; + } + + processedMarkers.add(newSlice ? marker.slice(from, to) : marker); + } + } + markers = processedMarkers; + return markers; + } + + static boolean sortedAndNonOverlapping(List permittedPassageRanges) { + if (permittedPassageRanges.size() > 1) { + Iterator i = permittedPassageRanges.iterator(); + for (OffsetRange next, previous = i.next(); i.hasNext(); previous = next) { + next = i.next(); + if (previous.to > next.from) { + throw new AssertionError("Ranges must be sorted and non-overlapping: " + permittedPassageRanges); + } + } + } + + return true; + } + + /** + * Invoked when no passages could be selected (due to constraints or lack of highlight markers). + */ + protected Passage[] pickDefaultPassage( + CharSequence value, + int maxCharacterWindow, + int maxPassages, + List permittedPassageRanges + ) { + // Search for the first range that is not empty. + ArrayList defaultPassages = new ArrayList<>(); + for (OffsetRange o : permittedPassageRanges) { + if (defaultPassages.size() >= maxPassages) { + break; + } + + int to = Math.min(value.length(), o.to); + if (o.from < to) { + defaultPassages.add(new Passage(o.from, o.from + Math.min(maxCharacterWindow, o.length()), Collections.emptyList())); + } + } + + return defaultPassages.toArray(Passage[]::new); + } + + private static boolean adjecentOrOverlapping(Passage a, Passage b) { + if (a.from >= b.from) { + return a.from <= b.to - 1; + } else { + return a.to - 1 >= b.from; + } + } +} diff --git a/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightFieldTests.java b/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightFieldTests.java index f174ae9180522..1183879f33a8f 100644 --- a/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightFieldTests.java +++ b/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightFieldTests.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.util.Arrays; +import java.util.Collections; import static org.elasticsearch.test.EqualsHashCodeTestUtils.checkEqualsAndHashCode; @@ -80,7 +81,7 @@ public void testToXContent() throws IOException { ] }""", Strings.toString(builder)); - field = new HighlightField("foo", null); + field = new HighlightField("foo", Collections.emptyList()); builder = JsonXContent.contentBuilder(); builder.prettyPrint(); builder.startObject(); @@ -88,7 +89,7 @@ public void testToXContent() throws IOException { builder.endObject(); assertEquals(""" { - "foo" : null + "foo" : [ ] }""", Strings.toString(builder)); } diff --git a/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterTests.java b/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterTests.java new file mode 100644 index 0000000000000..0968c213a2d3b --- /dev/null +++ b/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterTests.java @@ -0,0 +1,156 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.search.fetch.subphase.highlight; + +import org.elasticsearch.index.mapper.MapperService; +import org.elasticsearch.index.mapper.ParsedDocument; +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.elasticsearch.search.fetch.HighlighterTestCase; + +import java.io.IOException; +import java.util.Map; + +public class MatchesHighlighterTests extends HighlighterTestCase { + + public void testSimpleTermHighlighting() throws IOException { + + MapperService mapperService = createMapperService(""" + { "_doc" : { "properties" : { + "field" : { "type" : "text" } + }}} + """); + + ParsedDocument doc = mapperService.documentMapper().parse(source(""" + { "field" : "this is some text" } + """)); + + SearchSourceBuilder search = new SearchSourceBuilder().query(QueryBuilders.termQuery("field", "some")) + .highlighter(new HighlightBuilder().field("field").highlighterType("matches")); + + Map highlights = highlight(mapperService, doc, search); + assertHighlights(highlights, "field", "this is some text"); + } + + public void testMultipleFieldHighlighting() throws IOException { + MapperService mapperService = createMapperService(""" + { "_doc" : { "properties" : { + "title" : { "type" : "text" }, + "description" : { "type" : "text" }, + "category" : { "type" : "keyword" } + }}} + """); + + ParsedDocument doc = mapperService.documentMapper().parse(source(""" + { "title" : "A tale of two cities", + "description" : "It's a story about two cities", + "category" : [ "fiction", "dickens" ] } + """)); + + SearchSourceBuilder search = new SearchSourceBuilder().query( + QueryBuilders.queryStringQuery("dickens OR cities").field("title").field("description").field("category") + ).highlighter(new HighlightBuilder().highlighterType("matches").field("title").field("category")); + + Map highlights = highlight(mapperService, doc, search); + assertHighlights(highlights, "title", "A tale of two cities"); + assertHighlights(highlights, "category", "dickens"); + } + + public void testScoring() throws Exception { + + MapperService mapperService = createMapperService(""" + { "_doc" : { "properties" : { + "description" : { "type" : "text" } + }}} + """); + + ParsedDocument doc = mapperService.documentMapper().parse(source(""" + { + "description": [ + "Lorem Ipsum string Generator that helps to create dummy text for all layout needs.", + "It has roots in a string of classical Latin literature from 45 BC, making it search string over 2000 years old." + ] + } + """)); + + HighlightBuilder highlight = new HighlightBuilder().field("description") + .highlighterType("matches") + .numOfFragments(2) + .fragmentSize(50); + SearchSourceBuilder search = new SearchSourceBuilder().query(QueryBuilders.matchQuery("description", "search string")) + .highlighter(highlight); + + Map highlights = highlight(mapperService, doc, search); + assertHighlights( + highlights, + "description", + "Lorem Ipsum string Generator that helps to create ...", + "...from 45 BC, making it search string over 2000 year..." + ); + } + + public void testAnalyzedOffsetLimit() throws IOException { + MapperService mapperService = createMapperService(""" + { "_doc" : { "properties" : { + "description" : { "type" : "text" } + }}} + """); + + ParsedDocument doc = mapperService.documentMapper().parse(source(""" + { + "description": [ + "Lorem Ipsum string Generator that helps to create dummy text for all layout needs.", + "It has roots in a string of classical Latin literature from 45 BC, making it search string over 2000 years old." + ] + } + """)); + + HighlightBuilder highlight = new HighlightBuilder().field("description") + .highlighterType("matches") + .maxAnalyzedOffset(70) + .numOfFragments(2) + .fragmentSize(50); + SearchSourceBuilder search = new SearchSourceBuilder().query(QueryBuilders.matchQuery("description", "search string")) + .highlighter(highlight); + + Map highlights = highlight(mapperService, doc, search); + assertHighlights(highlights, "description", "Lorem Ipsum string Generator that helps to create ..."); + } + + // matched_fields - use matches from a set of different fields to highlight this one + public void testMatchedFields() throws IOException { + + // note that this doesn't actually use a different analyzer for the subfield, + // given restrictions on analyzers in unit tests + MapperService mapperService = createMapperService(""" + { "_doc" : { "properties" : { + "description" : { + "type" : "text", + "fields" : { + "stemmed" : { "type" : "text" } + } + } + }}} + """); + + ParsedDocument doc = mapperService.documentMapper().parse(source(""" + { "description" : "Here is some text" } + """)); + + HighlightBuilder highlight = new HighlightBuilder().field( + new HighlightBuilder.Field("description").matchedFields("description", "description.stemmed") + ).highlighterType("matches"); + SearchSourceBuilder search = new SearchSourceBuilder().query(QueryBuilders.termQuery("description.stemmed", "some")) + .highlighter(highlight); + + Map highlights = highlight(mapperService, doc, search); + assertHighlights(highlights, "description", "Here is some text"); + + } +} diff --git a/test/framework/src/main/java/org/elasticsearch/search/fetch/HighlighterTestCase.java b/test/framework/src/main/java/org/elasticsearch/search/fetch/HighlighterTestCase.java index 11bbdbc9b1013..e1ed7c2aa33d9 100644 --- a/test/framework/src/main/java/org/elasticsearch/search/fetch/HighlighterTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/search/fetch/HighlighterTestCase.java @@ -9,7 +9,9 @@ package org.elasticsearch.search.fetch; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.similarities.BM25Similarity; import org.elasticsearch.common.text.Text; +import org.elasticsearch.index.cache.query.TrivialQueryCachingPolicy; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.MapperServiceTestCase; import org.elasticsearch.index.mapper.ParsedDocument; @@ -21,8 +23,10 @@ import org.elasticsearch.search.fetch.subphase.highlight.HighlightField; import org.elasticsearch.search.fetch.subphase.highlight.HighlightPhase; import org.elasticsearch.search.fetch.subphase.highlight.Highlighter; +import org.elasticsearch.search.fetch.subphase.highlight.MatchesHighlighter; import org.elasticsearch.search.fetch.subphase.highlight.PlainHighlighter; import org.elasticsearch.search.fetch.subphase.highlight.UnifiedHighlighter; +import org.elasticsearch.search.internal.ContextIndexSearcher; import org.elasticsearch.search.lookup.Source; import java.io.IOException; @@ -44,7 +48,9 @@ protected Map getHighlighters() { "fvh", new FastVectorHighlighter(getIndexSettings()), "plain", - new PlainHighlighter() + new PlainHighlighter(), + "matches", + new MatchesHighlighter() ); } @@ -91,6 +97,14 @@ private static FetchContext fetchContext(SearchExecutionContext context, SearchS when(fetchContext.parsedQuery()).thenReturn(new ParsedQuery(search.query().toQuery(context))); when(fetchContext.getSearchExecutionContext()).thenReturn(context); when(fetchContext.sourceLoader()).thenReturn(context.newSourceLoader(false)); + ContextIndexSearcher cis = new ContextIndexSearcher( + context.searcher().getIndexReader(), + new BM25Similarity(), + null, + TrivialQueryCachingPolicy.NEVER, + false + ); + when(fetchContext.searcher()).thenReturn(cis); return fetchContext; } }