Skip to content

Commit 5f20480

Browse files
committed
Truncate snippets that are longer than requested size
1 parent ff3f3c1 commit 5f20480

File tree

2 files changed

+37
-10
lines changed

2 files changed

+37
-10
lines changed

server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightSnippetUtils.java

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,6 @@ public static SearchHighlightContext.Field buildFieldHighlightContextForSnippets
4848
SearchHighlightContext.FieldOptions.Builder optionsBuilder = new SearchHighlightContext.FieldOptions.Builder();
4949
optionsBuilder.numberOfFragments(numSnippets);
5050
optionsBuilder.fragmentCharSize(snippetCharLength);
51-
// Note: The default SENTENCE boundary scanner used by the DefaultHighlighter will return fragments larger than the specified
52-
// snippetLength. This has implications when appending and calculating ByteArrays, so we specify WORD.
53-
optionsBuilder.boundaryScannerType(HighlightBuilder.BoundaryScannerType.WORD);
5451
optionsBuilder.noMatchSize(snippetCharLength);
5552
optionsBuilder.preTags(new String[] { "" });
5653
optionsBuilder.postTags(new String[] { "" });

x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/HighlighterExpressionEvaluator.java

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,12 @@
3939

4040
import java.io.IOException;
4141
import java.io.UncheckedIOException;
42+
import java.nio.ByteBuffer;
43+
import java.nio.CharBuffer;
44+
import java.nio.charset.CharacterCodingException;
45+
import java.nio.charset.CharsetDecoder;
46+
import java.nio.charset.CodingErrorAction;
47+
import java.nio.charset.StandardCharsets;
4248
import java.util.Collections;
4349
import java.util.HashMap;
4450
import java.util.Map;
@@ -51,8 +57,8 @@ public class HighlighterExpressionEvaluator extends LuceneQueryEvaluator<BytesRe
5157
EvalOperator.ExpressionEvaluator {
5258

5359
private final String fieldName;
54-
private final Integer numFragments;
55-
private final Integer fragmentLength;
60+
private final int numFragments;
61+
private final int fragmentLength;
5662
private final Map<String, Highlighter> highlighters;
5763
private final FetchContext fetchContext;
5864
private final MappedFieldType fieldType;
@@ -68,8 +74,8 @@ public class HighlighterExpressionEvaluator extends LuceneQueryEvaluator<BytesRe
6874
) {
6975
super(blockFactory, shardConfigs);
7076
this.fieldName = fieldName;
71-
this.numFragments = numFragments;
72-
this.fragmentLength = fragmentLength;
77+
this.numFragments = numFragments != null ? numFragments : HighlightBuilder.DEFAULT_NUMBER_OF_FRAGMENTS;
78+
this.fragmentLength = fragmentLength != null ? fragmentLength : HighlightBuilder.DEFAULT_FRAGMENT_CHAR_SIZE;
7379
this.highlighters = highlighters;
7480

7581
// Create a source loader for highlighter use
@@ -110,8 +116,8 @@ protected void appendMatch(BytesRefBlock.Builder builder, Scorable scorer, int d
110116
SearchHighlightContext.Field field = HighlightSnippetUtils.buildFieldHighlightContextForSnippets(
111117
fetchContext.getSearchExecutionContext(),
112118
fieldName,
113-
numFragments != null ? numFragments : HighlightBuilder.DEFAULT_NUMBER_OF_FRAGMENTS,
114-
fragmentLength != null ? fragmentLength : HighlightBuilder.DEFAULT_FRAGMENT_CHAR_SIZE,
119+
numFragments,
120+
fragmentLength,
115121
query
116122
);
117123
FetchSubPhase.HitContext hitContext = new FetchSubPhase.HitContext(searchHit, leafReaderContext, docId, Map.of(), source, null);
@@ -132,14 +138,38 @@ protected void appendMatch(BytesRefBlock.Builder builder, Scorable scorer, int d
132138
builder.beginPositionEntry();
133139
}
134140
for (Text highlightText : highlight.fragments()) {
135-
builder.appendBytesRef(new BytesRef(highlightText.bytes().bytes()));
141+
byte[] highlightBytes = highlightText.bytes().bytes();
142+
if (highlightBytes.length > fragmentLength) {
143+
// TODO - This isn't a great solution, but in order to resolve character encoding issues in the
144+
// returned BytesRef we need to ensure that the fragment size we return is equal to what was requested.
145+
// Since the highlighter's default sentence boundary scanner can return longer fragments, we're truncating for now.
146+
byte[] truncatedBytes = truncateUtf8(highlightBytes, fragmentLength);
147+
builder.appendBytesRef(new BytesRef(truncatedBytes));
148+
} else {
149+
builder.appendBytesRef(new BytesRef(highlightBytes));
150+
}
136151
}
137152
if (multivalued) {
138153
builder.endPositionEntry();
139154
}
140155
}
141156
}
142157

158+
private static byte[] truncateUtf8(byte[] bytes, int maxLength) throws CharacterCodingException {
159+
if (bytes.length <= maxLength) return bytes;
160+
161+
CharsetDecoder dec = StandardCharsets.UTF_8.newDecoder()
162+
.onMalformedInput(CodingErrorAction.IGNORE)
163+
.onUnmappableCharacter(CodingErrorAction.IGNORE);
164+
165+
CharBuffer chars = dec.decode(ByteBuffer.wrap(bytes, 0, maxLength));
166+
ByteBuffer out = StandardCharsets.UTF_8.encode(chars);
167+
168+
byte[] result = new byte[out.remaining()];
169+
out.get(result);
170+
return result;
171+
}
172+
143173
private static Supplier<Source> lazyStoredSourceLoader(LeafReaderContext ctx, int doc) {
144174
return () -> {
145175
StoredFieldLoader rootLoader = StoredFieldLoader.create(true, Collections.emptySet());

0 commit comments

Comments (0)