Skip to content

Commit 5033b3a

Browse files
committed
Extend longer snippets to the best-scoring nearby chunk rather than just concatenating with the next chunk
1 parent b13a1d2 commit 5033b3a

File tree

1 file changed

+45
-19
lines changed

1 file changed

+45
-19
lines changed

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighter.java

Lines changed: 45 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ public HighlightField highlight(FieldHighlightContext fieldContext) throws IOExc
104104
? 1 // we return the best fragment by default
105105
: fieldContext.field.fieldOptions().numberOfFragments();
106106

107+
// TODO: Right now this will default to 100 if not set. If the user does not set fragmentSize for
108+
// the semantic highlighter, we probably just want to return unadulterated chunks instead.
107109
int fragmentCharSize = fieldContext.field.fieldOptions().fragmentCharSize();
108110

109111
List<OffsetAndScore> chunks = extractOffsetAndScores(
@@ -113,7 +115,7 @@ public HighlightField highlight(FieldHighlightContext fieldContext) throws IOExc
113115
fieldContext.hitContext.docId(),
114116
queries
115117
);
116-
if (chunks.size() == 0) {
118+
if (chunks.isEmpty()) {
117119
return null;
118120
}
119121

@@ -170,30 +172,54 @@ public HighlightField highlight(FieldHighlightContext fieldContext) throws IOExc
170172
}
171173
consumedChunks[i] = true;
172174

173-
// Chunks smaller than the requested fragmentCharSize will be concatenated with neighboring chunks
174175
if (fragmentCharSize > 0 && content.length() < fragmentCharSize) {
175-
StringBuilder concatenated = new StringBuilder(content);
176-
177-
// Look ahead to find more chunks to concatenate
178-
// TODO: Lookback to get the preceding chunk if we're at the end of the document
179-
for (int nextIndex = i + 1; nextIndex < chunks.size() && concatenated.length() < fragmentCharSize; nextIndex++) {
180-
if (consumedChunks[nextIndex]) {
181-
continue;
176+
var cur = chunk.offset();
177+
if (cur != null) {
178+
int prevIdx = -1, nextIdx = -1;
179+
long bestPrevEnd = Long.MIN_VALUE; // nearest previous by largest end()
180+
long bestNextStart = Long.MAX_VALUE; // nearest next by smallest start()
181+
182+
for (int j = 0; j < chunks.size(); j++) {
183+
if (j == i || consumedChunks[j]) continue;
184+
185+
var cand = chunks.get(j);
186+
var off = cand.offset();
187+
if (off == null) continue;
188+
189+
if (off.end() <= cur.start()) {
190+
// candidate is before current
191+
if (off.end() > bestPrevEnd) {
192+
bestPrevEnd = off.end();
193+
prevIdx = j;
194+
}
195+
} else if (off.start() >= cur.end()) {
196+
// candidate is after current
197+
if (off.start() < bestNextStart) {
198+
bestNextStart = off.start();
199+
nextIdx = j;
200+
}
201+
}
182202
}
183203

184-
var nextChunk = chunks.get(nextIndex);
185-
if (nextChunk.offset().start() > chunk.offset().end()) {
186-
String nextContent = offsetToContent.apply(nextChunk);
187-
if (nextContent == null) {
188-
continue;
204+
double prevScore = (prevIdx != -1) ? chunks.get(prevIdx).score() : Double.NEGATIVE_INFINITY;
205+
double nextScore = (nextIdx != -1) ? chunks.get(nextIdx).score() : Double.NEGATIVE_INFINITY;
206+
207+
int pick = (nextScore > prevScore) ? nextIdx : prevIdx;
208+
if (pick != -1) {
209+
String extra = offsetToContent.apply(chunks.get(pick));
210+
int remaining = fragmentCharSize - content.length() - CHUNK_DELIMITER.length();
211+
if (extra != null) {
212+
if (pick == nextIdx) {
213+
String toAppend = extra.length() > remaining ? extra.substring(0, remaining) : extra;
214+
content = content + CHUNK_DELIMITER + toAppend; // expand forward
215+
} else {
216+
String toPrepend = extra.length() > remaining ? extra.substring(extra.length() - remaining) : extra;
217+
content = toPrepend + CHUNK_DELIMITER + content; // expand backward
218+
}
219+
consumedChunks[pick] = true;
189220
}
190-
191-
concatenated.append(CHUNK_DELIMITER).append(nextContent);
192-
consumedChunks[nextIndex] = true;
193221
}
194222
}
195-
196-
content = concatenated.toString();
197223
}
198224

199225
// Truncate content if it exceeds fragmentCharSize

0 commit comments

Comments
 (0)