@@ -286,7 +286,11 @@ public static Object randomSemanticTextInput() {
286286 }
287287 }
288288
289- public static ChunkedInference toChunkedResult (Map <String , List <String >> matchedTextMap , SemanticTextField field ) throws IOException {
289+ public static ChunkedInference toChunkedResult (
290+ IndexVersion indexVersion ,
291+ Map <String , List <String >> matchedTextMap ,
292+ SemanticTextField field
293+ ) {
290294 switch (field .inference ().modelSettings ().taskType ()) {
291295 case SPARSE_EMBEDDING -> {
292296 List <ChunkedInferenceEmbeddingSparse .SparseEmbeddingChunk > chunks = new ArrayList <>();
@@ -297,14 +301,10 @@ public static ChunkedInference toChunkedResult(Map<String, List<String>> matched
297301
298302 ListIterator <String > matchedTextIt = entryFieldMatchedText .listIterator ();
299303 for (var chunk : entryChunks ) {
304+ String matchedText = matchedTextIt .next ();
305+ ChunkedInference .TextOffset offset = createOffset (indexVersion , chunk , matchedText );
300306 var tokens = parseWeightedTokens (chunk .rawEmbeddings (), field .contentType ());
301- chunks .add (
302- new ChunkedInferenceEmbeddingSparse .SparseEmbeddingChunk (
303- tokens ,
304- matchedTextIt .next (),
305- new ChunkedInference .TextOffset (chunk .startOffset (), chunk .endOffset ())
306- )
307- );
307+ chunks .add (new ChunkedInferenceEmbeddingSparse .SparseEmbeddingChunk (tokens , matchedText , offset ));
308308 }
309309 }
310310 return new ChunkedInferenceEmbeddingSparse (chunks );
@@ -318,6 +318,8 @@ public static ChunkedInference toChunkedResult(Map<String, List<String>> matched
318318
319319 ListIterator <String > matchedTextIt = entryFieldMatchedText .listIterator ();
320320 for (var chunk : entryChunks ) {
321+ String matchedText = matchedTextIt .next ();
322+ ChunkedInference .TextOffset offset = createOffset (indexVersion , chunk , matchedText );
321323 double [] values = parseDenseVector (
322324 chunk .rawEmbeddings (),
323325 field .inference ().modelSettings ().dimensions (),
@@ -326,8 +328,8 @@ public static ChunkedInference toChunkedResult(Map<String, List<String>> matched
326328 chunks .add (
327329 new ChunkedInferenceEmbeddingFloat .FloatEmbeddingChunk (
328330 FloatConversionUtils .floatArrayOf (values ),
329- matchedTextIt . next () ,
330- new ChunkedInference . TextOffset ( chunk . startOffset (), chunk . endOffset ())
331+ matchedText ,
332+ offset
331333 )
332334 );
333335 }
@@ -353,6 +355,24 @@ private static List<String> validateAndGetMatchedTextForField(
353355 return fieldMatchedText ;
354356 }
355357
358+ /**
359+ * Create a {@link ChunkedInference.TextOffset} for a chunk. When {@link InferenceMetadataFieldsMapper#isEnabled} is true for the index
360+ * version, the offsets stored on the {@link SemanticTextField.Chunk} are used as-is. Otherwise (the legacy semantic text format) offsets
361+ * are not written to the chunk, so they are calculated: one input value maps to one chunk, giving the range 0 to matchedText.length().
362+ *
363+ * @param indexVersion The index version, used to decide whether the chunk's stored offsets are read or the offsets are calculated
364+ * @param chunk The chunk whose start/end offsets are read when inference metadata fields are enabled
365+ * @param matchedText The matched text; its length is used as the end offset in the legacy format (must be non-null in that case)
366+ * @return A {@link ChunkedInference.TextOffset} instance with valid offset values
367+ */
368+ private static ChunkedInference .TextOffset createOffset (IndexVersion indexVersion , SemanticTextField .Chunk chunk , String matchedText ) {
369+ final boolean useInferenceMetadataFields = InferenceMetadataFieldsMapper .isEnabled (indexVersion );
370+ final int startOffset = useInferenceMetadataFields ? chunk .startOffset () : 0 ;
371+ final int endOffset = useInferenceMetadataFields ? chunk .endOffset () : matchedText .length ();
372+
373+ return new ChunkedInference .TextOffset (startOffset , endOffset );
374+ }
375+
356376 private static double [] parseDenseVector (BytesReference value , int numDims , XContentType contentType ) {
357377 try (XContentParser parser = XContentHelper .createParserNotCompressed (XContentParserConfiguration .EMPTY , value , contentType )) {
358378 parser .nextToken ();
0 commit comments