Commit cb86fd4

Inference Metadata Fields - Chunk On Delimiter (#118694)

1 parent fa45c50 commit cb86fd4
3 files changed (+42, -44 lines)

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java

Lines changed: 14 additions & 14 deletions
@@ -25,7 +25,6 @@
 import org.elasticsearch.action.update.UpdateRequest;
 import org.elasticsearch.cluster.metadata.InferenceFieldMetadata;
 import org.elasticsearch.cluster.service.ClusterService;
-import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.util.concurrent.AtomicArray;
 import org.elasticsearch.common.xcontent.support.XContentMapValues;
 import org.elasticsearch.core.Nullable;
@@ -57,8 +56,6 @@
 import java.util.Map;
 import java.util.stream.Collectors;
 
-import static org.elasticsearch.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
-
 /**
  * A {@link MappedActionFilter} that intercepts {@link BulkShardRequest} to apply inference on fields specified
  * as {@link SemanticTextFieldMapper} in the index mapping. For each semantic text field referencing fields in
@@ -140,15 +137,17 @@ private record InferenceProvider(InferenceService service, Model model) {}
      * @param sourceField The source field.
      * @param input The input to run inference on.
      * @param inputOrder The original order of the input.
+     * @param offsetAdjustment The adjustment to apply to the chunk text offsets.
      */
-    private record FieldInferenceRequest(int index, String field, String sourceField, String input, int inputOrder) {}
+    private record FieldInferenceRequest(int index, String field, String sourceField, String input, int inputOrder, int offsetAdjustment) {}
 
     /**
      * The field inference response.
     * @param field The target field.
     * @param sourceField The source field.
     * @param input The input that was used to run inference.
     * @param inputOrder The original order of the input.
+    * @param offsetAdjustment The adjustment to apply to the chunk text offsets.
     * @param model The model used to run inference.
     * @param chunkedResults The actual results.
     */
@@ -157,6 +156,7 @@ private record FieldInferenceResponse(
         String sourceField,
         String input,
         int inputOrder,
+        int offsetAdjustment,
         Model model,
         ChunkedInference chunkedResults
     ) {}
@@ -317,6 +317,7 @@ public void onResponse(List<ChunkedInference> results) {
                 request.sourceField(),
                 request.input(),
                 request.inputOrder(),
+                request.offsetAdjustment(),
                 inferenceProvider.model,
                 result
             )
@@ -402,6 +403,7 @@ private void applyInferenceResponses(BulkItemRequest item, FieldInferenceRespons
         lst.addAll(
             SemanticTextField.toSemanticTextFieldChunks(
                 resp.input,
+                resp.offsetAdjustment,
                 resp.chunkedResults,
                 indexRequest.getContentType(),
                 addMetadataField
@@ -528,16 +530,14 @@ private Map<String, List<FieldInferenceRequest>> createFieldInferenceRequests(Bu
                 }
 
                 List<FieldInferenceRequest> fieldRequests = fieldRequestsMap.computeIfAbsent(inferenceId, k -> new ArrayList<>());
-                if (useInferenceMetadataFieldsFormat) {
-                    // When using the inference metadata fields format, all the input values are concatenated so that the chunk
-                    // offsets are expressed in the context of a single string
-                    String concatenatedValue = Strings.collectionToDelimitedString(values, String.valueOf(MULTIVAL_SEP_CHAR));
-                    fieldRequests.add(new FieldInferenceRequest(itemIndex, field, sourceField, concatenatedValue, order++));
-                } else {
-                    // When using the legacy format, each input value is processed using its own inference request
-                    for (String v : values) {
-                        fieldRequests.add(new FieldInferenceRequest(itemIndex, field, sourceField, v, order++));
-                    }
+                int offsetAdjustment = 0;
+                for (String v : values) {
+                    fieldRequests.add(new FieldInferenceRequest(itemIndex, field, sourceField, v, order++, offsetAdjustment));
+
+                    // When using the inference metadata fields format, all the input values are concatenated so that the
+                    // chunk text offsets are expressed in the context of a single string. Calculate the offset adjustment
+                    // to apply to account for this.
+                    offsetAdjustment += v.length() + 1; // Add one for separator char length
                 }
             }
         }
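
To trace the arithmetic, the new loop can be sketched in isolation. A minimal standalone example (not part of the commit; class and value names are illustrative), assuming the values are later joined by a one-character separator:

import java.util.List;

class OffsetAdjustmentSketch {
    public static void main(String[] args) {
        List<String> values = List.of("first value", "second value", "third value");
        int offsetAdjustment = 0;
        for (String v : values) {
            // Each per-value inference request records where its value will
            // begin once all values are concatenated into a single string.
            System.out.println("\"" + v + "\" starts at offset " + offsetAdjustment);
            offsetAdjustment += v.length() + 1; // +1 for the one-char separator
        }
        // Prints offsets 0, 12, and 25 for the three values above.
    }
}

Each request thus carries enough information to place its chunks in the concatenated string without the filter ever materializing that string.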

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextField.java

Lines changed: 9 additions & 3 deletions
@@ -406,23 +406,29 @@ private static List<Chunk> parseChunksArrayLegacy(XContentParser parser, ParserC
      */
     public static List<Chunk> toSemanticTextFieldChunks(
         String input,
+        int offsetAdjustment,
         ChunkedInference results,
         XContentType contentType,
         boolean useInferenceMetadataFieldsFormat
     ) throws IOException {
         List<Chunk> chunks = new ArrayList<>();
         Iterator<ChunkedInference.Chunk> it = results.chunksAsMatchedTextAndByteReference(contentType.xContent());
         while (it.hasNext()) {
-            chunks.add(toSemanticTextFieldChunk(input, it.next(), useInferenceMetadataFieldsFormat));
+            chunks.add(toSemanticTextFieldChunk(input, offsetAdjustment, it.next(), useInferenceMetadataFieldsFormat));
         }
         return chunks;
     }
 
-    public static Chunk toSemanticTextFieldChunk(String input, ChunkedInference.Chunk chunk, boolean useInferenceMetadataFieldsFormat) {
+    public static Chunk toSemanticTextFieldChunk(
+        String input,
+        int offsetAdjustment,
+        ChunkedInference.Chunk chunk,
+        boolean useInferenceMetadataFieldsFormat
+    ) {
         // TODO: Use offsets from ChunkedInferenceServiceResults
         // TODO: When using legacy semantic text format, build chunk text from offsets
         assert chunk.matchedText() != null; // TODO: Remove once offsets are available from chunk
-        int startOffset = useInferenceMetadataFieldsFormat ? input.indexOf(chunk.matchedText()) : -1;
+        int startOffset = useInferenceMetadataFieldsFormat ? input.indexOf(chunk.matchedText()) + offsetAdjustment : -1;
         return new Chunk(
             useInferenceMetadataFieldsFormat ? null : chunk.matchedText(),
             useInferenceMetadataFieldsFormat ? startOffset : -1,
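
The behavioral change is confined to the start-offset computation. As a sketch (hypothetical helper, not in the commit): indexOf locates the chunk inside its own input value, and the adjustment shifts that position into the coordinate space of the concatenated string.

// Hypothetical helper mirroring the ternary above, for illustration only.
static int adjustedStartOffset(String input, String matchedText, int offsetAdjustment) {
    // Position of the chunk within its own input value, shifted by where
    // that value starts in the concatenated string.
    return input.indexOf(matchedText) + offsetAdjustment;
}

For example, a 7-character first value plus one separator gives the second value an adjustment of 8, so a chunk found at index 4 of the second value is reported at offset 12 overall.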

x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldTests.java

Lines changed: 19 additions & 27 deletions
@@ -7,7 +7,6 @@
 
 package org.elasticsearch.xpack.inference.mapper;
 
-import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.xcontent.XContentHelper;
 import org.elasticsearch.index.IndexVersion;
@@ -37,10 +36,8 @@
 import java.util.Map;
 import java.util.function.Predicate;
 
-import static org.elasticsearch.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.CHUNKED_EMBEDDINGS_FIELD;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.toSemanticTextFieldChunk;
-import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.toSemanticTextFieldChunks;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;

@@ -236,31 +233,26 @@ public static SemanticTextField semanticTextFieldFromChunkedInferenceResults(
     ) throws IOException {
         final boolean useInferenceMetadataFields = InferenceMetadataFieldsMapper.isEnabled(indexVersion);
 
-        final List<SemanticTextField.Chunk> chunks;
-        if (useInferenceMetadataFields) {
-            // When using the inference metadata fields format, all the input values are concatenated so that the chunk offsets are
-            // expressed in the context of a single string
-            chunks = toSemanticTextFieldChunks(
-                Strings.collectionToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR)),
-                results,
-                contentType,
-                useInferenceMetadataFields
-            );
-        } else {
-            // When using the legacy format, each input value is processed using its own inference request.
-            // In this test framework, we don't perform "real" chunking; each input generates one chunk. Thus, we can assume there is a
-            // one-to-one relationship between inputs and chunks. Iterate over the inputs and chunks to match each input with its
-            // corresponding chunk.
-            chunks = new ArrayList<>(inputs.size());
-            Iterator<String> inputsIt = inputs.iterator();
-            Iterator<ChunkedInference.Chunk> chunkIt = results.chunksAsMatchedTextAndByteReference(contentType.xContent());
-            while (inputsIt.hasNext() && chunkIt.hasNext()) {
-                chunks.add(toSemanticTextFieldChunk(inputsIt.next(), chunkIt.next(), useInferenceMetadataFields));
-            }
+        // In this test framework, we don't perform "real" chunking; each input generates one chunk. Thus, we can assume there is a
+        // one-to-one relationship between inputs and chunks. Iterate over the inputs and chunks to match each input with its
+        // corresponding chunk.
+        final List<SemanticTextField.Chunk> chunks = new ArrayList<>(inputs.size());
+        int offsetAdjustment = 0;
+        Iterator<String> inputsIt = inputs.iterator();
+        Iterator<ChunkedInference.Chunk> chunkIt = results.chunksAsMatchedTextAndByteReference(contentType.xContent());
+        while (inputsIt.hasNext() && chunkIt.hasNext()) {
+            String input = inputsIt.next();
+            var chunk = chunkIt.next();
+            chunks.add(toSemanticTextFieldChunk(input, offsetAdjustment, chunk, useInferenceMetadataFields));
+
+            // When using the inference metadata fields format, all the input values are concatenated so that the
+            // chunk text offsets are expressed in the context of a single string. Calculate the offset adjustment
+            // to apply to account for this.
+            offsetAdjustment += input.length() + 1; // Add one for separator char length
+        }
 
-            if (inputsIt.hasNext() || chunkIt.hasNext()) {
-                throw new IllegalArgumentException("Input list size and chunk count do not match");
-            }
+        if (inputsIt.hasNext() || chunkIt.hasNext()) {
+            throw new IllegalArgumentException("Input list size and chunk count do not match");
         }
 
         return new SemanticTextField(
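
The test helper now mirrors the production path: inputs and chunks are walked in lockstep (this framework produces exactly one chunk per input) and a size mismatch fails fast. A self-contained sketch of that pairing pattern, with illustrative names (not part of the commit):

import java.util.Iterator;
import java.util.List;

class LockstepPairingSketch {
    // Pairs two lists element by element and rejects a size mismatch, the
    // same guard the test helper applies to inputs and chunks.
    static <A, B> void pairInLockstep(List<A> inputs, List<B> chunks) {
        Iterator<A> inputsIt = inputs.iterator();
        Iterator<B> chunkIt = chunks.iterator();
        while (inputsIt.hasNext() && chunkIt.hasNext()) {
            System.out.println(inputsIt.next() + " <-> " + chunkIt.next());
        }
        if (inputsIt.hasNext() || chunkIt.hasNext()) {
            throw new IllegalArgumentException("Input list size and chunk count do not match");
        }
    }
}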

0 commit comments