2525import  org .elasticsearch .action .update .UpdateRequest ;
2626import  org .elasticsearch .cluster .metadata .InferenceFieldMetadata ;
2727import  org .elasticsearch .cluster .service .ClusterService ;
28- import  org .elasticsearch .common .Strings ;
2928import  org .elasticsearch .common .util .concurrent .AtomicArray ;
3029import  org .elasticsearch .common .xcontent .support .XContentMapValues ;
3130import  org .elasticsearch .core .Nullable ;
5756import  java .util .Map ;
5857import  java .util .stream .Collectors ;
5958
60- import  static  org .elasticsearch .lucene .search .uhighlight .CustomUnifiedHighlighter .MULTIVAL_SEP_CHAR ;
61- 
6259/** 
6360 * A {@link MappedActionFilter} that intercepts {@link BulkShardRequest} to apply inference on fields specified 
6461 * as {@link SemanticTextFieldMapper} in the index mapping. For each semantic text field referencing fields in 
@@ -140,15 +137,17 @@ private record InferenceProvider(InferenceService service, Model model) {}
140137     * @param sourceField The source field. 
141138     * @param input The input to run inference on. 
142139     * @param inputOrder The original order of the input. 
140+      * @param offsetAdjustment The adjustment to apply to the chunk text offsets. 
143141     */ 
144-     private  record  FieldInferenceRequest (int  index , String  field , String  sourceField , String  input , int  inputOrder ) {}
142+     private  record  FieldInferenceRequest (int  index , String  field , String  sourceField , String  input , int  inputOrder ,  int   offsetAdjustment ) {}
145143
146144    /** 
147145     * The field inference response. 
148146     * @param field The target field. 
149147     * @param sourceField The source field. 
150148     * @param input The input that was used to run inference. 
151149     * @param inputOrder The original order of the input. 
150+      * @param offsetAdjustment The adjustment to apply to the chunk text offsets. 
152151     * @param model The model used to run inference. 
153152     * @param chunkedResults The actual results. 
154153     */ 
@@ -157,6 +156,7 @@ private record FieldInferenceResponse(
157156        String  sourceField ,
158157        String  input ,
159158        int  inputOrder ,
159+         int  offsetAdjustment ,
160160        Model  model ,
161161        ChunkedInference  chunkedResults 
162162    ) {}
@@ -317,6 +317,7 @@ public void onResponse(List<ChunkedInference> results) {
317317                                        request .sourceField (),
318318                                        request .input (),
319319                                        request .inputOrder (),
320+                                         request .offsetAdjustment (),
320321                                        inferenceProvider .model ,
321322                                        result 
322323                                    )
@@ -402,6 +403,7 @@ private void applyInferenceResponses(BulkItemRequest item, FieldInferenceRespons
402403                    lst .addAll (
403404                        SemanticTextField .toSemanticTextFieldChunks (
404405                            resp .input ,
406+                             resp .offsetAdjustment ,
405407                            resp .chunkedResults ,
406408                            indexRequest .getContentType (),
407409                            addMetadataField 
@@ -528,16 +530,14 @@ private Map<String, List<FieldInferenceRequest>> createFieldInferenceRequests(Bu
528530                        }
529531
530532                        List <FieldInferenceRequest > fieldRequests  = fieldRequestsMap .computeIfAbsent (inferenceId , k  -> new  ArrayList <>());
531-                         if  (useInferenceMetadataFieldsFormat ) {
532-                             // When using the inference metadata fields format, all the input values are concatenated so that the chunk 
533-                             // offsets are expressed in the context of a single string 
534-                             String  concatenatedValue  = Strings .collectionToDelimitedString (values , String .valueOf (MULTIVAL_SEP_CHAR ));
535-                             fieldRequests .add (new  FieldInferenceRequest (itemIndex , field , sourceField , concatenatedValue , order ++));
536-                         } else  {
537-                             // When using the legacy format, each input value is processed using its own inference request 
538-                             for  (String  v  : values ) {
539-                                 fieldRequests .add (new  FieldInferenceRequest (itemIndex , field , sourceField , v , order ++));
540-                             }
533+                         int  offsetAdjustment  = 0 ;
534+                         for  (String  v  : values ) {
535+                             fieldRequests .add (new  FieldInferenceRequest (itemIndex , field , sourceField , v , order ++, offsetAdjustment ));
536+ 
537+                             // When using the inference metadata fields format, all the input values are concatenated so that the 
538+                             // chunk text offsets are expressed in the context of a single string. Calculate the offset adjustment 
539+                             // to apply to account for this. 
540+                             offsetAdjustment  += v .length () + 1 ; // Add one for separator char length 
541541                        }
542542                    }
543543                }
0 commit comments