Commit 3471987
Add support for retrieving semantic_text's indexed chunks via fields API (#132410)
Introduces the `"format": "chunks"` option for the `fields` parameter in `_search` requests. Allows users to retrieve the original text chunks generated by a semantic field's chunking strategy. Example usage:

```
POST test-index/_search
{
  "query": {
    "ids": {
      "values": ["1"]
    }
  },
  "fields": [
    {
      "field": "semantic_text_field",
      "format": "chunks" <1>
    }
  ]
}
```
1 parent 0d7a2cc commit 3471987

File tree

6 files changed: +219 −45 lines changed

docs/changelog/132410.yaml

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+pr: 132410
+summary: Add support for retrieving semantic_text's indexed chunks via fields API
+area: Vector Search
+type: feature
+issues: []

docs/reference/elasticsearch/mapping-reference/semantic-text.md

Lines changed: 53 additions & 21 deletions

@@ -282,6 +282,34 @@ PUT test-index/_doc/1
 * Others (such as `elastic` and `elasticsearch`) will automatically truncate
 the input.

+## Retrieving indexed chunks
+```{applies_to}
+stack: ga 9.2
+serverless: ga
+```
+
+You can retrieve the individual chunks generated by your semantic field’s chunking
+strategy using the [fields parameter](/reference/elasticsearch/rest-apis/retrieve-selected-fields.md#search-fields-param):
+
+```console
+POST test-index/_search
+{
+  "query": {
+    "ids" : {
+      "values" : ["1"]
+    }
+  },
+  "fields": [
+    {
+      "field": "semantic_text_field",
+      "format": "chunks" <1>
+    }
+  ]
+}
+```
+
+1. Use `"format": "chunks"` to return the field’s text as the original text chunks that were indexed.
+
 ## Extracting relevant fragments from semantic text [semantic-text-highlighting]

 You can extract the most relevant fragments from a semantic text field by using
@@ -311,27 +339,6 @@ POST test-index/_search
 2. Sorts the most relevant highlighted fragments by score when set to `score`. By default,
 fragments will be output in the order they appear in the field (order: none).

-To use the `semantic` highlighter to view chunks in the order which they were indexed with no scoring,
-use the `match_all` query to retrieve them in the order they appear in the document:
-
-```console
-POST test-index/_search
-{
-  "query": {
-    "match_all": {}
-  },
-  "highlight": {
-    "fields": {
-      "my_semantic_field": {
-        "number_of_fragments": 5 <1>
-      }
-    }
-  }
-}
-```
-
-1. This will return the first 5 chunks, set this number higher to retrieve more chunks.
-
 Highlighting is supported on fields other than semantic_text. However, if you
 want to restrict highlighting to the semantic highlighter and return no
 fragments when the field is not of type semantic_text, you can explicitly
@@ -359,6 +366,31 @@ PUT test-index

 1. Ensures that highlighting is applied exclusively to semantic_text fields.

+To retrieve all fragments from the `semantic` highlighter in their original indexing order
+without scoring, use a `match_all` query as the `highlight_query`.
+This ensures fragments are returned in the order they appear in the document:
+
+```console
+POST test-index/_search
+{
+  "query": {
+    "ids": {
+      "values": ["1"]
+    }
+  },
+  "highlight": {
+    "fields": {
+      "my_semantic_field": {
+        "number_of_fragments": 5, <1>
+        "highlight_query": { "match_all": {} }
+      }
+    }
+  }
+}
+```
+
+1. Returns the first 5 fragments. Increase this value to retrieve additional fragments.
+
 ## Updates and partial updates for `semantic_text` fields [semantic-text-updates]

 When updating documents that contain `semantic_text` fields, it’s important to understand how inference is triggered:
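Conceptually, the `"format": "chunks"` option re-derives each chunk from the original source text using stored start/end offsets rather than duplicating the chunk text in the index. A minimal Python sketch of that idea (the helper name and offsets here are illustrative, not Elasticsearch internals):

```python
# Illustrative sketch: chunks are stored as (start, end) offsets into the
# original field text, and retrieval slices the source string back out.
def chunks_from_offsets(source_text: str, offsets: list[tuple[int, int]]) -> list[str]:
    """Return the indexed chunks of source_text given offset ranges."""
    return [source_text[start:end] for start, end in offsets]

text = "Elasticsearch stores chunk offsets. Retrieval slices the source."
offsets = [(0, 35), (36, 65)]
print(chunks_from_offsets(text, offsets))
```

Because only offsets are stored, the response always reflects the exact text that was chunked at index time.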

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java

Lines changed: 3 additions & 1 deletion

@@ -47,6 +47,7 @@ public class InferenceFeatures implements FeatureSpecification {
     private static final NodeFeature SEMANTIC_TEXT_MATCH_ALL_HIGHLIGHTER = new NodeFeature("semantic_text.match_all_highlighter");
     private static final NodeFeature COHERE_V2_API = new NodeFeature("inference.cohere.v2");
     public static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTING_FLAT = new NodeFeature("semantic_text.highlighter.flat_index_options");
+    private static final NodeFeature SEMANTIC_TEXT_FIELDS_CHUNKS_FORMAT = new NodeFeature("semantic_text.fields_chunks_format");

     @Override
     public Set<NodeFeature> getTestFeatures() {
@@ -80,7 +81,8 @@ public Set<NodeFeature> getTestFeatures() {
                 SEMANTIC_TEXT_INDEX_OPTIONS_WITH_DEFAULTS,
                 SEMANTIC_QUERY_REWRITE_INTERCEPTORS_PROPAGATE_BOOST_AND_QUERY_NAME_FIX,
                 SEMANTIC_TEXT_HIGHLIGHTING_FLAT,
-                SEMANTIC_TEXT_SPARSE_VECTOR_INDEX_OPTIONS
+                SEMANTIC_TEXT_SPARSE_VECTOR_INDEX_OPTIONS,
+                SEMANTIC_TEXT_FIELDS_CHUNKS_FORMAT
             )
         );
         if (RERANK_SNIPPETS.isEnabled()) {

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldsMapper.java

Lines changed: 1 addition & 1 deletion

@@ -66,7 +66,7 @@ public ValueFetcher valueFetcher(MappingLookup mappingLookup, Function<Query, Bi
         for (var inferenceField : mappingLookup.inferenceFields().keySet()) {
             MappedFieldType ft = mappingLookup.getFieldType(inferenceField);
             if (ft instanceof SemanticTextFieldMapper.SemanticTextFieldType semanticTextFieldType) {
-                fieldFetchers.put(inferenceField, semanticTextFieldType.valueFetcherWithInferenceResults(bitSetCache, searcher));
+                fieldFetchers.put(inferenceField, semanticTextFieldType.valueFetcherWithInferenceResults(bitSetCache, searcher, false));
             } else {
                 throw new IllegalArgumentException(
                     "Invalid inference field [" + ft.name() + "]. Expected field type [semantic_text] but got [" + ft.typeName() + "]"

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java

Lines changed: 84 additions & 22 deletions

@@ -89,6 +89,7 @@
 import java.io.UncheckedIOException;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
@@ -104,6 +105,7 @@
 import static org.elasticsearch.index.IndexVersions.SEMANTIC_TEXT_DEFAULTS_TO_BBQ_BACKPORT_8_X;
 import static org.elasticsearch.inference.TaskType.SPARSE_EMBEDDING;
 import static org.elasticsearch.inference.TaskType.TEXT_EMBEDDING;
+import static org.elasticsearch.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
 import static org.elasticsearch.search.SearchService.DEFAULT_SIZE;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.CHUNKED_EMBEDDINGS_FIELD;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.CHUNKED_OFFSET_FIELD;
@@ -864,14 +866,26 @@ public Query existsQuery(SearchExecutionContext context) {

     @Override
     public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
+        if (format != null && "chunks".equals(format) == false) {
+            throw new IllegalArgumentException(
+                "Unknown format [" + format + "] for field [" + name() + "], only [chunks] is supported."
+            );
+        }
+        if (format != null) {
+            return valueFetcherWithInferenceResults(getChunksField().bitsetProducer(), context.searcher(), true);
+        }
         if (useLegacyFormat) {
             // Redirect the fetcher to load the original values of the field
             return SourceValueFetcher.toString(getOriginalTextFieldName(name()), context, format);
         }
         return SourceValueFetcher.toString(name(), context, null);
     }

-    ValueFetcher valueFetcherWithInferenceResults(Function<Query, BitSetProducer> bitSetCache, IndexSearcher searcher) {
+    ValueFetcher valueFetcherWithInferenceResults(
+        Function<Query, BitSetProducer> bitSetCache,
+        IndexSearcher searcher,
+        boolean onlyTextChunks
+    ) {
         var embeddingsField = getEmbeddingsField();
         if (embeddingsField == null) {
             return ValueFetcher.EMPTY;
@@ -884,7 +898,7 @@ ValueFetcher valueFetcherWithInferenceResults(Function<Query, BitSetProducer> bi
                 org.apache.lucene.search.ScoreMode.COMPLETE_NO_SCORES,
                 1
             );
-            return new SemanticTextFieldValueFetcher(bitSetFilter, childWeight, embeddingsLoader);
+            return new SemanticTextFieldValueFetcher(bitSetFilter, childWeight, embeddingsLoader, onlyTextChunks);
         } catch (IOException exc) {
             throw new UncheckedIOException(exc);
         }
@@ -1022,6 +1036,7 @@ private class SemanticTextFieldValueFetcher implements ValueFetcher {
         private final BitSetProducer parentBitSetProducer;
         private final Weight childWeight;
         private final SourceLoader.SyntheticFieldLoader fieldLoader;
+        private final boolean onlyTextChunks;

         private BitSet bitSet;
         private Scorer childScorer;
@@ -1031,11 +1046,13 @@ private class SemanticTextFieldValueFetcher implements ValueFetcher {
         private SemanticTextFieldValueFetcher(
             BitSetProducer bitSetProducer,
             Weight childWeight,
-            SourceLoader.SyntheticFieldLoader fieldLoader
+            SourceLoader.SyntheticFieldLoader fieldLoader,
+            boolean onlyTextChunks
         ) {
             this.parentBitSetProducer = bitSetProducer;
             this.childWeight = childWeight;
             this.fieldLoader = fieldLoader;
+            this.onlyTextChunks = onlyTextChunks;
         }

         @Override
@@ -1046,7 +1063,9 @@ public void setNextReader(LeafReaderContext context) {
             if (childScorer != null) {
                 childScorer.iterator().nextDoc();
             }
-            dvLoader = fieldLoader.docValuesLoader(context.reader(), null);
+            if (onlyTextChunks == false) {
+                dvLoader = fieldLoader.docValuesLoader(context.reader(), null);
+            }
             var terms = context.reader().terms(getOffsetsFieldName(name()));
             offsetsLoader = terms != null ? OffsetSourceField.loader(terms) : null;
         } catch (IOException exc) {
@@ -1064,35 +1083,46 @@ public List<Object> fetchValues(Source source, int doc, List<Object> ignoredValu
             if (it.docID() < previousParent) {
                 it.advance(previousParent);
             }
+
+            return onlyTextChunks ? fetchTextChunks(source, doc, it) : fetchFullField(source, doc, it);
+        }
+
+        private List<Object> fetchTextChunks(Source source, int doc, DocIdSetIterator it) throws IOException {
+            Map<String, String> originalValueMap = new HashMap<>();
+            List<Object> chunks = new ArrayList<>();
+
+            iterateChildDocs(doc, it, offset -> {
+                var rawValue = originalValueMap.computeIfAbsent(offset.field(), k -> {
+                    var valueObj = XContentMapValues.extractValue(offset.field(), source.source(), null);
+                    var values = SemanticTextUtils.nodeStringValues(offset.field(), valueObj).stream().toList();
+                    return Strings.collectionToDelimitedString(values, String.valueOf(MULTIVAL_SEP_CHAR));
+                });
+
+                chunks.add(rawValue.substring(offset.start(), offset.end()));
+            });
+
+            return chunks;
+        }
+
+        private List<Object> fetchFullField(Source source, int doc, DocIdSetIterator it) throws IOException {
             Map<String, List<SemanticTextField.Chunk>> chunkMap = new LinkedHashMap<>();
-            while (it.docID() < doc) {
-                if (dvLoader == null || dvLoader.advanceToDoc(it.docID()) == false) {
-                    throw new IllegalStateException(
-                        "Cannot fetch values for field [" + name() + "], missing embeddings for doc [" + doc + "]"
-                    );
-                }
-                var offset = offsetsLoader.advanceTo(it.docID());
-                if (offset == null) {
-                    throw new IllegalStateException(
-                        "Cannot fetch values for field [" + name() + "], missing offsets for doc [" + doc + "]"
-                    );
-                }
-                var chunks = chunkMap.computeIfAbsent(offset.field(), k -> new ArrayList<>());
-                chunks.add(
+
+            iterateChildDocs(doc, it, offset -> {
+                var fullChunks = chunkMap.computeIfAbsent(offset.field(), k -> new ArrayList<>());
+                fullChunks.add(
                     new SemanticTextField.Chunk(
                         null,
                         offset.start(),
                         offset.end(),
                         rawEmbeddings(fieldLoader::write, source.sourceContentType())
                     )
                 );
-                if (it.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
-                    break;
-                }
-            }
+            });
+
             if (chunkMap.isEmpty()) {
                 return List.of();
             }
+
             return List.of(
                 new SemanticTextField(
                     useLegacyFormat,
@@ -1104,6 +1134,38 @@ public List<Object> fetchValues(Source source, int doc, List<Object> ignoredValu
             );
         }

+        /**
+         * Iterates over all child documents for the given doc and applies the provided action for each valid offset.
+         */
+        private void iterateChildDocs(
+            int doc,
+            DocIdSetIterator it,
+            CheckedConsumer<OffsetSourceFieldMapper.OffsetSource, IOException> action
+        ) throws IOException {
+            while (it.docID() < doc) {
+                if (onlyTextChunks == false) {
+                    if (dvLoader == null || dvLoader.advanceToDoc(it.docID()) == false) {
+                        throw new IllegalStateException(
+                            "Cannot fetch values for field [" + name() + "], missing embeddings for doc [" + doc + "]"
+                        );
+                    }
+                }
+
+                var offset = offsetsLoader.advanceTo(it.docID());
+                if (offset == null) {
+                    throw new IllegalStateException(
+                        "Cannot fetch values for field [" + name() + "], missing offsets for doc [" + doc + "]"
+                    );
+                }
+
+                action.accept(offset);
+
+                if (it.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
+                    break;
+                }
+            }
+        }
+
         private BytesReference rawEmbeddings(CheckedConsumer<XContentBuilder, IOException> writer, XContentType xContentType)
             throws IOException {
             try (var result = XContentFactory.contentBuilder(xContentType)) {
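The `fetchTextChunks` path above joins all values of a multi-valued field with a separator character (`MULTIVAL_SEP_CHAR`), so a single `(start, end)` offset pair can address a chunk in any of the values. A hedged Python sketch of that join-then-slice behavior (the separator value and function names are assumptions for illustration):

```python
# Sketch of multi-value handling: field values are joined with a separator
# character, offsets are recorded against the joined string, and each chunk
# is recovered as a substring of that joined string.
SEP = "\u0001"  # stand-in for a multi-value separator like MULTIVAL_SEP_CHAR

def join_values(values: list[str]) -> str:
    """Join field values the way the fetcher builds its raw value string."""
    return SEP.join(values)

def fetch_text_chunks(values: list[str], offsets: list[tuple[int, int]]) -> list[str]:
    raw = join_values(values)
    return [raw[start:end] for start, end in offsets]

values = ["some test data", "now with chunks"]
# Offsets computed against the joined string "some test data\u0001now with chunks"
offsets = [(0, 14), (15, 30)]
print(fetch_text_chunks(values, offsets))
```

Caching the joined string per source field (as `originalValueMap` does in the Java code) avoids re-extracting and re-joining the values for every chunk.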

x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml

Lines changed: 73 additions & 0 deletions

@@ -671,3 +671,76 @@ setup:
   - length: { hits.hits.0.highlight.bbq_hnsw_field: 1 }
   - match: { hits.hits.0.highlight.bbq_hnsw_field.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }

+---
+"Retrieve chunks with the fields api":
+  - requires:
+      cluster_features: "semantic_text.fields_chunks_format"
+      reason: semantic text field supports retrieving chunks through fields API in 9.2.0.
+
+  - do:
+      indices.create:
+        index: test-index-sparse
+        body:
+          settings:
+            index.mapping.semantic_text.use_legacy_format: false
+          mappings:
+            properties:
+              semantic_text_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+              text_field:
+                type: text
+                copy_to: ["semantic_text_field"]
+
+  - do:
+      index:
+        index: test-index-sparse
+        id: doc_1
+        body:
+          semantic_text_field: [ "some test data", " ", "now with chunks" ]
+          text_field: "text field data"
+        refresh: true
+
+  - do:
+      search:
+        index: test-index-sparse
+        body:
+          query:
+            match_all: { }
+          fields: [{"field": "semantic_text_field", "format": "chunks"}]
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.fields.semantic_text_field: 3 }
+  - match: { hits.hits.0.fields.semantic_text_field.0: "some test data" }
+  - match: { hits.hits.0.fields.semantic_text_field.1: "now with chunks" }
+  - match: { hits.hits.0.fields.semantic_text_field.2: "text field data" }
+
+---
+"Highlighting with match_all in a highlight_query":
+  - requires:
+      cluster_features: "semantic_text.match_all_highlighter"
+      reason: semantic text field supports match_all query with semantic highlighter, effective from 8.19 and 9.1.0.
+
+  - do:
+      search:
+        index: test-sparse-index
+        body:
+          query:
+            ids: {
+              values: ["doc_1"]
+            }
+          highlight:
+            fields:
+              body:
+                type: "semantic"
+                number_of_fragments: 2
+                highlight_query: {
+                  match_all: {}
+                }
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.body: 2 }
+  - match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
+  - match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }
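The first test above expects exactly three chunks from the inputs `["some test data", " ", "now with chunks"]` plus the `copy_to` value `"text field data"`: the whitespace-only value contributes no chunk. A sketch of that expectation (the blank-value filtering rule is an assumption made for illustration, not the documented chunking strategy):

```python
# Illustrative model of the test's expectation: each non-blank input value,
# including values copied in via copy_to, yields one indexed chunk.
def expected_chunks(semantic_values: list[str], copied_values: list[str]) -> list[str]:
    return [v for v in semantic_values + copied_values if v.strip()]

chunks = expected_chunks(["some test data", " ", "now with chunks"], ["text field data"])
print(chunks)
```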
