diff --git a/docs/changelog/132410.yaml b/docs/changelog/132410.yaml new file mode 100644 index 0000000000000..a8a3a335c1df1 --- /dev/null +++ b/docs/changelog/132410.yaml @@ -0,0 +1,5 @@ +pr: 132410 +summary: Add support for retrieving semantic_text's indexed chunks via fields API +area: Vector Search +type: feature +issues: [] diff --git a/docs/reference/elasticsearch/mapping-reference/semantic-text.md b/docs/reference/elasticsearch/mapping-reference/semantic-text.md index 70f94d6b14539..2655ae8bc8bd0 100644 --- a/docs/reference/elasticsearch/mapping-reference/semantic-text.md +++ b/docs/reference/elasticsearch/mapping-reference/semantic-text.md @@ -282,6 +282,34 @@ PUT test-index/_doc/1 * Others (such as `elastic` and `elasticsearch`) will automatically truncate the input. +## Retrieving indexed chunks +```{applies_to} +stack: ga 9.2 +serverless: ga +``` + +You can retrieve the individual chunks generated by your semantic field’s chunking +strategy using the [fields parameter](/reference/elasticsearch/rest-apis/retrieve-selected-fields.md#search-fields-param): + +```console +POST test-index/_search +{ + "query": { + "ids" : { + "values" : ["1"] + } + }, + "fields": [ + { + "field": "semantic_text_field", + "format": "chunks" <1> + } + ] +} +``` + +1. Use `"format": "chunks"` to return the field’s text as the original text chunks that were indexed. + ## Extracting relevant fragments from semantic text [semantic-text-highlighting] You can extract the most relevant fragments from a semantic text field by using @@ -311,27 +339,6 @@ POST test-index/_search 2. Sorts the most relevant highlighted fragments by score when set to `score`. By default, fragments will be output in the order they appear in the field (order: none). -To use the `semantic` highlighter to view chunks in the order which they were indexed with no scoring, -use the `match_all` query to retrieve them in the order they appear in the document: - -```console -POST test-index/_search -{ - "query": { - "match_all": {} - }, - "highlight": { - "fields": { - "my_semantic_field": { - "number_of_fragments": 5 <1> - } - } - } -} -``` - -1. This will return the first 5 chunks, set this number higher to retrieve more chunks. - Highlighting is supported on fields other than semantic_text. However, if you want to restrict highlighting to the semantic highlighter and return no fragments when the field is not of type semantic_text, you can explicitly @@ -359,6 +366,31 @@ PUT test-index 1. Ensures that highlighting is applied exclusively to semantic_text fields. +To retrieve all fragments from the `semantic` highlighter in their original indexing order +without scoring, use a `match_all` query as the `highlight_query`. +This ensures fragments are returned in the order they appear in the document: + +```console +POST test-index/_search +{ + "query": { + "ids": { + "values": ["1"] + } + }, + "highlight": { + "fields": { + "my_semantic_field": { + "number_of_fragments": 5, <1> + "highlight_query": { "match_all": {} } + } + } + } +} +``` + +1. Returns the first 5 fragments. Increase this value to retrieve additional fragments. + ## Updates and partial updates for `semantic_text` fields [semantic-text-updates] When updating documents that contain `semantic_text` fields, it’s important to understand how inference is triggered: diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java index 996f4e601289a..fd160ae10fa6f 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java @@ -47,6 +47,7 @@ public class InferenceFeatures implements FeatureSpecification { private static final NodeFeature SEMANTIC_TEXT_MATCH_ALL_HIGHLIGHTER = new NodeFeature("semantic_text.match_all_highlighter"); private static final NodeFeature COHERE_V2_API = new NodeFeature("inference.cohere.v2"); public static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTING_FLAT = new NodeFeature("semantic_text.highlighter.flat_index_options"); + private static final NodeFeature SEMANTIC_TEXT_FIELDS_CHUNKS_FORMAT = new NodeFeature("semantic_text.fields_chunks_format"); @Override public Set getTestFeatures() { @@ -80,7 +81,8 @@ public Set getTestFeatures() { SEMANTIC_TEXT_INDEX_OPTIONS_WITH_DEFAULTS, SEMANTIC_QUERY_REWRITE_INTERCEPTORS_PROPAGATE_BOOST_AND_QUERY_NAME_FIX, SEMANTIC_TEXT_HIGHLIGHTING_FLAT, - SEMANTIC_TEXT_SPARSE_VECTOR_INDEX_OPTIONS + SEMANTIC_TEXT_SPARSE_VECTOR_INDEX_OPTIONS, + SEMANTIC_TEXT_FIELDS_CHUNKS_FORMAT ) ); if (RERANK_SNIPPETS.isEnabled()) { diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldsMapper.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldsMapper.java index 607bc4f480ccd..4798dd81bc3b3 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldsMapper.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldsMapper.java @@ -66,7 +66,7 @@ public ValueFetcher valueFetcher(MappingLookup mappingLookup, Function bitSetCache, IndexSearcher searcher) { + ValueFetcher valueFetcherWithInferenceResults( + Function bitSetCache, + IndexSearcher searcher, + boolean onlyTextChunks + ) { var embeddingsField = getEmbeddingsField(); if (embeddingsField == null) { return ValueFetcher.EMPTY; @@ -884,7 +898,7 @@ ValueFetcher valueFetcherWithInferenceResults(Function bi org.apache.lucene.search.ScoreMode.COMPLETE_NO_SCORES, 1 ); - return new SemanticTextFieldValueFetcher(bitSetFilter, childWeight, embeddingsLoader); + return new SemanticTextFieldValueFetcher(bitSetFilter, childWeight, embeddingsLoader, onlyTextChunks); } catch (IOException exc) { throw new UncheckedIOException(exc); } @@ -1022,6 +1036,7 @@ private class SemanticTextFieldValueFetcher implements ValueFetcher { private final BitSetProducer parentBitSetProducer; private final Weight childWeight; private final SourceLoader.SyntheticFieldLoader fieldLoader; + private final boolean onlyTextChunks; private BitSet bitSet; private Scorer childScorer; @@ -1031,11 +1046,13 @@ private class SemanticTextFieldValueFetcher implements ValueFetcher { private SemanticTextFieldValueFetcher( BitSetProducer bitSetProducer, Weight childWeight, - SourceLoader.SyntheticFieldLoader fieldLoader + SourceLoader.SyntheticFieldLoader fieldLoader, + boolean onlyTextChunks ) { this.parentBitSetProducer = bitSetProducer; this.childWeight = childWeight; this.fieldLoader = fieldLoader; + this.onlyTextChunks = onlyTextChunks; } @Override @@ -1046,7 +1063,9 @@ public void setNextReader(LeafReaderContext context) { if (childScorer != null) { childScorer.iterator().nextDoc(); } - dvLoader = fieldLoader.docValuesLoader(context.reader(), null); + if (onlyTextChunks == false) { + dvLoader = fieldLoader.docValuesLoader(context.reader(), null); + } var terms = context.reader().terms(getOffsetsFieldName(name())); offsetsLoader = terms != null ? OffsetSourceField.loader(terms) : null; } catch (IOException exc) { @@ -1064,21 +1083,33 @@ public List fetchValues(Source source, int doc, List ignoredValu if (it.docID() < previousParent) { it.advance(previousParent); } + + return onlyTextChunks ? fetchTextChunks(source, doc, it) : fetchFullField(source, doc, it); + } + + private List fetchTextChunks(Source source, int doc, DocIdSetIterator it) throws IOException { + Map originalValueMap = new HashMap<>(); + List chunks = new ArrayList<>(); + + iterateChildDocs(doc, it, offset -> { + var rawValue = originalValueMap.computeIfAbsent(offset.field(), k -> { + var valueObj = XContentMapValues.extractValue(offset.field(), source.source(), null); + var values = SemanticTextUtils.nodeStringValues(offset.field(), valueObj).stream().toList(); + return Strings.collectionToDelimitedString(values, String.valueOf(MULTIVAL_SEP_CHAR)); + }); + + chunks.add(rawValue.substring(offset.start(), offset.end())); + }); + + return chunks; + } + + private List fetchFullField(Source source, int doc, DocIdSetIterator it) throws IOException { Map> chunkMap = new LinkedHashMap<>(); - while (it.docID() < doc) { - if (dvLoader == null || dvLoader.advanceToDoc(it.docID()) == false) { - throw new IllegalStateException( - "Cannot fetch values for field [" + name() + "], missing embeddings for doc [" + doc + "]" - ); - } - var offset = offsetsLoader.advanceTo(it.docID()); - if (offset == null) { - throw new IllegalStateException( - "Cannot fetch values for field [" + name() + "], missing offsets for doc [" + doc + "]" - ); - } - var chunks = chunkMap.computeIfAbsent(offset.field(), k -> new ArrayList<>()); - chunks.add( + + iterateChildDocs(doc, it, offset -> { + var fullChunks = chunkMap.computeIfAbsent(offset.field(), k -> new ArrayList<>()); + fullChunks.add( new SemanticTextField.Chunk( null, offset.start(), @@ -1086,13 +1117,12 @@ public List fetchValues(Source source, int doc, List ignoredValu rawEmbeddings(fieldLoader::write, source.sourceContentType()) ) ); - if (it.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) { - break; - } - } + }); + if (chunkMap.isEmpty()) { return List.of(); } + return List.of( new SemanticTextField( useLegacyFormat, @@ -1104,6 +1134,38 @@ public List fetchValues(Source source, int doc, List ignoredValu ); } + /** + * Iterates over all child documents for the given doc and applies the provided action for each valid offset. + */ + private void iterateChildDocs( + int doc, + DocIdSetIterator it, + CheckedConsumer action + ) throws IOException { + while (it.docID() < doc) { + if (onlyTextChunks == false) { + if (dvLoader == null || dvLoader.advanceToDoc(it.docID()) == false) { + throw new IllegalStateException( + "Cannot fetch values for field [" + name() + "], missing embeddings for doc [" + doc + "]" + ); + } + } + + var offset = offsetsLoader.advanceTo(it.docID()); + if (offset == null) { + throw new IllegalStateException( + "Cannot fetch values for field [" + name() + "], missing offsets for doc [" + doc + "]" + ); + } + + action.accept(offset); + + if (it.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) { + break; + } + } + } + private BytesReference rawEmbeddings(CheckedConsumer writer, XContentType xContentType) throws IOException { try (var result = XContentFactory.contentBuilder(xContentType)) { diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml index 60dea800ca624..2827ca2ba479a 100644 --- a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml +++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml @@ -671,3 +671,76 @@ setup: - length: { hits.hits.0.highlight.bbq_hnsw_field: 1 } - match: { hits.hits.0.highlight.bbq_hnsw_field.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." } +--- +"Retrieve chunks with the fields api": + - requires: + cluster_features: "semantic_text.fields_chunks_format" + reason: semantic text field supports retrieving chunks through fields API in 9.2.0. + + - do: + indices.create: + index: test-index-sparse + body: + settings: + index.mapping.semantic_text.use_legacy_format: false + mappings: + properties: + semantic_text_field: + type: semantic_text + inference_id: sparse-inference-id + text_field: + type: text + copy_to: ["semantic_text_field"] + + - do: + index: + index: test-index-sparse + id: doc_1 + body: + semantic_text_field: [ "some test data", " ", "now with chunks" ] + text_field: "text field data" + refresh: true + + - do: + search: + index: test-index-sparse + body: + query: + match_all: { } + fields: [{"field": "semantic_text_field", "format": "chunks"}] + + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "doc_1" } + - length: { hits.hits.0.fields.semantic_text_field: 3 } + - match: { hits.hits.0.fields.semantic_text_field.0: "some test data" } + - match: { hits.hits.0.fields.semantic_text_field.1: "now with chunks" } + - match: { hits.hits.0.fields.semantic_text_field.2: "text field data" } + +--- +"Highlighting with match_all in a highlight_query": + - requires: + cluster_features: "semantic_text.match_all_highlighter" + reason: semantic text field supports match_all query with semantic highlighter, effective from 8.19 and 9.1.0. + + - do: + search: + index: test-sparse-index + body: + query: + ids: { + values: ["doc_1"] + } + highlight: + fields: + body: + type: "semantic" + number_of_fragments: 2 + highlight_query: { + match_all: {} + } + + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "doc_1" } + - length: { hits.hits.0.highlight.body: 2 } + - match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." } + - match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }