5 changes: 5 additions & 0 deletions docs/changelog/132410.yaml
@@ -0,0 +1,5 @@
pr: 132410
summary: Add support for retrieving semantic_text's indexed chunks via fields API
area: Vector Search
type: feature
issues: []
45 changes: 24 additions & 21 deletions docs/reference/elasticsearch/mapping-reference/semantic-text.md
@@ -282,6 +282,30 @@ PUT test-index/_doc/1
* Others (such as `elastic` and `elasticsearch`) will automatically truncate
the input.

## Retrieving indexed chunks

You can retrieve the individual chunks generated by a `semantic_text` field's chunking
strategy using the [fields parameter](/reference/elasticsearch/rest-apis/retrieve-selected-fields.md#search-fields-param):

```console
POST test-index/_search
{
"query": {
"ids" : {
"values" : ["1"]
}
},
"fields": [
{
"field": "semantic_text_field",
"format": "chunks" <1>
}
]
}
```

1. Use `"format": "chunks"` to return the field’s text as the original text chunks that were indexed.

## Extracting relevant fragments from semantic text [semantic-text-highlighting]

You can extract the most relevant fragments from a semantic text field by using
@@ -311,27 +335,6 @@ POST test-index/_search
2. Sorts the most relevant highlighted fragments by score when set to `score`. By default,
fragments will be output in the order they appear in the field (order: none).

To use the `semantic` highlighter to view chunks in the order which they were indexed with no scoring,
use the `match_all` query to retrieve them in the order they appear in the document:

```console
POST test-index/_search
{
"query": {
"match_all": {}
},
"highlight": {
"fields": {
"my_semantic_field": {
"number_of_fragments": 5 <1>
}
}
}
}
```

1. This will return the first 5 chunks, set this number higher to retrieve more chunks.

Highlighting is supported on fields other than semantic_text. However, if you
want to restrict highlighting to the semantic highlighter and return no
fragments when the field is not of type semantic_text, you can explicitly
@@ -47,6 +47,7 @@ public class InferenceFeatures implements FeatureSpecification {
private static final NodeFeature SEMANTIC_TEXT_MATCH_ALL_HIGHLIGHTER = new NodeFeature("semantic_text.match_all_highlighter");
private static final NodeFeature COHERE_V2_API = new NodeFeature("inference.cohere.v2");
public static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTING_FLAT = new NodeFeature("semantic_text.highlighter.flat_index_options");
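    // Gates REST tests that exercise the fields API "chunks" value format on semantic_text fields.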
private static final NodeFeature SEMANTIC_TEXT_FIELDS_CHUNKS_FORMAT = new NodeFeature("semantic_text.fields_chunks_format");

@Override
public Set<NodeFeature> getTestFeatures() {
@@ -80,7 +81,8 @@ public Set<NodeFeature> getTestFeatures() {
SEMANTIC_TEXT_INDEX_OPTIONS_WITH_DEFAULTS,
SEMANTIC_QUERY_REWRITE_INTERCEPTORS_PROPAGATE_BOOST_AND_QUERY_NAME_FIX,
SEMANTIC_TEXT_HIGHLIGHTING_FLAT,
SEMANTIC_TEXT_SPARSE_VECTOR_INDEX_OPTIONS
SEMANTIC_TEXT_SPARSE_VECTOR_INDEX_OPTIONS,
SEMANTIC_TEXT_FIELDS_CHUNKS_FORMAT
)
);
if (RERANK_SNIPPETS.isEnabled()) {
@@ -66,7 +66,7 @@ public ValueFetcher valueFetcher(MappingLookup mappingLookup, Function<Query, Bi
for (var inferenceField : mappingLookup.inferenceFields().keySet()) {
MappedFieldType ft = mappingLookup.getFieldType(inferenceField);
if (ft instanceof SemanticTextFieldMapper.SemanticTextFieldType semanticTextFieldType) {
fieldFetchers.put(inferenceField, semanticTextFieldType.valueFetcherWithInferenceResults(bitSetCache, searcher));
fieldFetchers.put(inferenceField, semanticTextFieldType.valueFetcherWithInferenceResults(bitSetCache, searcher, false));
} else {
throw new IllegalArgumentException(
"Invalid inference field [" + ft.name() + "]. Expected field type [semantic_text] but got [" + ft.typeName() + "]"
@@ -89,6 +89,7 @@
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
@@ -104,6 +105,7 @@
import static org.elasticsearch.index.IndexVersions.SEMANTIC_TEXT_DEFAULTS_TO_BBQ_BACKPORT_8_X;
import static org.elasticsearch.inference.TaskType.SPARSE_EMBEDDING;
import static org.elasticsearch.inference.TaskType.TEXT_EMBEDDING;
import static org.elasticsearch.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
import static org.elasticsearch.search.SearchService.DEFAULT_SIZE;
import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.CHUNKED_EMBEDDINGS_FIELD;
import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.CHUNKED_OFFSET_FIELD;
@@ -864,14 +866,26 @@ public Query existsQuery(SearchExecutionContext context) {

@Override
public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
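// A non-null format must be "chunks": it returns the raw text chunks that were indexed,
// reconstructed from the stored offsets, without the inference results.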
if (format != null && "chunks".equals(format) == false) {
throw new IllegalArgumentException(
"Unknow format [" + format + "] for field [" + name() + "], only [chunks] is supported."
);
}
if (format != null) {
return valueFetcherWithInferenceResults(getChunksField().bitsetProducer(), context.searcher(), true);
}
if (useLegacyFormat) {
// Redirect the fetcher to load the original values of the field
return SourceValueFetcher.toString(getOriginalTextFieldName(name()), context, format);
}
return SourceValueFetcher.toString(name(), context, null);
}

ValueFetcher valueFetcherWithInferenceResults(Function<Query, BitSetProducer> bitSetCache, IndexSearcher searcher) {
ValueFetcher valueFetcherWithInferenceResults(
Function<Query, BitSetProducer> bitSetCache,
IndexSearcher searcher,
boolean onlyTextChunks
) {
var embeddingsField = getEmbeddingsField();
if (embeddingsField == null) {
return ValueFetcher.EMPTY;
@@ -884,7 +898,7 @@ ValueFetcher valueFetcherWithInferenceResults(Function<Query, BitSetProducer> bi
org.apache.lucene.search.ScoreMode.COMPLETE_NO_SCORES,
1
);
return new SemanticTextFieldValueFetcher(bitSetFilter, childWeight, embeddingsLoader);
return new SemanticTextFieldValueFetcher(bitSetFilter, childWeight, embeddingsLoader, onlyTextChunks);
} catch (IOException exc) {
throw new UncheckedIOException(exc);
}
@@ -1022,6 +1036,7 @@ private class SemanticTextFieldValueFetcher implements ValueFetcher {
private final BitSetProducer parentBitSetProducer;
private final Weight childWeight;
private final SourceLoader.SyntheticFieldLoader fieldLoader;
private final boolean onlyTextChunks;

private BitSet bitSet;
private Scorer childScorer;
@@ -1031,11 +1046,13 @@ private class SemanticTextFieldValueFetcher implements ValueFetcher {
private SemanticTextFieldValueFetcher(
BitSetProducer bitSetProducer,
Weight childWeight,
SourceLoader.SyntheticFieldLoader fieldLoader
SourceLoader.SyntheticFieldLoader fieldLoader,
boolean onlyTextChunks
) {
this.parentBitSetProducer = bitSetProducer;
this.childWeight = childWeight;
this.fieldLoader = fieldLoader;
this.onlyTextChunks = onlyTextChunks;
}

@Override
@@ -1064,35 +1081,46 @@ public List<Object> fetchValues(Source source, int doc, List<Object> ignoredValu
if (it.docID() < previousParent) {
it.advance(previousParent);
}

return onlyTextChunks ? fetchTextChunks(source, doc, it) : fetchFullChunks(source, doc, it);
}

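/**
 * Fetches only the raw text of each chunk. The chunk text is reconstructed by loading the
 * original field value(s) from source, joining multi-valued fields with {@code MULTIVAL_SEP_CHAR}
 * (matching how they were indexed), and applying the stored chunk offsets.
 */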
private List<Object> fetchTextChunks(Source source, int doc, DocIdSetIterator it) throws IOException {
Map<String, String> originalValueMap = new HashMap<>();
List<Object> chunks = new ArrayList<>();

iterateChildDocs(doc, it, offset -> {
var rawValue = originalValueMap.computeIfAbsent(offset.field(), k -> {
var valueObj = XContentMapValues.extractValue(offset.field(), source.source(), null);
var values = SemanticTextUtils.nodeStringValues(offset.field(), valueObj).stream().toList();
return Strings.collectionToDelimitedString(values, String.valueOf(MULTIVAL_SEP_CHAR));
});

chunks.add(rawValue.substring(offset.start(), offset.end()));
});

return chunks;
}

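/**
 * Fetches the full representation of each chunk (character offsets plus raw embeddings),
 * grouped by source field and wrapped in a single {@link SemanticTextField} value.
 */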
private List<Object> fetchFullChunks(Source source, int doc, DocIdSetIterator it) throws IOException {
Map<String, List<SemanticTextField.Chunk>> chunkMap = new LinkedHashMap<>();
while (it.docID() < doc) {
if (dvLoader == null || dvLoader.advanceToDoc(it.docID()) == false) {
throw new IllegalStateException(
"Cannot fetch values for field [" + name() + "], missing embeddings for doc [" + doc + "]"
);
}
var offset = offsetsLoader.advanceTo(it.docID());
if (offset == null) {
throw new IllegalStateException(
"Cannot fetch values for field [" + name() + "], missing offsets for doc [" + doc + "]"
);
}
var chunks = chunkMap.computeIfAbsent(offset.field(), k -> new ArrayList<>());
chunks.add(

iterateChildDocs(doc, it, offset -> {
var fullChunks = chunkMap.computeIfAbsent(offset.field(), k -> new ArrayList<>());
fullChunks.add(
new SemanticTextField.Chunk(
null,
offset.start(),
offset.end(),
rawEmbeddings(fieldLoader::write, source.sourceContentType())
)
);
if (it.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
}
});

if (chunkMap.isEmpty()) {
return List.of();
}

return List.of(
new SemanticTextField(
useLegacyFormat,
@@ -1104,6 +1132,36 @@ public List<Object> fetchValues(Source source, int doc, List<Object> ignoredValu
);
}

/**
* Iterates over all child documents for the given doc and applies the provided action for each valid offset.
*/
private void iterateChildDocs(
int doc,
DocIdSetIterator it,
CheckedConsumer<OffsetSourceFieldMapper.OffsetSource, IOException> action
) throws IOException {
while (it.docID() < doc) {
if (dvLoader == null || dvLoader.advanceToDoc(it.docID()) == false) {
throw new IllegalStateException(
"Cannot fetch values for field [" + name() + "], missing embeddings for doc [" + doc + "]"
);
}

var offset = offsetsLoader.advanceTo(it.docID());
if (offset == null) {
throw new IllegalStateException(
"Cannot fetch values for field [" + name() + "], missing offsets for doc [" + doc + "]"
);
}

action.accept(offset);

if (it.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
}
}

private BytesReference rawEmbeddings(CheckedConsumer<XContentBuilder, IOException> writer, XContentType xContentType)
throws IOException {
try (var result = XContentFactory.contentBuilder(xContentType)) {
@@ -671,3 +671,47 @@ setup:
- length: { hits.hits.0.highlight.bbq_hnsw_field: 1 }
- match: { hits.hits.0.highlight.bbq_hnsw_field.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }

---
"Retrieve chunks with the fields api":
- requires:
cluster_features: "semantic_text.match_all_highlighter"
reason: semantic text field supports match_all query with semantic highlighter, effective from 8.19 and 9.1.0.

- do:
indices.create:
index: test-index-sparse
body:
settings:
index.mapping.semantic_text.use_legacy_format: false
mappings:
properties:
semantic_text_field:
type: semantic_text
inference_id: sparse-inference-id
text_field:
type: text
copy_to: ["semantic_text_field"]

- do:
index:
index: test-index-sparse
id: doc_1
body:
semantic_text_field: [ "some test data", " ", "now with chunks" ]
text_field: "text field data"
refresh: true

- do:
search:
index: test-index-sparse
body:
query:
match_all: { }
fields: [{"field": "semantic_text_field", "format": "chunks"}]

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.fields.semantic_text_field: 3 }
- match: { hits.hits.0.fields.semantic_text_field.0: "some test data" }
- match: { hits.hits.0.fields.semantic_text_field.1: "now with chunks" }
- match: { hits.hits.0.fields.semantic_text_field.2: "text field data" }