Skip to content

Commit 38310be

Browse files
committed
Add Highlighter for Semantic Text Fields (#118064)
This PR introduces a new highlighter, `semantic`, tailored for semantic text fields. It extracts the most relevant fragments by scoring nested chunks using the original semantic query. In this initial version, the highlighter returns only the original chunks computed during ingestion. However, this is an implementation detail, and future enhancements could combine multiple chunks to generate the fragments.
1 parent 25fd1be commit 38310be

File tree

14 files changed

+1314
-48
lines changed

14 files changed

+1314
-48
lines changed

docs/changelog/118064.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 118064
2+
summary: Add Highlighter for Semantic Text Fields
3+
area: Highlighting
4+
type: feature
5+
issues: []

docs/reference/mapping/types/semantic-text.asciidoc

Lines changed: 24 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -112,50 +112,43 @@ Trying to <<delete-inference-api,delete an {infer} endpoint>> that is used on a
112112
{infer-cap} endpoints have a limit on the amount of text they can process.
113113
To allow for large amounts of text to be used in semantic search, `semantic_text` automatically generates smaller passages if needed, called _chunks_.
114114

115-
Each chunk will include the text subpassage and the corresponding embedding generated from it.
115+
Each chunk refers to a passage of the text and the corresponding embedding generated from it.
116116
When querying, the individual passages will be automatically searched for each document, and the most relevant passage will be used to compute a score.
117117

118118
For more details on chunking and how to configure chunking settings, see <<infer-chunking-config, Configuring chunking>> in the Inference API documentation.
119119

120+
Refer to <<semantic-search-semantic-text,this tutorial>> to learn more about
121+
semantic search using `semantic_text` and the `semantic` query.
120122

121123
[discrete]
122-
[[semantic-text-structure]]
123-
==== `semantic_text` structure
124+
[[semantic-text-highlighting]]
125+
==== Extracting Relevant Fragments from Semantic Text
124126

125-
Once a document is ingested, a `semantic_text` field will have the following structure:
127+
You can extract the most relevant fragments from a semantic text field by using the <<highlighting,highlight parameter>> in the <<search-search-api-request-body,Search API>>.
126128

127-
[source,console-result]
129+
[source,console]
128130
------------------------------------------------------------
129-
"inference_field": {
130-
"text": "these are not the droids you're looking for", <1>
131-
"inference": {
132-
"inference_id": "my-elser-endpoint", <2>
133-
"model_settings": { <3>
134-
"task_type": "sparse_embedding"
131+
POST test-index/_search
132+
{
133+
"query": {
134+
"semantic": {
135+
"field": "my_semantic_field"
136+
}
135137
},
136-
"chunks": [ <4>
137-
{
138-
"text": "these are not the droids you're looking for",
139-
"embeddings": {
140-
(...)
138+
"highlight": {
139+
"fields": {
140+
"my_semantic_field": {
141+
"type": "semantic",
142+
"number_of_fragments": 2, <1>
143+
"order": "score" <2>
144+
}
141145
}
142-
}
143-
]
144-
}
146+
}
145147
}
146148
------------------------------------------------------------
147-
// TEST[skip:TBD]
148-
<1> The field will become an object structure to accommodate both the original
149-
text and the inference results.
150-
<2> The `inference_id` used to generate the embeddings.
151-
<3> Model settings, including the task type and dimensions/similarity if
152-
applicable.
153-
<4> Inference results will be grouped in chunks, each with its corresponding
154-
text and embeddings.
155-
156-
Refer to <<semantic-search-semantic-text,this tutorial>> to learn more about
157-
semantic search using `semantic_text` and the `semantic` query.
158-
149+
// TEST[skip:Requires inference endpoint]
150+
<1> Specifies the maximum number of fragments to return.
151+
<2> Sorts highlighted fragments by score when set to `score`. By default, fragments are output in the order they appear in the field (`order`: `none`).
159152

160153
[discrete]
161154
[[custom-indexing]]

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,14 +31,17 @@ public Set<NodeFeature> getFeatures() {
3131
);
3232
}
3333

34+
private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER = new NodeFeature("semantic_text.highlighter");
35+
3436
@Override
3537
public Set<NodeFeature> getTestFeatures() {
3638
return Set.of(
3739
SemanticTextFieldMapper.SEMANTIC_TEXT_IN_OBJECT_FIELD_FIX,
3840
SemanticTextFieldMapper.SEMANTIC_TEXT_SINGLE_FIELD_UPDATE_FIX,
3941
SemanticTextFieldMapper.SEMANTIC_TEXT_DELETE_FIX,
4042
SemanticTextFieldMapper.SEMANTIC_TEXT_ZERO_SIZE_FIX,
41-
SemanticTextFieldMapper.SEMANTIC_TEXT_ALWAYS_EMIT_INFERENCE_ID_FIX
43+
SemanticTextFieldMapper.SEMANTIC_TEXT_ALWAYS_EMIT_INFERENCE_ID_FIX,
44+
SEMANTIC_TEXT_HIGHLIGHTER
4245
);
4346
}
4447
}

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import org.elasticsearch.plugins.SystemIndexPlugin;
3636
import org.elasticsearch.rest.RestController;
3737
import org.elasticsearch.rest.RestHandler;
38+
import org.elasticsearch.search.fetch.subphase.highlight.Highlighter;
3839
import org.elasticsearch.search.rank.RankBuilder;
3940
import org.elasticsearch.search.rank.RankDoc;
4041
import org.elasticsearch.threadpool.ExecutorBuilder;
@@ -65,6 +66,7 @@
6566
import org.elasticsearch.xpack.inference.external.http.retry.RetrySettings;
6667
import org.elasticsearch.xpack.inference.external.http.sender.HttpRequestSender;
6768
import org.elasticsearch.xpack.inference.external.http.sender.RequestExecutorServiceSettings;
69+
import org.elasticsearch.xpack.inference.highlight.SemanticTextHighlighter;
6870
import org.elasticsearch.xpack.inference.logging.ThrottlerManager;
6971
import org.elasticsearch.xpack.inference.mapper.OffsetSourceFieldMapper;
7072
import org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper;
@@ -390,4 +392,9 @@ public List<RetrieverSpec<?>> getRetrievers() {
390392
new RetrieverSpec<>(new ParseField(RandomRankBuilder.NAME), RandomRankRetrieverBuilder::fromXContent)
391393
);
392394
}
395+
396+
@Override
397+
public Map<String, Highlighter> getHighlighters() {
398+
return Map.of(SemanticTextHighlighter.NAME, new SemanticTextHighlighter());
399+
}
393400
}
Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

package org.elasticsearch.xpack.inference.highlight;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.KnnByteVectorQuery;
import org.apache.lucene.search.KnnFloatVectorQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.DenseVectorFieldType;
import org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper.SparseVectorFieldType;
import org.elasticsearch.index.query.SearchExecutionContext;
import org.elasticsearch.search.fetch.subphase.highlight.FieldHighlightContext;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
import org.elasticsearch.search.fetch.subphase.highlight.Highlighter;
import org.elasticsearch.search.vectors.VectorData;
import org.elasticsearch.xpack.core.ml.search.SparseVectorQueryWrapper;
import org.elasticsearch.xpack.inference.mapper.SemanticTextField;
import org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Map;

/**
 * A {@link Highlighter} designed for the {@link SemanticTextFieldMapper}.
 * This highlighter extracts semantic queries and evaluates them against each chunk produced by the semantic text field.
 * It returns the top-scoring chunks as snippets, optionally sorted by their scores.
 */
public class SemanticTextHighlighter implements Highlighter {
    public static final String NAME = "semantic";

    /**
     * Position of a chunk within the field's nested chunks array, paired with the score
     * its embedding obtained against the extracted semantic query.
     */
    private record OffsetAndScore(int offset, float score) {}

    @Override
    public boolean canHighlight(MappedFieldType fieldType) {
        return fieldType instanceof SemanticTextFieldMapper.SemanticTextFieldType;
    }

    /**
     * Highlights a semantic text field by re-scoring its ingested chunks against the semantic
     * part of the search query and returning the best chunks verbatim as fragments.
     *
     * @param fieldContext the per-field highlighting context (field options, query, hit)
     * @return the highlighted fragments, or {@code null} when there is nothing to highlight
     * @throws IOException if reading the index fails
     * @throws IllegalStateException if the stored chunks are inconsistent with the scored offsets
     */
    @Override
    public HighlightField highlight(FieldHighlightContext fieldContext) throws IOException {
        SemanticTextFieldMapper.SemanticTextFieldType fieldType = (SemanticTextFieldMapper.SemanticTextFieldType) fieldContext.fieldType;
        if (fieldType.getEmbeddingsField() == null) {
            // nothing indexed yet
            return null;
        }

        // Pull out the leaf vector queries that target this field's embeddings, based on the model's task type.
        final List<Query> queries = switch (fieldType.getModelSettings().taskType()) {
            case SPARSE_EMBEDDING -> extractSparseVectorQueries(
                (SparseVectorFieldType) fieldType.getEmbeddingsField().fieldType(),
                fieldContext.query
            );
            case TEXT_EMBEDDING -> extractDenseVectorQueries(
                (DenseVectorFieldType) fieldType.getEmbeddingsField().fieldType(),
                fieldContext.query
            );
            default -> throw new IllegalStateException(
                "Wrong task type for a semantic text field, got [" + fieldType.getModelSettings().taskType().name() + "]"
            );
        };
        if (queries.isEmpty()) {
            // nothing to highlight
            return null;
        }

        int numberOfFragments = fieldContext.field.fieldOptions().numberOfFragments() <= 0
            ? 1 // we return the best fragment by default
            : fieldContext.field.fieldOptions().numberOfFragments();

        List<OffsetAndScore> chunks = extractOffsetAndScores(
            fieldContext.context.getSearchExecutionContext(),
            fieldContext.hitContext.reader(),
            fieldType,
            fieldContext.hitContext.docId(),
            queries
        );
        if (chunks.isEmpty()) {
            return null;
        }

        chunks.sort(Comparator.comparingDouble(OffsetAndScore::score).reversed());
        int size = Math.min(chunks.size(), numberOfFragments);
        if (fieldContext.field.fieldOptions().scoreOrdered() == false) {
            // keep only the best chunks, then restore their original order of appearance in the field
            chunks = chunks.subList(0, size);
            chunks.sort(Comparator.comparingInt(c -> c.offset));
        }
        Text[] snippets = new Text[size];
        List<Map<?, ?>> nestedSources = XContentMapValues.extractNestedSources(
            fieldType.getChunksField().fullPath(),
            fieldContext.hitContext.source().source()
        );
        for (int i = 0; i < size; i++) {
            var chunk = chunks.get(i);
            if (nestedSources.size() <= chunk.offset) {
                throw new IllegalStateException(
                    String.format(
                        Locale.ROOT,
                        "Invalid content detected for field [%s]: the chunks size is [%d], "
                            + "but a reference to offset [%d] was found in the result.",
                        fieldType.name(),
                        nestedSources.size(),
                        chunk.offset
                    )
                );
            }
            String content = (String) nestedSources.get(chunk.offset).get(SemanticTextField.CHUNKED_TEXT_FIELD);
            if (content == null) {
                throw new IllegalStateException(
                    String.format(
                        Locale.ROOT,
                        "Invalid content detected for field [%s]: missing text for the chunk at offset [%d].",
                        fieldType.name(),
                        chunk.offset
                    )
                );
            }
            snippets[i] = new Text(content);
        }
        return new HighlightField(fieldContext.fieldName, snippets);
    }

    /**
     * Scores each nested chunk of {@code docId} against the provided leaf queries and returns
     * one {@link OffsetAndScore} per matching chunk, in chunk order.
     */
    private List<OffsetAndScore> extractOffsetAndScores(
        SearchExecutionContext context,
        LeafReader reader,
        SemanticTextFieldMapper.SemanticTextFieldType fieldType,
        int docId,
        List<Query> leafQueries
    ) throws IOException {
        // Nested (child) chunk docs for a parent doc lie strictly between the previous parent and the parent itself.
        var bitSet = context.bitsetFilter(fieldType.getChunksField().parentTypeFilter()).getBitSet(reader.getContext());
        int previousParent = docId > 0 ? bitSet.prevSetBit(docId - 1) : -1;

        BooleanQuery.Builder bq = new BooleanQuery.Builder().add(fieldType.getChunksField().nestedTypeFilter(), BooleanClause.Occur.FILTER);
        leafQueries.forEach(q -> bq.add(q, BooleanClause.Occur.SHOULD));
        Weight weight = new IndexSearcher(reader).createWeight(bq.build(), ScoreMode.COMPLETE, 1);
        Scorer scorer = weight.scorer(reader.getContext());
        if (scorer == null) {
            // no chunk in this segment matches the query
            return List.of();
        }
        if (previousParent != -1) {
            if (scorer.iterator().advance(previousParent) == DocIdSetIterator.NO_MORE_DOCS) {
                return List.of();
            }
        } else if (scorer.iterator().nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
            return List.of();
        }
        List<OffsetAndScore> results = new ArrayList<>();
        int offset = 0;
        // Consume all child docs that precede the parent; each corresponds to one chunk, in order.
        while (scorer.docID() < docId) {
            results.add(new OffsetAndScore(offset++, scorer.score()));
            if (scorer.iterator().nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                break;
            }
        }
        return results;
    }

    /**
     * Collects exact-kNn rewrites of the dense vector queries in {@code querySection}
     * that target the given embeddings field.
     */
    private List<Query> extractDenseVectorQueries(DenseVectorFieldType fieldType, Query querySection) {
        // TODO: Handle knn section when semantic text field can be used.
        List<Query> queries = new ArrayList<>();
        querySection.visit(new QueryVisitor() {
            @Override
            public boolean acceptField(String field) {
                return fieldType.name().equals(field);
            }

            @Override
            public void visitLeaf(Query query) {
                if (query instanceof KnnFloatVectorQuery knnQuery) {
                    queries.add(fieldType.createExactKnnQuery(VectorData.fromFloats(knnQuery.getTargetCopy()), null));
                } else if (query instanceof KnnByteVectorQuery knnQuery) {
                    queries.add(fieldType.createExactKnnQuery(VectorData.fromBytes(knnQuery.getTargetCopy()), null));
                }
            }
        });
        return queries;
    }

    /**
     * Collects the term queries wrapped by {@link SparseVectorQueryWrapper}s in
     * {@code querySection} that target the given embeddings field.
     */
    private List<Query> extractSparseVectorQueries(SparseVectorFieldType fieldType, Query querySection) {
        List<Query> queries = new ArrayList<>();
        querySection.visit(new QueryVisitor() {
            @Override
            public boolean acceptField(String field) {
                return fieldType.name().equals(field);
            }

            @Override
            public QueryVisitor getSubVisitor(BooleanClause.Occur occur, Query parent) {
                if (parent instanceof SparseVectorQueryWrapper sparseVectorQuery) {
                    queries.add(sparseVectorQuery.getTermsQuery());
                }
                return this;
            }
        });
        return queries;
    }
}

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextField.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ public record SemanticTextField(String fieldName, List<String> originalValues, I
6161
static final String SEARCH_INFERENCE_ID_FIELD = "search_inference_id";
6262
static final String CHUNKS_FIELD = "chunks";
6363
static final String CHUNKED_EMBEDDINGS_FIELD = "embeddings";
64-
static final String CHUNKED_TEXT_FIELD = "text";
64+
public static final String CHUNKED_TEXT_FIELD = "text";
6565
static final String MODEL_SETTINGS_FIELD = "model_settings";
6666
static final String TASK_TYPE_FIELD = "task_type";
6767
static final String DIMENSIONS_FIELD = "dimensions";

0 commit comments

Comments
 (0)