Skip to content

Commit d9dc935

Browse files
authored
[9.1] fix(semantic highlighter): add vector similarity queries and bbq_disk support (elastic#138140) (elastic#138555)
1 parent 1704e95 commit d9dc935

File tree

9 files changed

+194
-10
lines changed

9 files changed

+194
-10
lines changed

docs/changelog/138140.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 138140
2+
summary: "Fix semantic highlighting when using a `knn` query with minimum `similarity`"
3+
area: Relevance
4+
type: bug
5+
issues: []

server/src/main/java/org/elasticsearch/search/vectors/IVFKnnFloatVectorQuery.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ public IVFKnnFloatVectorQuery(String field, float[] query, int k, int numCands,
3939
this.query = query;
4040
}
4141

42+
public float[] getQuery() {
43+
return query;
44+
}
45+
4246
@Override
4347
public String toString(String field) {
4448
StringBuilder buffer = new StringBuilder();

server/src/main/java/org/elasticsearch/search/vectors/VectorSimilarityQuery.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,11 @@ public VectorSimilarityQuery(Query innerKnnQuery, float similarity, float docSco
4848
this.innerKnnQuery = innerKnnQuery;
4949
}
5050

51-
// For testing
52-
Query getInnerKnnQuery() {
51+
public Query getInnerKnnQuery() {
5352
return innerKnnQuery;
5453
}
5554

56-
float getSimilarity() {
55+
public float getSimilarity() {
5756
return similarity;
5857
}
5958

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ public class InferenceFeatures implements FeatureSpecification {
3131

3232
private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER = new NodeFeature("semantic_text.highlighter");
3333
private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER_DEFAULT = new NodeFeature("semantic_text.highlighter.default");
34+
private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER_VECTOR_SIMILARITY_SUPPORT = new NodeFeature(
35+
"semantic_text.highlighter.vector_similarity_support"
36+
);
3437
private static final NodeFeature TEST_RERANKING_SERVICE_PARSE_TEXT_AS_SCORE = new NodeFeature(
3538
"test_reranking_service.parse_text_as_score"
3639
);
@@ -74,6 +77,7 @@ public Set<NodeFeature> getTestFeatures() {
7477
COHERE_V2_API,
7578
SEMANTIC_QUERY_REWRITE_INTERCEPTORS_PROPAGATE_BOOST_AND_QUERY_NAME_FIX,
7679
SEMANTIC_TEXT_HIGHLIGHTING_FLAT,
80+
SEMANTIC_TEXT_HIGHLIGHTER_VECTOR_SIMILARITY_SUPPORT,
7781
SemanticQueryBuilder.SEMANTIC_QUERY_FILTER_FIELD_CAPS_FIX
7882
);
7983
}

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighter.java

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,10 @@
3333
import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
3434
import org.elasticsearch.search.fetch.subphase.highlight.Highlighter;
3535
import org.elasticsearch.search.vectors.DenseVectorQuery;
36+
import org.elasticsearch.search.vectors.RescoreKnnVectorQuery;
3637
import org.elasticsearch.search.vectors.SparseVectorQueryWrapper;
3738
import org.elasticsearch.search.vectors.VectorData;
39+
import org.elasticsearch.search.vectors.VectorSimilarityQuery;
3840
import org.elasticsearch.xcontent.Text;
3941
import org.elasticsearch.xpack.inference.mapper.OffsetSourceField;
4042
import org.elasticsearch.xpack.inference.mapper.OffsetSourceFieldMapper;
@@ -266,18 +268,26 @@ public void consumeTerms(Query query, Term... terms) {
266268
super.consumeTerms(query, terms);
267269
}
268270

269-
@Override
270-
public void visitLeaf(Query query) {
271+
private void visitLeaf(Query query, Float similarity) {
271272
if (query instanceof KnnFloatVectorQuery knnQuery) {
272-
queries.add(fieldType.createExactKnnQuery(VectorData.fromFloats(knnQuery.getTargetCopy()), null));
273+
queries.add(fieldType.createExactKnnQuery(VectorData.fromFloats(knnQuery.getTargetCopy()), similarity));
273274
} else if (query instanceof KnnByteVectorQuery knnQuery) {
274-
queries.add(fieldType.createExactKnnQuery(VectorData.fromBytes(knnQuery.getTargetCopy()), null));
275+
queries.add(fieldType.createExactKnnQuery(VectorData.fromBytes(knnQuery.getTargetCopy()), similarity));
275276
} else if (query instanceof MatchAllDocsQuery) {
276277
queries.add(new MatchAllDocsQuery());
277278
} else if (query instanceof DenseVectorQuery.Floats floatsQuery) {
278-
queries.add(fieldType.createExactKnnQuery(VectorData.fromFloats(floatsQuery.getQuery()), null));
279+
queries.add(fieldType.createExactKnnQuery(VectorData.fromFloats(floatsQuery.getQuery()), similarity));
280+
} else if (query instanceof RescoreKnnVectorQuery rescoreQuery) {
281+
visitLeaf(rescoreQuery.innerQuery(), similarity);
282+
} else if (query instanceof VectorSimilarityQuery similarityQuery) {
283+
visitLeaf(similarityQuery.getInnerKnnQuery(), similarityQuery.getSimilarity());
279284
}
280285
}
286+
287+
@Override
288+
public void visitLeaf(Query query) {
289+
visitLeaf(query, null);
290+
}
281291
});
282292
return queries;
283293
}

x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,30 @@ public void testNoSemanticField() throws Exception {
200200
);
201201
}
202202

203+
@SuppressWarnings("unchecked")
204+
public void testDenseVectorWithSimilarityThreshold() throws Exception {
205+
var mapperService = createDefaultMapperService(useLegacyFormat);
206+
Map<String, Object> queryMap = (Map<String, Object>) queries.get("dense_vector_1");
207+
float[] vector = readDenseVector(queryMap.get("embeddings"));
208+
var fieldType = (SemanticTextFieldMapper.SemanticTextFieldType) mapperService.mappingLookup().getFieldType(SEMANTIC_FIELD_E5);
209+
210+
KnnVectorQueryBuilder knnQuery = new KnnVectorQueryBuilder(fieldType.getEmbeddingsField().fullPath(), vector, 10, 10, null, 0.85f);
211+
NestedQueryBuilder nestedQueryBuilder = new NestedQueryBuilder(fieldType.getChunksField().fullPath(), knnQuery, ScoreMode.Max);
212+
var shardRequest = createShardSearchRequest(nestedQueryBuilder);
213+
var sourceToParse = new SourceToParse("0", readSampleDoc(useLegacyFormat), XContentType.JSON);
214+
215+
String[] expectedPassages = ((List<String>) queryMap.get("expected_with_similarity_threshold")).toArray(String[]::new);
216+
assertHighlightOneDoc(
217+
mapperService,
218+
shardRequest,
219+
sourceToParse,
220+
SEMANTIC_FIELD_E5,
221+
expectedPassages.length,
222+
HighlightBuilder.Order.SCORE,
223+
expectedPassages
224+
);
225+
}
226+
203227
private MapperService createDefaultMapperService(boolean useLegacyFormat) throws IOException {
204228
var mappings = Streams.readFully(SemanticTextHighlighterTests.class.getResourceAsStream("mappings.json"));
205229
var settings = Settings.builder()

x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/queries.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,9 @@
399399
"After the marshland between the river Seine and its slower 'dead arm' to its north was filled in from around the 10th century, Paris's cultural centre began to move to the Right Bank. In 1137, a new city marketplace (today's Les Halles) replaced the two smaller ones on the Île de la Cité and Place de Grève (Place de l'Hôtel de Ville). The latter location housed the headquarters of Paris's river trade corporation, an organisation that later became, unofficially (although formally in later years), Paris's first municipal government.\n\n\nIn the late 12th century, Philip Augustus extended the Louvre fortress to defend the city against river invasions from the west, gave the city its first walls between 1190 and 1215, rebuilt its bridges to either side of its central island, and paved its main thoroughfares. In 1190, he transformed Paris's former cathedral school into a student-teacher corporation that would become the University of Paris and would draw students from all of Europe.\n\n\nWith 200,000 inhabitants in 1328, Paris, then already the capital of France, was the most populous city of Europe. By comparison, London in 1300 had 80,000 inhabitants. By the early fourteenth century, so much filth had collected inside urban Europe that French and Italian cities were naming streets after human waste. In medieval Paris, several street names were inspired by merde, the French word for \"shit\".\n\n\n",
400400
"In March 2001, Bertrand Delanoë became the first socialist mayor. He was re-elected in March 2008. In 2007, in an effort to reduce car traffic, he introduced the Vélib', a system which rents bicycles. Bertrand Delanoë also transformed a section of the highway along the Left Bank of the Seine into an urban promenade and park, the Promenade des Berges de la Seine, which he inaugurated in June 2013.\n\n\nIn 2007, President Nicolas Sarkozy launched the Grand Paris project, to integrate Paris more closely with the towns in the region around it. After many modifications, the new area, named the Metropolis of Grand Paris, with a population of 6.7 million, was created on 1 January 2016. In 2011, the City of Paris and the national government approved the plans for the Grand Paris Express, totalling 205 km (127 mi) of automated metro lines to connect Paris, the innermost three departments around Paris, airports and high-speed rail (TGV) stations, at an estimated cost of €35 billion. The system is scheduled to be completed by 2030.\n\n\nIn January 2015, Al-Qaeda in the Arabian Peninsula claimed attacks across the Paris region. 1.5 million people marched in Paris in a show of solidarity against terrorism and in support of freedom of speech. In November of the same year, terrorist attacks, claimed by ISIL, killed 130 people and injured more than 350.\n\n\n",
401401
"Bal-musette is a style of French music and dance that first became popular in Paris in the 1870s and 1880s; by 1880 Paris had some 150 dance halls. Patrons danced the bourrée to the accompaniment of the cabrette (a bellows-blown bagpipe locally called a \"musette\") and often the vielle à roue (hurdy-gurdy) in the cafés and bars of the city. Parisian and Italian musicians who played the accordion adopted the style and established themselves in Auvergnat bars, and Paris became a major centre for jazz and still attracts jazz musicians from all around the world to its clubs and cafés.\n\n\nParis is the spiritual home of gypsy jazz in particular, and many of the Parisian jazzmen who developed in the first half of the 20th century began by playing Bal-musette in the city. Django Reinhardt rose to fame in Paris, having moved to the 18th arrondissement in a caravan as a young boy, and performed with violinist Stéphane Grappelli and their Quintette du Hot Club de France in the 1930s and 1940s.\n\n\nImmediately after the War the Saint-Germain-des-Pres quarter and the nearby Saint-Michel quarter became home to many small jazz clubs, including the Caveau des Lorientais, the Club Saint-Germain, the Rose Rouge, the Vieux-Colombier, and the most famous, Le Tabou. They introduced Parisians to the music of Claude Luter, Boris Vian, Sydney Bechet, Mezz Mezzrow, and Henri Salvador. "
402+
],
403+
"expected_with_similarity_threshold": [
404+
"\nParis (.mw-parser-output .IPA-label-small{font-size:85%}.mw-parser-output .references .IPA-label-small,.mw-parser-output .infobox .IPA-label-small,.mw-parser-output .navbox .IPA-label-small{font-size:100%}French pronunciation: ⓘ) is the capital and largest city of France. With an estimated population of 2,102,650 residents in January 2023 in an area of more than 105 km2 (41 sq mi), Paris is the fourth-largest city in the European Union and the 30th most densely populated city in the world in 2022. Since the 17th century, Paris has been one of the world's major centres of finance, diplomacy, commerce, culture, fashion, and gastronomy. Because of its leading role in the arts and sciences and its early adaptation of extensive street lighting, it became known as the City of Light in the 19th century.\n\n\nThe City of Paris is the centre of the Île-de-France region, or Paris Region, with an official estimated population of 12,271,794 inhabitants in January 2023, or about 19% of the population of France. The Paris Region had a nominal GDP of €765 billion (US$1.064 trillion when adjusted for PPP) in 2021, the highest in the European Union. According to the Economist Intelligence Unit Worldwide Cost of Living Survey, in 2022, Paris was the city with the ninth-highest cost of living in the world.\n\n\n"
402405
]
403406
},
404407
"sparse_vector_1": {
@@ -464,4 +467,4 @@
464467
"Diderot and D'Alembert published their Encyclopédie in 1751, before the Montgolfier Brothers launched the first manned flight in a hot air balloon on 21 November 1783. Paris was the financial capital of continental Europe, as well the primary European centre for book publishing, fashion and the manufacture of fine furniture and luxury goods. On 22 October 1797, Paris was also the site of the first parachute jump in history, by Garnerin.\n\n\nIn the summer of 1789, Paris became the centre stage of the French Revolution. On 14 July, a mob seized the arsenal at the Invalides, acquiring thousands of guns, with which it stormed the Bastille, a principal symbol of royal authority. The first independent Paris Commune, or city council, met in the Hôtel de Ville and elected a Mayor, the astronomer Jean Sylvain Bailly, on 15 July.\n\n\nLouis XVI and the royal family were brought to Paris and incarcerated in the Tuileries Palace. In 1793, as the revolution turned increasingly radical, the king, queen and mayor were beheaded by guillotine in the Reign of Terror, along with more than 16,000 others throughout France. The property of the aristocracy and the church was nationalised, and the city's churches were closed, sold or demolished. A succession of revolutionary factions ruled Paris until 9 November 1799 (coup d'état du 18 brumaire), when Napoleon Bonaparte seized power as First Consul.\n\n\n"
465468
]
466469
}
467-
}
470+
}

x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,6 @@ setup:
9898
title: "Elasticsearch"
9999
body: [ "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!" ]
100100
refresh: true
101-
102101
---
103102
"Highlighting empty field":
104103
- do:
@@ -671,3 +670,72 @@ setup:
671670
- length: { hits.hits.0.highlight.bbq_hnsw_field: 1 }
672671
- match: { hits.hits.0.highlight.bbq_hnsw_field.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
673672

673+
---
674+
"Highlighting with knn with similarity":
675+
- requires:
676+
cluster_features: "semantic_text.highlighter.vector_similarity_support"
677+
reason: semantic highlighter fix for knn with similarity
678+
679+
- do:
680+
index:
681+
index: test-dense-index
682+
id: doc_1
683+
body:
684+
body: [ "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!", "For a moment, nothing happened. Then, after a second or so, nothing continued to happen." ]
685+
- do:
686+
index:
687+
index: test-dense-index
688+
id: doc_2
689+
body:
690+
body: [ "Nothing travels faster than the speed of light with the possible exception of bad news, which obeys its own special laws."]
691+
refresh: true
692+
693+
- do:
694+
search:
695+
index: test-dense-index
696+
body:
697+
query:
698+
match_all: { }
699+
highlight:
700+
fields:
701+
body:
702+
type: "semantic"
703+
number_of_fragments: 1
704+
705+
- match: { hits.total.value: 2 }
706+
707+
- match: { hits.hits.0._id: "doc_1" }
708+
- length: { hits.hits.0.highlight: 1 }
709+
- length: { hits.hits.0.highlight.body: 1 }
710+
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
711+
712+
- match: { hits.hits.1._id: "doc_2" }
713+
- length: { hits.hits.1.highlight: 1 }
714+
- length: { hits.hits.1.highlight.body: 1 }
715+
- match: { hits.hits.1.highlight.body.0: "Nothing travels faster than the speed of light with the possible exception of bad news, which obeys its own special laws." }
716+
717+
- do:
718+
search:
719+
index: test-dense-index
720+
body:
721+
query:
722+
knn:
723+
field: "body"
724+
query_vector_builder:
725+
text_embedding:
726+
model_text: "What is Elasticsearch?"
727+
k: 10
728+
num_candidates: 10
729+
similarity: 0.9977
730+
highlight:
731+
fields:
732+
body:
733+
type: "semantic"
734+
number_of_fragments: 3
735+
736+
- match: { hits.total.value: 1 }
737+
- match: { hits.hits.0._id: "doc_1" }
738+
- length: { hits.hits.0.highlight.body: 3 }
739+
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
740+
- match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }
741+
- match: { hits.hits.0.highlight.body.2: "For a moment, nothing happened. Then, after a second or so, nothing continued to happen."}

x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter_bwc.yml

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -649,5 +649,72 @@ setup:
649649
- length: { hits.hits.0.highlight.bbq_hnsw_field: 1 }
650650
- match: { hits.hits.0.highlight.bbq_hnsw_field.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
651651

652+
---
653+
"Highlighting with knn with similarity":
654+
- requires:
655+
cluster_features: "semantic_text.highlighter.vector_similarity_support"
656+
reason: semantic highlighter fix for knn with similarity
657+
658+
- do:
659+
index:
660+
index: test-dense-index
661+
id: doc_1
662+
body:
663+
body: [ "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!", "For a moment, nothing happened. Then, after a second or so, nothing continued to happen." ]
664+
- do:
665+
index:
666+
index: test-dense-index
667+
id: doc_2
668+
body:
669+
body: [ "Nothing travels faster than the speed of light with the possible exception of bad news, which obeys its own special laws."]
670+
refresh: true
671+
672+
- do:
673+
search:
674+
index: test-dense-index
675+
body:
676+
query:
677+
match_all: { }
678+
highlight:
679+
fields:
680+
body:
681+
type: "semantic"
682+
number_of_fragments: 1
683+
684+
- match: { hits.total.value: 2 }
685+
686+
- match: { hits.hits.0._id: "doc_1" }
687+
- length: { hits.hits.0.highlight: 1 }
688+
- length: { hits.hits.0.highlight.body: 1 }
689+
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
652690

691+
- match: { hits.hits.1._id: "doc_2" }
692+
- length: { hits.hits.1.highlight: 1 }
693+
- length: { hits.hits.1.highlight.body: 1 }
694+
- match: { hits.hits.1.highlight.body.0: "Nothing travels faster than the speed of light with the possible exception of bad news, which obeys its own special laws." }
653695

696+
- do:
697+
search:
698+
index: test-dense-index
699+
body:
700+
query:
701+
knn:
702+
field: "body"
703+
query_vector_builder:
704+
text_embedding:
705+
model_text: "What is Elasticsearch?"
706+
k: 10
707+
num_candidates: 10
708+
similarity: 0.9977
709+
highlight:
710+
fields:
711+
body:
712+
type: "semantic"
713+
number_of_fragments: 3
714+
715+
- match: { hits.total.value: 1 }
716+
- match: { hits.hits.0._id: "doc_1" }
717+
- length: { hits.hits.0.highlight.body: 3 }
718+
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
719+
- match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }
720+
- match: { hits.hits.0.highlight.body.2: "For a moment, nothing happened. Then, after a second or so, nothing continued to happen."}

0 commit comments

Comments
 (0)