Skip to content

Commit ec3c1f7

Browse files
authored
fix(semantic highlighter): add vector similarity queries and bbq_disk support (#138140) (#138521)
1 parent 79d0684 commit ec3c1f7

File tree

12 files changed

+472
-12
lines changed

12 files changed

+472
-12
lines changed

docs/changelog/138140.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 138140
2+
summary: "Fix semantic highlighting when using a `knn` query with minimum `similarity` and when using `bbq_disk`"
3+
area: Relevance
4+
type: bug
5+
issues: []

server/src/main/java/org/elasticsearch/search/vectors/IVFKnnFloatVectorQuery.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ public IVFKnnFloatVectorQuery(String field, float[] query, int k, int numCands,
3737
this.query = query;
3838
}
3939

40+
public float[] getQuery() {
41+
return query;
42+
}
43+
4044
@Override
4145
public String toString(String field) {
4246
StringBuilder buffer = new StringBuilder();

server/src/main/java/org/elasticsearch/search/vectors/VectorSimilarityQuery.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,11 @@ public VectorSimilarityQuery(Query innerKnnQuery, float similarity, float docSco
4848
this.innerKnnQuery = innerKnnQuery;
4949
}
5050

51-
// For testing
52-
Query getInnerKnnQuery() {
51+
public Query getInnerKnnQuery() {
5352
return innerKnnQuery;
5453
}
5554

56-
float getSimilarity() {
55+
public float getSimilarity() {
5756
return similarity;
5857
}
5958

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ public class InferenceFeatures implements FeatureSpecification {
3737

3838
private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER = new NodeFeature("semantic_text.highlighter");
3939
private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER_DEFAULT = new NodeFeature("semantic_text.highlighter.default");
40+
private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER_DISKBBQ_SIMILARITY_SUPPORT = new NodeFeature(
41+
"semantic_text.highlighter.bbq_and_similarity_support"
42+
);
4043
private static final NodeFeature TEST_RERANKING_SERVICE_PARSE_TEXT_AS_SCORE = new NodeFeature(
4144
"test_reranking_service.parse_text_as_score"
4245
);
@@ -93,6 +96,7 @@ public Set<NodeFeature> getTestFeatures() {
9396
SEMANTIC_TEXT_HIGHLIGHTING_FLAT,
9497
SEMANTIC_TEXT_SPARSE_VECTOR_INDEX_OPTIONS,
9598
SEMANTIC_TEXT_FIELDS_CHUNKS_FORMAT,
99+
SEMANTIC_TEXT_HIGHLIGHTER_DISKBBQ_SIMILARITY_SUPPORT,
96100
SemanticQueryBuilder.SEMANTIC_QUERY_MULTIPLE_INFERENCE_IDS,
97101
SemanticQueryBuilder.SEMANTIC_QUERY_FILTER_FIELD_CAPS_FIX,
98102
InterceptedInferenceQueryBuilder.NEW_SEMANTIC_QUERY_INTERCEPTORS,

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighter.java

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,11 @@
3333
import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
3434
import org.elasticsearch.search.fetch.subphase.highlight.Highlighter;
3535
import org.elasticsearch.search.vectors.DenseVectorQuery;
36+
import org.elasticsearch.search.vectors.IVFKnnFloatVectorQuery;
37+
import org.elasticsearch.search.vectors.RescoreKnnVectorQuery;
3638
import org.elasticsearch.search.vectors.SparseVectorQueryWrapper;
3739
import org.elasticsearch.search.vectors.VectorData;
40+
import org.elasticsearch.search.vectors.VectorSimilarityQuery;
3841
import org.elasticsearch.xcontent.Text;
3942
import org.elasticsearch.xpack.inference.mapper.OffsetSourceField;
4043
import org.elasticsearch.xpack.inference.mapper.OffsetSourceFieldMapper;
@@ -266,18 +269,28 @@ public void consumeTerms(Query query, Term... terms) {
266269
super.consumeTerms(query, terms);
267270
}
268271

269-
@Override
270-
public void visitLeaf(Query query) {
272+
private void visitLeaf(Query query, Float similarity) {
271273
if (query instanceof KnnFloatVectorQuery knnQuery) {
272-
queries.add(fieldType.createExactKnnQuery(VectorData.fromFloats(knnQuery.getTargetCopy()), null));
274+
queries.add(fieldType.createExactKnnQuery(VectorData.fromFloats(knnQuery.getTargetCopy()), similarity));
273275
} else if (query instanceof KnnByteVectorQuery knnQuery) {
274-
queries.add(fieldType.createExactKnnQuery(VectorData.fromBytes(knnQuery.getTargetCopy()), null));
276+
queries.add(fieldType.createExactKnnQuery(VectorData.fromBytes(knnQuery.getTargetCopy()), similarity));
275277
} else if (query instanceof MatchAllDocsQuery) {
276278
queries.add(new MatchAllDocsQuery());
277279
} else if (query instanceof DenseVectorQuery.Floats floatsQuery) {
278-
queries.add(fieldType.createExactKnnQuery(VectorData.fromFloats(floatsQuery.getQuery()), null));
280+
queries.add(fieldType.createExactKnnQuery(VectorData.fromFloats(floatsQuery.getQuery()), similarity));
281+
} else if (query instanceof IVFKnnFloatVectorQuery ivfQuery) {
282+
queries.add(fieldType.createExactKnnQuery(VectorData.fromFloats(ivfQuery.getQuery()), similarity));
283+
} else if (query instanceof RescoreKnnVectorQuery rescoreQuery) {
284+
visitLeaf(rescoreQuery.innerQuery(), similarity);
285+
} else if (query instanceof VectorSimilarityQuery similarityQuery) {
286+
visitLeaf(similarityQuery.getInnerKnnQuery(), similarityQuery.getSimilarity());
279287
}
280288
}
289+
290+
@Override
291+
public void visitLeaf(Query query) {
292+
visitLeaf(query, null);
293+
}
281294
});
282295
return queries;
283296
}

x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272

7373
public class SemanticTextHighlighterTests extends MapperServiceTestCase {
7474
private static final String SEMANTIC_FIELD_E5 = "body-e5";
75+
private static final String SEMANTIC_FIELD_E5_DISK_BBQ = "body-e5-disk_bbq";
7576
private static final String SEMANTIC_FIELD_ELSER = "body-elser";
7677

7778
private final boolean useLegacyFormat;
@@ -208,6 +209,117 @@ public void testNoSemanticField() throws Exception {
208209
);
209210
}
210211

212+
@SuppressWarnings("unchecked")
213+
public void testDenseVectorWithSimilarityThreshold() throws Exception {
214+
var mapperService = createDefaultMapperService(useLegacyFormat);
215+
Map<String, Object> queryMap = (Map<String, Object>) queries.get("dense_vector_1");
216+
float[] vector = readDenseVector(queryMap.get("embeddings"));
217+
var fieldType = (SemanticTextFieldMapper.SemanticTextFieldType) mapperService.mappingLookup().getFieldType(SEMANTIC_FIELD_E5);
218+
219+
KnnVectorQueryBuilder knnQuery = new KnnVectorQueryBuilder(
220+
fieldType.getEmbeddingsField().fullPath(),
221+
vector,
222+
10,
223+
10,
224+
10f,
225+
null,
226+
0.85f
227+
);
228+
NestedQueryBuilder nestedQueryBuilder = new NestedQueryBuilder(fieldType.getChunksField().fullPath(), knnQuery, ScoreMode.Max);
229+
var shardRequest = createShardSearchRequest(nestedQueryBuilder);
230+
var sourceToParse = new SourceToParse("0", readSampleDoc(useLegacyFormat), XContentType.JSON);
231+
232+
String[] expectedPassages = ((List<String>) queryMap.get("expected_with_similarity_threshold")).toArray(String[]::new);
233+
assertHighlightOneDoc(
234+
mapperService,
235+
shardRequest,
236+
sourceToParse,
237+
SEMANTIC_FIELD_E5,
238+
expectedPassages.length,
239+
HighlightBuilder.Order.SCORE,
240+
expectedPassages
241+
);
242+
}
243+
244+
@SuppressWarnings("unchecked")
245+
public void testDenseVectorWithDiskBBQandSimilarityThreshold() throws Exception {
246+
var mapperService = createDefaultMapperService(useLegacyFormat);
247+
Map<String, Object> queryMap = (Map<String, Object>) queries.get("dense_vector_1");
248+
float[] vector = readDenseVector(queryMap.get("embeddings"));
249+
var fieldType = (SemanticTextFieldMapper.SemanticTextFieldType) mapperService.mappingLookup()
250+
.getFieldType(SEMANTIC_FIELD_E5_DISK_BBQ);
251+
252+
KnnVectorQueryBuilder knnQuery = new KnnVectorQueryBuilder(
253+
fieldType.getEmbeddingsField().fullPath(),
254+
vector,
255+
10,
256+
10,
257+
10f,
258+
null,
259+
0.85f
260+
);
261+
NestedQueryBuilder nestedQueryBuilder = new NestedQueryBuilder(fieldType.getChunksField().fullPath(), knnQuery, ScoreMode.Max);
262+
var shardRequest = createShardSearchRequest(nestedQueryBuilder);
263+
var sourceToParse = new SourceToParse("0", readSampleDoc(useLegacyFormat), XContentType.JSON);
264+
265+
String[] expectedPassages = ((List<String>) queryMap.get("expected_with_similarity_threshold")).toArray(String[]::new);
266+
assertHighlightOneDoc(
267+
mapperService,
268+
shardRequest,
269+
sourceToParse,
270+
SEMANTIC_FIELD_E5_DISK_BBQ,
271+
expectedPassages.length,
272+
HighlightBuilder.Order.SCORE,
273+
expectedPassages
274+
);
275+
}
276+
277+
@SuppressWarnings("unchecked")
278+
public void testDenseVectorWithDiskBBQ() throws Exception {
279+
var mapperService = createDefaultMapperService(useLegacyFormat);
280+
Map<String, Object> queryMap = (Map<String, Object>) queries.get("dense_vector_1");
281+
float[] vector = readDenseVector(queryMap.get("embeddings"));
282+
var fieldType = (SemanticTextFieldMapper.SemanticTextFieldType) mapperService.mappingLookup()
283+
.getFieldType(SEMANTIC_FIELD_E5_DISK_BBQ);
284+
285+
KnnVectorQueryBuilder knnQuery = new KnnVectorQueryBuilder(
286+
fieldType.getEmbeddingsField().fullPath(),
287+
vector,
288+
10,
289+
10,
290+
10f,
291+
null,
292+
null
293+
);
294+
NestedQueryBuilder nestedQueryBuilder = new NestedQueryBuilder(fieldType.getChunksField().fullPath(), knnQuery, ScoreMode.Max);
295+
var shardRequest = createShardSearchRequest(nestedQueryBuilder);
296+
var sourceToParse = new SourceToParse("0", readSampleDoc(useLegacyFormat), XContentType.JSON);
297+
298+
String[] expectedScorePassages = ((List<String>) queryMap.get("expected_by_score")).toArray(String[]::new);
299+
for (int i = 0; i < expectedScorePassages.length; i++) {
300+
assertHighlightOneDoc(
301+
mapperService,
302+
shardRequest,
303+
sourceToParse,
304+
SEMANTIC_FIELD_E5_DISK_BBQ,
305+
i + 1,
306+
HighlightBuilder.Order.SCORE,
307+
Arrays.copyOfRange(expectedScorePassages, 0, i + 1)
308+
);
309+
}
310+
311+
String[] expectedOffsetPassages = ((List<String>) queryMap.get("expected_by_offset")).toArray(String[]::new);
312+
assertHighlightOneDoc(
313+
mapperService,
314+
shardRequest,
315+
sourceToParse,
316+
SEMANTIC_FIELD_E5_DISK_BBQ,
317+
expectedOffsetPassages.length,
318+
HighlightBuilder.Order.NONE,
319+
expectedOffsetPassages
320+
);
321+
}
322+
211323
private MapperService createDefaultMapperService(boolean useLegacyFormat) throws IOException {
212324
var mappings = Streams.readFully(SemanticTextHighlighterTests.class.getResourceAsStream("mappings.json"));
213325
var settings = Settings.builder()

x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/mappings.json

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"properties": {
44
"body": {
55
"type": "text",
6-
"copy_to": ["body-elser", "body-e5"]
6+
"copy_to": ["body-elser", "body-e5", "body-e5-disk_bbq"]
77
},
88
"body-e5": {
99
"type": "semantic_text",
@@ -15,6 +15,21 @@
1515
"element_type": "float"
1616
}
1717
},
18+
"body-e5-disk_bbq": {
19+
"type": "semantic_text",
20+
"inference_id": ".multilingual-e5-small-elasticsearch",
21+
"model_settings": {
22+
"task_type": "text_embedding",
23+
"dimensions": 384,
24+
"similarity": "cosine",
25+
"element_type": "float"
26+
},
27+
"index_options": {
28+
"dense_vector": {
29+
"type": "bbq_disk"
30+
}
31+
}
32+
},
1833
"body-elser": {
1934
"type": "semantic_text",
2035
"inference_id": ".elser-2-elasticsearch",
@@ -24,4 +39,4 @@
2439
}
2540
}
2641
}
27-
}
42+
}

x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/queries.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,9 @@
399399
"After the marshland between the river Seine and its slower 'dead arm' to its north was filled in from around the 10th century, Paris's cultural centre began to move to the Right Bank. In 1137, a new city marketplace (today's Les Halles) replaced the two smaller ones on the Île de la Cité and Place de Grève (Place de l'Hôtel de Ville). The latter location housed the headquarters of Paris's river trade corporation, an organisation that later became, unofficially (although formally in later years), Paris's first municipal government.\n\n\nIn the late 12th century, Philip Augustus extended the Louvre fortress to defend the city against river invasions from the west, gave the city its first walls between 1190 and 1215, rebuilt its bridges to either side of its central island, and paved its main thoroughfares. In 1190, he transformed Paris's former cathedral school into a student-teacher corporation that would become the University of Paris and would draw students from all of Europe.\n\n\nWith 200,000 inhabitants in 1328, Paris, then already the capital of France, was the most populous city of Europe. By comparison, London in 1300 had 80,000 inhabitants. By the early fourteenth century, so much filth had collected inside urban Europe that French and Italian cities were naming streets after human waste. In medieval Paris, several street names were inspired by merde, the French word for \"shit\".\n\n\n",
400400
"In March 2001, Bertrand Delanoë became the first socialist mayor. He was re-elected in March 2008. In 2007, in an effort to reduce car traffic, he introduced the Vélib', a system which rents bicycles. Bertrand Delanoë also transformed a section of the highway along the Left Bank of the Seine into an urban promenade and park, the Promenade des Berges de la Seine, which he inaugurated in June 2013.\n\n\nIn 2007, President Nicolas Sarkozy launched the Grand Paris project, to integrate Paris more closely with the towns in the region around it. After many modifications, the new area, named the Metropolis of Grand Paris, with a population of 6.7 million, was created on 1 January 2016. In 2011, the City of Paris and the national government approved the plans for the Grand Paris Express, totalling 205 km (127 mi) of automated metro lines to connect Paris, the innermost three departments around Paris, airports and high-speed rail (TGV) stations, at an estimated cost of €35 billion. The system is scheduled to be completed by 2030.\n\n\nIn January 2015, Al-Qaeda in the Arabian Peninsula claimed attacks across the Paris region. 1.5 million people marched in Paris in a show of solidarity against terrorism and in support of freedom of speech. In November of the same year, terrorist attacks, claimed by ISIL, killed 130 people and injured more than 350.\n\n\n",
401401
"Bal-musette is a style of French music and dance that first became popular in Paris in the 1870s and 1880s; by 1880 Paris had some 150 dance halls. Patrons danced the bourrée to the accompaniment of the cabrette (a bellows-blown bagpipe locally called a \"musette\") and often the vielle à roue (hurdy-gurdy) in the cafés and bars of the city. Parisian and Italian musicians who played the accordion adopted the style and established themselves in Auvergnat bars, and Paris became a major centre for jazz and still attracts jazz musicians from all around the world to its clubs and cafés.\n\n\nParis is the spiritual home of gypsy jazz in particular, and many of the Parisian jazzmen who developed in the first half of the 20th century began by playing Bal-musette in the city. Django Reinhardt rose to fame in Paris, having moved to the 18th arrondissement in a caravan as a young boy, and performed with violinist Stéphane Grappelli and their Quintette du Hot Club de France in the 1930s and 1940s.\n\n\nImmediately after the War the Saint-Germain-des-Pres quarter and the nearby Saint-Michel quarter became home to many small jazz clubs, including the Caveau des Lorientais, the Club Saint-Germain, the Rose Rouge, the Vieux-Colombier, and the most famous, Le Tabou. They introduced Parisians to the music of Claude Luter, Boris Vian, Sydney Bechet, Mezz Mezzrow, and Henri Salvador. "
402+
],
403+
"expected_with_similarity_threshold": [
404+
"\nParis (.mw-parser-output .IPA-label-small{font-size:85%}.mw-parser-output .references .IPA-label-small,.mw-parser-output .infobox .IPA-label-small,.mw-parser-output .navbox .IPA-label-small{font-size:100%}French pronunciation: ⓘ) is the capital and largest city of France. With an estimated population of 2,102,650 residents in January 2023 in an area of more than 105 km2 (41 sq mi), Paris is the fourth-largest city in the European Union and the 30th most densely populated city in the world in 2022. Since the 17th century, Paris has been one of the world's major centres of finance, diplomacy, commerce, culture, fashion, and gastronomy. Because of its leading role in the arts and sciences and its early adaptation of extensive street lighting, it became known as the City of Light in the 19th century.\n\n\nThe City of Paris is the centre of the Île-de-France region, or Paris Region, with an official estimated population of 12,271,794 inhabitants in January 2023, or about 19% of the population of France. The Paris Region had a nominal GDP of €765 billion (US$1.064 trillion when adjusted for PPP) in 2021, the highest in the European Union. According to the Economist Intelligence Unit Worldwide Cost of Living Survey, in 2022, Paris was the city with the ninth-highest cost of living in the world.\n\n\n"
402405
]
403406
},
404407
"sparse_vector_1": {
@@ -464,4 +467,4 @@
464467
"Diderot and D'Alembert published their Encyclopédie in 1751, before the Montgolfier Brothers launched the first manned flight in a hot air balloon on 21 November 1783. Paris was the financial capital of continental Europe, as well the primary European centre for book publishing, fashion and the manufacture of fine furniture and luxury goods. On 22 October 1797, Paris was also the site of the first parachute jump in history, by Garnerin.\n\n\nIn the summer of 1789, Paris became the centre stage of the French Revolution. On 14 July, a mob seized the arsenal at the Invalides, acquiring thousands of guns, with which it stormed the Bastille, a principal symbol of royal authority. The first independent Paris Commune, or city council, met in the Hôtel de Ville and elected a Mayor, the astronomer Jean Sylvain Bailly, on 15 July.\n\n\nLouis XVI and the royal family were brought to Paris and incarcerated in the Tuileries Palace. In 1793, as the revolution turned increasingly radical, the king, queen and mayor were beheaded by guillotine in the Reign of Terror, along with more than 16,000 others throughout France. The property of the aristocracy and the church was nationalised, and the city's churches were closed, sold or demolished. A succession of revolutionary factions ruled Paris until 9 November 1799 (coup d'état du 18 brumaire), when Napoleon Bonaparte seized power as First Consul.\n\n\n"
465468
]
466469
}
467-
}
470+
}

0 commit comments

Comments
 (0)