Skip to content

Commit be082ac

Browse files
Semantic_text match_all with Highlighter (elastic#128702) (elastic#128903)
* initial implementation for match_all
* reformat
* [CI] Auto commit changes from spotless
* Excluding matchAll interceptor
* Adding matchAllDocs support for vector fields
* [CI] Auto commit changes from spotless
* Remove previous implementation
* Adding yaml tests for match_all
* fixed yaml tests
* Update docs/changelog/128702.yaml
* Update changelog
* changelog - update summary
* Fix wrong inference names for the yaml tests

---------

Co-authored-by: elasticsearchmachine <[email protected]>
Co-authored-by: Elastic Machine <[email protected]>

(cherry picked from commit d1b5532)

# Conflicts:
#   x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java
1 parent 72026e3 commit be082ac

File tree

5 files changed

+295
-1
lines changed

5 files changed

+295
-1
lines changed

docs/changelog/128702.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 128702
2+
summary: Fix missing highlighting in `match_all` queries for `semantic_text` fields
3+
area: Search
4+
type: bug
5+
issues: []

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ public Set<NodeFeature> getFeatures() {
4343
private static final NodeFeature TEST_RERANKING_SERVICE_PARSE_TEXT_AS_SCORE = new NodeFeature(
4444
"test_reranking_service.parse_text_as_score"
4545
);
46+
private static final NodeFeature SEMANTIC_TEXT_MATCH_ALL_HIGHLIGHTER = new NodeFeature("semantic_text.match_all_highlighter");
4647

4748
@Override
4849
public Set<NodeFeature> getTestFeatures() {
@@ -65,7 +66,8 @@ public Set<NodeFeature> getTestFeatures() {
6566
TEST_RERANKING_SERVICE_PARSE_TEXT_AS_SCORE,
6667
SemanticTextFieldMapper.SEMANTIC_TEXT_BIT_VECTOR_SUPPORT,
6768
SemanticTextFieldMapper.SEMANTIC_TEXT_HANDLE_EMPTY_INPUT,
68-
SEMANTIC_TEXT_SUPPORT_CHUNKING_CONFIG
69+
SEMANTIC_TEXT_SUPPORT_CHUNKING_CONFIG,
70+
SEMANTIC_TEXT_MATCH_ALL_HIGHLIGHTER
6971
);
7072
}
7173
}

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighter.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import org.apache.lucene.search.IndexSearcher;
1616
import org.apache.lucene.search.KnnByteVectorQuery;
1717
import org.apache.lucene.search.KnnFloatVectorQuery;
18+
import org.apache.lucene.search.MatchAllDocsQuery;
1819
import org.apache.lucene.search.Query;
1920
import org.apache.lucene.search.QueryVisitor;
2021
import org.apache.lucene.search.ScoreMode;
@@ -267,6 +268,8 @@ public void visitLeaf(Query query) {
267268
queries.add(fieldType.createExactKnnQuery(VectorData.fromFloats(knnQuery.getTargetCopy()), null));
268269
} else if (query instanceof KnnByteVectorQuery knnQuery) {
269270
queries.add(fieldType.createExactKnnQuery(VectorData.fromBytes(knnQuery.getTargetCopy()), null));
271+
} else if (query instanceof MatchAllDocsQuery) {
272+
queries.add(new MatchAllDocsQuery());
270273
}
271274
}
272275
});
@@ -293,6 +296,13 @@ public QueryVisitor getSubVisitor(BooleanClause.Occur occur, Query parent) {
293296
}
294297
return this;
295298
}
299+
300+
@Override
301+
public void visitLeaf(Query query) {
302+
if (query instanceof MatchAllDocsQuery) {
303+
queries.add(new MatchAllDocsQuery());
304+
}
305+
}
296306
});
297307
return queries;
298308
}

x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,3 +336,133 @@ setup:
336336
- length: { hits.hits.0.highlight.semantic_text_field: 2 }
337337
- match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
338338
- match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }
339+
340+
---
341+
"Highlighting with match_all query":
342+
- requires:
343+
cluster_features: "semantic_text.match_all_highlighter"
344+
reason: semantic text field supports match_all query with semantic highlighter, effective from 8.19 and 9.1.0.
345+
346+
- do:
347+
search:
348+
index: test-sparse-index
349+
body:
350+
query:
351+
match_all: {}
352+
highlight:
353+
fields:
354+
body:
355+
type: "semantic"
356+
number_of_fragments: 2
357+
358+
- match: { hits.total.value: 1 }
359+
- match: { hits.hits.0._id: "doc_1" }
360+
- length: { hits.hits.0.highlight.body: 2 }
361+
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
362+
- match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }
363+
364+
- do:
365+
search:
366+
index: test-dense-index
367+
body:
368+
query:
369+
match_all: {}
370+
highlight:
371+
fields:
372+
body:
373+
type: "semantic"
374+
number_of_fragments: 2
375+
376+
- match: { hits.total.value: 1 }
377+
- match: { hits.hits.0._id: "doc_1" }
378+
- length: { hits.hits.0.highlight.body: 2 }
379+
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
380+
- match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }
381+
382+
---
383+
"Highlighting with match_all and multi chunks with empty input":
384+
- requires:
385+
cluster_features: "semantic_text.match_all_highlighter"
386+
reason: semantic text field supports match_all query with semantic highlighter, effective from 8.19 and 9.1.0.
387+
388+
- do:
389+
indices.create:
390+
index: test-index-sparse
391+
body:
392+
settings:
393+
index.mapping.semantic_text.use_legacy_format: false
394+
mappings:
395+
properties:
396+
semantic_text_field:
397+
type: semantic_text
398+
inference_id: sparse-inference-id
399+
text_field:
400+
type: text
401+
402+
- do:
403+
index:
404+
index: test-index-sparse
405+
id: doc_1
406+
body:
407+
semantic_text_field: [ "some test data", " ", "now with chunks" ]
408+
text_field: "some test data"
409+
refresh: true
410+
411+
- do:
412+
search:
413+
index: test-index-sparse
414+
body:
415+
query:
416+
match_all: {}
417+
highlight:
418+
fields:
419+
semantic_text_field:
420+
type: "semantic"
421+
number_of_fragments: 2
422+
423+
- match: { hits.total.value: 1 }
424+
- match: { hits.hits.0._id: "doc_1" }
425+
- length: { hits.hits.0.highlight.semantic_text_field: 2 }
426+
- match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
427+
- match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }
428+
429+
- do:
430+
indices.create:
431+
index: test-index-dense
432+
body:
433+
settings:
434+
index.mapping.semantic_text.use_legacy_format: false
435+
mappings:
436+
properties:
437+
semantic_text_field:
438+
type: semantic_text
439+
inference_id: dense-inference-id
440+
text_field:
441+
type: text
442+
443+
- do:
444+
index:
445+
index: test-index-dense
446+
id: doc_1
447+
body:
448+
semantic_text_field: [ "some test data", " ", "now with chunks" ]
449+
text_field: "some test data"
450+
refresh: true
451+
452+
- do:
453+
search:
454+
index: test-index-dense
455+
body:
456+
query:
457+
match_all: {}
458+
highlight:
459+
fields:
460+
semantic_text_field:
461+
type: "semantic"
462+
number_of_fragments: 2
463+
464+
- match: { hits.total.value: 1 }
465+
- match: { hits.hits.0._id: "doc_1" }
466+
- length: { hits.hits.0.highlight.semantic_text_field: 2 }
467+
- match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
468+
- match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }

x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter_bwc.yml

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,3 +288,150 @@ setup:
288288
- length: { hits.hits.0.highlight.semantic_text_field: 2 }
289289
- match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
290290
- match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }
291+
292+
---
293+
"Highlighting with match_all query":
294+
- requires:
295+
cluster_features: "semantic_text.match_all_highlighter"
296+
reason: semantic text field supports match_all query with semantic highlighter, effective from 8.19 and 9.1.0.
297+
298+
- do:
299+
index:
300+
index: test-sparse-index
301+
id: doc_1
302+
body:
303+
body: [ "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!" ]
304+
refresh: true
305+
306+
- do:
307+
search:
308+
index: test-sparse-index
309+
body:
310+
query:
311+
match_all: {}
312+
highlight:
313+
fields:
314+
body:
315+
type: "semantic"
316+
number_of_fragments: 2
317+
318+
- match: { hits.total.value: 1 }
319+
- match: { hits.hits.0._id: "doc_1" }
320+
- length: { hits.hits.0.highlight.body: 2 }
321+
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
322+
- match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }
323+
324+
- do:
325+
index:
326+
index: test-dense-index
327+
id: doc_1
328+
body:
329+
body: [ "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!" ]
330+
refresh: true
331+
332+
- do:
333+
search:
334+
index: test-dense-index
335+
body:
336+
query:
337+
match_all: {}
338+
highlight:
339+
fields:
340+
body:
341+
type: "semantic"
342+
number_of_fragments: 2
343+
344+
- match: { hits.total.value: 1 }
345+
- match: { hits.hits.0._id: "doc_1" }
346+
- length: { hits.hits.0.highlight.body: 2 }
347+
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
348+
- match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }
349+
350+
---
351+
"Highlighting with match_all and multi chunks with empty input":
352+
- requires:
353+
cluster_features: "semantic_text.match_all_highlighter"
354+
reason: semantic text field supports match_all query with semantic highlighter, effective from 8.19 and 9.1.0.
355+
356+
- do:
357+
indices.create:
358+
index: test-index-sparse
359+
body:
360+
settings:
361+
index.mapping.semantic_text.use_legacy_format: true
362+
mappings:
363+
properties:
364+
semantic_text_field:
365+
type: semantic_text
366+
inference_id: sparse-inference-id
367+
text_field:
368+
type: text
369+
370+
- do:
371+
index:
372+
index: test-index-sparse
373+
id: doc_1
374+
body:
375+
semantic_text_field: [ "some test data", " ", "now with chunks" ]
376+
text_field: "some test data"
377+
refresh: true
378+
379+
- do:
380+
search:
381+
index: test-index-sparse
382+
body:
383+
query:
384+
match_all: {}
385+
highlight:
386+
fields:
387+
semantic_text_field:
388+
type: "semantic"
389+
number_of_fragments: 2
390+
391+
- match: { hits.total.value: 1 }
392+
- match: { hits.hits.0._id: "doc_1" }
393+
- length: { hits.hits.0.highlight.semantic_text_field: 2 }
394+
- match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
395+
- match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }
396+
397+
- do:
398+
indices.create:
399+
index: test-index-dense
400+
body:
401+
settings:
402+
index.mapping.semantic_text.use_legacy_format: true
403+
mappings:
404+
properties:
405+
semantic_text_field:
406+
type: semantic_text
407+
inference_id: dense-inference-id
408+
text_field:
409+
type: text
410+
411+
- do:
412+
index:
413+
index: test-index-dense
414+
id: doc_1
415+
body:
416+
semantic_text_field: [ "some test data", " ", "now with chunks" ]
417+
text_field: "some test data"
418+
refresh: true
419+
420+
- do:
421+
search:
422+
index: test-index-dense
423+
body:
424+
query:
425+
match_all: {}
426+
highlight:
427+
fields:
428+
semantic_text_field:
429+
type: "semantic"
430+
number_of_fragments: 2
431+
432+
- match: { hits.total.value: 1 }
433+
- match: { hits.hits.0._id: "doc_1" }
434+
- length: { hits.hits.0.highlight.semantic_text_field: 2 }
435+
- match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
436+
- match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }
437+

0 commit comments

Comments
 (0)