Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
9d6a32e
Added check for blank string to skip generating embeddings with unit …
Samiul-TheSoccerFan Feb 28, 2025
16f0b5a
Adding yaml tests for skipping embedding generation
Samiul-TheSoccerFan Feb 28, 2025
96605bb
Merge branch 'main' into handle-empty-input-inference
elasticmachine Mar 3, 2025
6403aa0
dynamic update not required if model_settings stays null
Samiul-TheSoccerFan Mar 5, 2025
6e0d484
Updating node feature for handling empty input name and description
Samiul-TheSoccerFan Mar 5, 2025
aeaf117
Update yaml tests with refresh=true
Samiul-TheSoccerFan Mar 5, 2025
bb99b3b
Update unit test to follow more accurate behavior
Samiul-TheSoccerFan Mar 5, 2025
6509870
Added yaml tests for multi chunks
Samiul-TheSoccerFan Mar 5, 2025
3c4c3ed
[CI] Auto commit changes from spotless
Mar 5, 2025
f085df3
Merge branch 'main' into handle-empty-input-inference
elasticmachine Mar 5, 2025
f7d9359
Adding highlighter yaml tests for empty input
Samiul-TheSoccerFan Mar 5, 2025
285226a
Update docs/changelog/123763.yaml
Samiul-TheSoccerFan Mar 5, 2025
43406db
Update changelog and test reason to have more polished documentation
Samiul-TheSoccerFan Mar 6, 2025
78c5e12
adding input value into the response source and fixing unit tests by …
Samiul-TheSoccerFan Mar 6, 2025
33a533a
Adding highlighter test for backward compatibility and refactor existi…
Samiul-TheSoccerFan Mar 6, 2025
cd15c9e
Added bwc tests for empty input and multi chunks
Samiul-TheSoccerFan Mar 7, 2025
2fb0092
Removed reindex for empty input from bwc
Samiul-TheSoccerFan Mar 7, 2025
1a275db
[CI] Auto commit changes from spotless
Mar 7, 2025
7486fe8
Merge branch 'main' into handle-empty-input-inference
elasticmachine Mar 7, 2025
6123d1a
Fixing yaml test
Samiul-TheSoccerFan Mar 7, 2025
d31d281
Update unit tests helper function to support both format
Samiul-TheSoccerFan Mar 7, 2025
78a390c
[CI] Auto commit changes from spotless
Mar 7, 2025
09a298a
Adding cluster features for bwc
Samiul-TheSoccerFan Mar 7, 2025
72886bf
Centralize logic for assertInference helper
Samiul-TheSoccerFan Mar 7, 2025
1179f84
resolve conflicts from main
Samiul-TheSoccerFan Mar 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/changelog/123763.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
pr: 123763
summary: Handle empty input inference
summary: Skip semantic_text embedding generation when no content is provided.
area: Relevance
type: enhancement
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,7 @@ private Map<String, List<FieldInferenceRequest>> createFieldInferenceRequests(Bu
for (String v : values) {
if (v.isBlank()) {
slot.addOrUpdateResponse(
new FieldInferenceResponse(field, sourceField, null, order++, 0, null, EMPTY_CHUNKED_INFERENCE)
new FieldInferenceResponse(field, sourceField, v, order++, 0, null, EMPTY_CHUNKED_INFERENCE)
);
} else {
fieldRequests.add(new FieldInferenceRequest(itemIndex, field, sourceField, v, order++, offsetAdjustment));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ public void testExplicitNull() throws Exception {
// item 3
assertNull(bulkShardRequest.items()[3].getPrimaryResponse());
actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[3].request());
assertInferenceResults(useLegacyFormat, actualRequest, "obj.field1", EXPLICIT_NULL, 0);
assertInferenceResults(useLegacyFormat, actualRequest, "obj.field1", EXPLICIT_NULL, useLegacyFormat ? null : 0);

// item 4
assertNull(bulkShardRequest.items()[4].getPrimaryResponse());
Expand Down Expand Up @@ -381,18 +381,23 @@ public void testHandleEmptyInput() throws Exception {
ActionFilterChain actionFilterChain = (task, action, request, listener) -> {
try {
BulkShardRequest bulkShardRequest = (BulkShardRequest) request;
IndexRequest actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[0].request());
assertNull(bulkShardRequest.getInferenceFieldMap());
assertThat(bulkShardRequest.items().length, equalTo(3));

// Create with Empty string
assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", useLegacyFormat ? EXPLICIT_NULL : "", 0);
assertNull(bulkShardRequest.items()[0].getPrimaryResponse());
IndexRequest actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[0].request());
assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", "", 0);

// Create with whitespace only
assertNull(bulkShardRequest.items()[1].getPrimaryResponse());
actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[1].request());
assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", useLegacyFormat ? EXPLICIT_NULL : " ", 0);
assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", " ", 0);

// Update with multiple Whitespaces
assertNull(bulkShardRequest.items()[2].getPrimaryResponse());
actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[2].request());
assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", useLegacyFormat ? EXPLICIT_NULL : " ", 0);
assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", " ", 0);
} finally {
chainExecuted.countDown();
}
Expand Down Expand Up @@ -637,7 +642,7 @@ private static void assertInferenceResults(
IndexRequest request,
String fieldName,
Object expectedOriginalValue,
int expectedChunkCount
Integer expectedChunkCount
) {
final Map<String, Object> requestMap = request.sourceAsMap();
if (useLegacyFormat) {
Expand All @@ -647,12 +652,11 @@ private static void assertInferenceResults(
);

List<Object> chunks = (List<Object>) XContentMapValues.extractValue(getChunksFieldName(fieldName), requestMap);
if (expectedChunkCount > 0) {
if (expectedChunkCount == null) {
assertNull(chunks);
} else {
assertNotNull(chunks);
assertThat(chunks.size(), equalTo(expectedChunkCount));
} else {
// If the expected chunk count is 0, we expect that no inference has been performed.
assertTrue(chunks == null || chunks.isEmpty());
}
} else {
assertThat(XContentMapValues.extractValue(fieldName, requestMap, EXPLICIT_NULL), equalTo(expectedOriginalValue));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1010,7 +1010,7 @@ setup:
"Empty semantic_text field skips embedding generation":
- requires:
cluster_features: "semantic_text.handle_empty_input"
reason: skips generating embeddings when semantic_text field is contains empty or whitespace only input
reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.

- do:
index:
Expand All @@ -1020,28 +1020,10 @@ setup:
sparse_field: ""
refresh: true

- do:
search:
index: test-index
body:
fields: [ _inference_fields ]
query:
match_all: { }

- match: { hits.total.value: 1 }
- match: { hits.hits.0._source.sparse_field: "" }
- not_exists: hits.hits.0._source._inference_fields

---
"Whitespace-Only semantic_text field skips embedding generation":
- requires:
cluster_features: "semantic_text.handle_empty_input"
reason: skips generating embeddings when semantic_text field is contains empty or whitespace only input

- do:
index:
index: test-index
id: doc_1
id: doc_2
body:
sparse_field: " "
refresh: true
Expand All @@ -1054,15 +1036,17 @@ setup:
query:
match_all: { }

- match: { hits.total.value: 1 }
- match: { hits.hits.0._source.sparse_field: " " }
- match: { hits.total.value: 2 }
- match: { hits.hits.0._source.sparse_field: "" }
- match: { hits.hits.1._source.sparse_field: " " }
- not_exists: hits.hits.0._source._inference_fields
- not_exists: hits.hits.1._source._inference_fields

---
"Reindexing with empty or whitespace semantic_text skips embedding generation":
- requires:
cluster_features: "semantic_text.handle_empty_input"
reason: skips generating embeddings when semantic_text field is contains empty or whitespace only input
reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.

- do:
index:
Expand Down Expand Up @@ -1118,7 +1102,7 @@ setup:
"Empty Multi-Field skips embedding generation":
- requires:
cluster_features: "semantic_text.handle_empty_input"
reason: skips generating embeddings when semantic_text field is contains empty or whitespace only input
reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.

- do:
indices.create:
Expand Down Expand Up @@ -1164,7 +1148,7 @@ setup:
"Multi chunks skips empty input embedding generation":
- requires:
cluster_features: "semantic_text.handle_empty_input"
reason: skips generating embeddings when semantic_text field is contains empty or whitespace only input
reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.

- do:
index:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -675,3 +675,67 @@ setup:

- match: { hits.total.value: 1 }
- not_exists: hits.hits.0._source._inference_fields

---
"Empty semantic_text field skips embedding generation":
  # Run only against clusters that advertise the empty-input handling feature.
  - requires:
      cluster_features: "semantic_text.handle_empty_input"
      reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.

  # doc_1: the semantic_text field is an empty string.
  - do:
      index:
        index: test-index
        id: doc_1
        body:
          sparse_field: ""
        refresh: true

  # doc_2: the semantic_text field contains only whitespace.
  - do:
      index:
        index: test-index
        id: doc_2
        body:
          sparse_field: " "
        refresh: true

  - do:
      search:
        index: test-index
        body:
          query:
            match_all: { }

  # Both documents must keep their original text and carry an empty chunk list.
  # Each hit is checked on its own index: the chunk-length assertion for the
  # second document previously pointed at hits.hits.0 again, which left doc_2's
  # chunks unverified.
  - match: { hits.total.value: 2 }
  - match: { hits.hits.0._source.sparse_field.text: "" }
  - length: { hits.hits.0._source.sparse_field.inference.chunks: 0 }
  - match: { hits.hits.1._source.sparse_field.text: " " }
  - length: { hits.hits.1._source.sparse_field.inference.chunks: 0 }

---
"Multi chunks skips empty input embedding generation":
# Run only against clusters that advertise the semantic_text.handle_empty_input feature.
- requires:
cluster_features: "semantic_text.handle_empty_input"
reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.

# Index a single document whose sparse_field holds three values; the middle value is whitespace only.
- do:
index:
index: test-index
id: doc_1
body:
sparse_field: ["some test data", " ", "now with chunks"]
refresh: true

# Fetch every document in test-index so the stored _source can be inspected.
- do:
search:
index: test-index
body:
query:
match_all: { }

- match: { hits.total.value: 1 }

# Only two chunks are expected, one per non-blank input value and in input order,
# each with embeddings present; the whitespace-only value contributes no chunk.
- length: { hits.hits.0._source.sparse_field.inference.chunks: 2 }
- match: { hits.hits.0._source.sparse_field.inference.chunks.0.text: "some test data" }
- exists: hits.hits.0._source.sparse_field.inference.chunks.0.embeddings
- match: { hits.hits.0._source.sparse_field.inference.chunks.1.text: "now with chunks" }
- exists: hits.hits.0._source.sparse_field.inference.chunks.1.embeddings
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ setup:
"Highlighting and multi chunks with empty input":
- requires:
cluster_features: "semantic_text.handle_empty_input"
reason: skips generating embeddings when semantic_text field is contains empty or whitespace only input
reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.

- do:
indices.create:
Expand Down Expand Up @@ -329,70 +329,10 @@ setup:
fields:
semantic_text_field:
type: "semantic"
number_of_fragments: 1
number_of_fragments: 3

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.semantic_text_field: 1 }
- match: { hits.hits.0.highlight.semantic_text_field.0: "now with chunks" }

- do:
search:
index: test-multi-chunk-index
body:
query:
semantic:
field: "semantic_text_field"
query: "test"
highlight:
fields:
semantic_text_field:
type: "semantic"
number_of_fragments: 2

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.semantic_text_field: 2}
- length: { hits.hits.0.highlight.semantic_text_field: 2 }
- match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
- match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }

- do:
search:
index: test-multi-chunk-index
body:
query:
semantic:
field: "semantic_text_field"
query: "test"
highlight:
fields:
semantic_text_field:
type: "semantic"
order: "score"
number_of_fragments: 1

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.semantic_text_field: 1 }
- match: { hits.hits.0.highlight.semantic_text_field.0: "now with chunks" }

- do:
search:
index: test-multi-chunk-index
body:
query:
semantic:
field: "semantic_text_field"
query: "test"
highlight:
fields:
semantic_text_field:
type: "semantic"
order: "score"
number_of_fragments: 2

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.semantic_text_field: 2}
- match: { hits.hits.0.highlight.semantic_text_field.0: "now with chunks" }
- match: { hits.hits.0.highlight.semantic_text_field.1: "some test data" }
Original file line number Diff line number Diff line change
Expand Up @@ -243,4 +243,44 @@ setup:
- match: { hits.hits.0.highlight.body.0: "You Know, for Search!" }
- match: { hits.hits.0.highlight.body.1: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }

---
"Highlighting and multi chunks with empty input":
# NOTE(review): unlike the other empty-input tests, this one declares no `requires`
# guard on semantic_text.handle_empty_input — confirm that is intentional.
# Create an index that uses the legacy semantic_text format, with one semantic_text
# field bound to the sparse-inference-id inference endpoint.
- do:
indices.create:
index: test-multi-chunk-index
body:
settings:
index.mapping.semantic_text.use_legacy_format: true
mappings:
properties:
semantic_text_field:
type: semantic_text
inference_id: sparse-inference-id

# Index a single document with three values; the middle value is whitespace only.
- do:
index:
index: test-multi-chunk-index
id: doc_1
body:
semantic_text_field: ["some test data", " ", "now with chunks"]
refresh: true

# Run a semantic query for "test" and request three fragments from the semantic highlighter.
- do:
search:
index: test-multi-chunk-index
body:
query:
semantic:
field: "semantic_text_field"
query: "test"
highlight:
fields:
semantic_text_field:
type: "semantic"
number_of_fragments: 3

# Three fragments were requested but only two are expected back: the two
# non-blank input values, in input order.
- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.semantic_text_field: 2 }
- match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
- match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }