diff --git a/docs/changelog/123763.yaml b/docs/changelog/123763.yaml
new file mode 100644
index 0000000000000..3b9428e0de7bc
--- /dev/null
+++ b/docs/changelog/123763.yaml
@@ -0,0 +1,5 @@
+pr: 123763
+summary: Skip semantic_text embedding generation when no content is provided.
+area: Relevance
+type: enhancement
+issues: []
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java
index 8ebd439973dc3..1015fae1449e1 100644
--- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java
+++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java
@@ -50,7 +50,8 @@ public Set<NodeFeature> getTestFeatures() {
             SEMANTIC_TEXT_HIGHLIGHTER_DEFAULT,
             SEMANTIC_KNN_FILTER_FIX,
             TEST_RERANKING_SERVICE_PARSE_TEXT_AS_SCORE,
-            SemanticTextFieldMapper.SEMANTIC_TEXT_BIT_VECTOR_SUPPORT
+            SemanticTextFieldMapper.SEMANTIC_TEXT_BIT_VECTOR_SUPPORT,
+            SemanticTextFieldMapper.SEMANTIC_TEXT_HANDLE_EMPTY_INPUT
         );
     }
 }
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java
index b8d2e04ed4628..16336d941fd9d 100644
--- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java
+++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java
@@ -563,7 +563,7 @@ private Map<String, List<FieldInferenceRequest>> createFieldInferenceRequests(Bu
                 }
                 continue;
             }
-            ensureResponseAccumulatorSlot(itemIndex);
+            var slot = ensureResponseAccumulatorSlot(itemIndex);
             final List<String> values;
             try {
                 values = SemanticTextUtils.nodeStringValues(field, valueObj);
@@ -580,7 +580,13 @@ private Map<String, List<FieldInferenceRequest>> createFieldInferenceRequests(Bu
             List<FieldInferenceRequest> fieldRequests = fieldRequestsMap.computeIfAbsent(inferenceId, k -> new ArrayList<>());
             int offsetAdjustment = 0;
             for (String v : values) {
-                fieldRequests.add(new FieldInferenceRequest(itemIndex, field, sourceField, v, order++, offsetAdjustment));
+                if (v.isBlank()) {
+                    slot.addOrUpdateResponse(
+                        new FieldInferenceResponse(field, sourceField, v, order++, 0, null, EMPTY_CHUNKED_INFERENCE)
+                    );
+                } else {
+                    fieldRequests.add(new FieldInferenceRequest(itemIndex, field, sourceField, v, order++, offsetAdjustment));
+                }
                 // When using the inference metadata fields format, all the input values are concatenated so that the
                 // chunk text offsets are expressed in the context of a single string. Calculate the offset adjustment
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java
index c482b3d7de439..8079df3d7b16d 100644
--- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java
+++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java
@@ -117,6 +117,7 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
     public static final NodeFeature SEMANTIC_TEXT_ALWAYS_EMIT_INFERENCE_ID_FIX = new NodeFeature(
         "semantic_text.always_emit_inference_id_fix"
     );
+    public static final NodeFeature SEMANTIC_TEXT_HANDLE_EMPTY_INPUT = new NodeFeature("semantic_text.handle_empty_input");
     public static final NodeFeature SEMANTIC_TEXT_SKIP_INFERENCE_FIELDS = new NodeFeature("semantic_text.skip_inference_fields");
     public static final NodeFeature SEMANTIC_TEXT_BIT_VECTOR_SUPPORT = new NodeFeature("semantic_text.bit_vector_support");
 
@@ -403,7 +404,7 @@ void parseCreateFieldFromContext(DocumentParserContext context, SemanticTextFiel
         }
 
         final SemanticTextFieldMapper mapper;
-        if (fieldType().getModelSettings() == null) {
+        if (fieldType().getModelSettings() == null && field.inference().modelSettings() != null) {
             mapper = addDynamicUpdate(context, field);
         } else {
             Conflicts conflicts = new Conflicts(fullFieldName);
diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java
index 15595e1539636..61b7e08b6fbaa 100644
--- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java
+++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java
@@ -335,7 +335,7 @@ public void testExplicitNull() throws Exception {
         // item 3
         assertNull(bulkShardRequest.items()[3].getPrimaryResponse());
         actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[3].request());
-        assertInferenceResults(useLegacyFormat, actualRequest, "obj.field1", EXPLICIT_NULL, 0);
+        assertInferenceResults(useLegacyFormat, actualRequest, "obj.field1", EXPLICIT_NULL, null);
 
         // item 4
         assertNull(bulkShardRequest.items()[4].getPrimaryResponse());
@@ -368,6 +368,59 @@ public void testExplicitNull() throws Exception {
         awaitLatch(chainExecuted, 10, TimeUnit.SECONDS);
     }
 
+    @SuppressWarnings({ "unchecked", "rawtypes" })
+    public void testHandleEmptyInput() throws Exception {
+        StaticModel model = StaticModel.createRandomInstance();
+        ShardBulkInferenceActionFilter filter = createFilter(
+            threadPool,
+            Map.of(model.getInferenceEntityId(), model),
+            randomIntBetween(1, 10),
+            useLegacyFormat,
+            true
+        );
+
+        CountDownLatch chainExecuted = new CountDownLatch(1);
+        ActionFilterChain actionFilterChain = (task, action, request, listener) -> {
+            try {
+                BulkShardRequest bulkShardRequest = (BulkShardRequest) request;
+                assertNull(bulkShardRequest.getInferenceFieldMap());
+                assertThat(bulkShardRequest.items().length, equalTo(3));
+
+                // Create with empty string
+                assertNull(bulkShardRequest.items()[0].getPrimaryResponse());
+                IndexRequest actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[0].request());
+                assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", "", 0);
+
+                // Create with whitespace only
+                assertNull(bulkShardRequest.items()[1].getPrimaryResponse());
+                actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[1].request());
+                assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", " ", 0);
+
+                // Update with whitespace only
+                assertNull(bulkShardRequest.items()[2].getPrimaryResponse());
+                actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[2].request());
+                assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", " ", 0);
+            } finally {
+                chainExecuted.countDown();
+            }
+        };
+        ActionListener actionListener = mock(ActionListener.class);
+        Task task = mock(Task.class);
+        Map<String, InferenceFieldMetadata> inferenceFieldMap = Map.of(
+            "semantic_text_field",
+            new InferenceFieldMetadata("semantic_text_field", model.getInferenceEntityId(), new String[] { "semantic_text_field" })
+        );
+
+        BulkItemRequest[] items = new BulkItemRequest[3];
+        items[0] = new BulkItemRequest(0, new IndexRequest("index").source(Map.of("semantic_text_field", "")));
+        items[1] = new BulkItemRequest(1, new IndexRequest("index").source(Map.of("semantic_text_field", " ")));
+        items[2] = new BulkItemRequest(2, new UpdateRequest().doc(new IndexRequest("index").source(Map.of("semantic_text_field", " "))));
+        BulkShardRequest request = new BulkShardRequest(new ShardId("test", "test", 0), WriteRequest.RefreshPolicy.NONE, items);
+        request.setInferenceFieldMap(inferenceFieldMap);
+        filter.apply(task, TransportShardBulkAction.ACTION_NAME, request, actionListener, actionFilterChain);
+        awaitLatch(chainExecuted, 10, TimeUnit.SECONDS);
+    }
+
     @SuppressWarnings({ "unchecked", "rawtypes" })
     public void testManyRandomDocs() throws Exception {
         Map<String, StaticModel> inferenceModelMap = new HashMap<>();
@@ -591,7 +644,7 @@ private static void assertInferenceResults(
         IndexRequest request,
         String fieldName,
         Object expectedOriginalValue,
-        int expectedChunkCount
+        Integer expectedChunkCount
     ) {
         final Map<String, Object> requestMap = request.sourceAsMap();
         if (useLegacyFormat) {
@@ -601,13 +654,11 @@ private static void assertInferenceResults(
             );
 
             List chunks = (List) XContentMapValues.extractValue(getChunksFieldName(fieldName), requestMap);
-            if (expectedChunkCount > 0) {
+            if (expectedChunkCount == null) {
+                assertNull(chunks);
+            } else {
                 assertNotNull(chunks);
                 assertThat(chunks.size(), equalTo(expectedChunkCount));
-            } else {
-                // If the expected chunk count is 0, we expect that no inference has been performed. In this case, the source should not be
-                // transformed, and thus the semantic text field structure should not be created.
-                assertNull(chunks);
             }
         } else {
             assertThat(XContentMapValues.extractValue(fieldName, requestMap, EXPLICIT_NULL), equalTo(expectedOriginalValue));
@@ -627,8 +678,11 @@ private static void assertInferenceResults(
             inferenceMetadataFields,
             EXPLICIT_NULL
         );
+
+        // When using the new format, the chunks field should always exist
+        int expectedSize = expectedChunkCount == null ? 0 : expectedChunkCount;
         assertNotNull(chunks);
-        assertThat(chunks.size(), equalTo(expectedChunkCount));
+        assertThat(chunks.size(), equalTo(expectedSize));
     }
 }
diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference.yml
index 5a03418f2042e..68c2658c66234 100644
--- a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference.yml
+++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference.yml
@@ -1005,3 +1005,174 @@ setup:
   - match: { hits.hits.0._source.dense_field: "another inference test" }
   - match: { hits.hits.0._source.non_inference_field: "non inference test" }
   - exists: hits.hits.0._source._inference_fields
+
+---
+"Empty semantic_text field skips embedding generation":
+  - requires:
+      cluster_features: "semantic_text.handle_empty_input"
+      reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
+
+  - do:
+      index:
+        index: test-index
+        id: doc_1
+        body:
+          sparse_field: ""
+        refresh: true
+
+  - do:
+      index:
+        index: test-index
+        id: doc_2
+        body:
+          sparse_field: " "
+        refresh: true
+
+  - do:
+      search:
+        index: test-index
+        body:
+          fields: [ _inference_fields ]
+          query:
+            match_all: { }
+
+  - match: { hits.total.value: 2 }
+  - match: { hits.hits.0._source.sparse_field: "" }
+  - match: { hits.hits.1._source.sparse_field: " " }
+  - not_exists: hits.hits.0._source._inference_fields
+  - not_exists: hits.hits.1._source._inference_fields
+
+---
+"Reindexing with empty or whitespace semantic_text skips embedding generation":
+  - requires:
+      cluster_features: "semantic_text.handle_empty_input"
+      reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
+
+  - do:
+      index:
+        index: test-index
+        id: doc_1
+        body:
+          sparse_field: " "
+        refresh: true
+
+  - do:
+      indices.create:
+        index: destination-index
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              sparse_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+
+  - do:
+      reindex:
+        wait_for_completion: true
+        body:
+          source:
+            index: test-index
+          dest:
+            index: destination-index
+        refresh: true
+
+  - do:
+      get:
+        index: destination-index
+        id: doc_1
+
+  - match: { _source.sparse_field: " " }
+
+  - do:
+      search:
+        index: destination-index
+        body:
+          fields: [ _inference_fields ]
+          query:
+            match_all: { }
+
+  - not_exists: hits.hits.0._source._inference_fields
+
+---
+"Empty Multi-Field skips embedding generation":
+  - requires:
+      cluster_features: "semantic_text.handle_empty_input"
+      reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
+
+  - do:
+      indices.create:
+        index: test-multi-index
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+                fields:
+                  sparse:
+                    type: semantic_text
+                    inference_id: sparse-inference-id
+
+  - do:
+      bulk:
+        index: test-multi-index
+        refresh: true
+        body: |
+          {"index":{"_id": "1"}}
+          {"field": ["you know, for testing", "now with chunks"]}
+          {"index":{"_id": "2"}}
+          {"field": ["", " "]}
+
+  - do:
+      search:
+        index: test-multi-index
+        body:
+          fields: [ _inference_fields ]
+          query:
+            match_all: { }
+
+  - exists: hits.hits.0._source._inference_fields
+  - not_exists: hits.hits.1._source._inference_fields
+
+---
+"Multi chunks skips empty input embedding generation":
+  - requires:
+      cluster_features: "semantic_text.handle_empty_input"
+      reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
+
+  - do:
+      index:
+        index: test-index
+        id: doc_1
+        body:
+          sparse_field: ["some test data", " ", "now with chunks"]
+        refresh: true
+
+  - do:
+      search:
+        index: test-index
+        body:
+          fields: [ _inference_fields ]
+          query:
+            match_all: { }
+
+  - match: { hits.total.value: 1 }
+
+  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 }
+  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 2 }
+  - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 }
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 }
+  - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.embeddings
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.start_offset: 20 }
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.end_offset: 35 }
diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference_bwc.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference_bwc.yml
index 5f87942b2c710..75df28148c148 100644
--- a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference_bwc.yml
+++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference_bwc.yml
@@ -675,3 +675,67 @@ setup:
 
   - match: { hits.total.value: 1 }
   - not_exists: hits.hits.0._source._inference_fields
+
+---
+"Empty semantic_text field skips embedding generation":
+  - requires:
+      cluster_features: "semantic_text.handle_empty_input"
+      reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
+
+  - do:
+      index:
+        index: test-index
+        id: doc_1
+        body:
+          sparse_field: ""
+        refresh: true
+
+  - do:
+      index:
+        index: test-index
+        id: doc_2
+        body:
+          sparse_field: " "
+        refresh: true
+
+  - do:
+      search:
+        index: test-index
+        body:
+          query:
+            match_all: { }
+
+  - match: { hits.total.value: 2 }
+  - match: { hits.hits.0._source.sparse_field.text: "" }
+  - length: { hits.hits.0._source.sparse_field.inference.chunks: 0 }
+  - match: { hits.hits.1._source.sparse_field.text: " " }
+  - length: { hits.hits.1._source.sparse_field.inference.chunks: 0 }
+
+---
+"Multi chunks skips empty input embedding generation":
+  - requires:
+      cluster_features: "semantic_text.handle_empty_input"
+      reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
+
+  - do:
+      index:
+        index: test-index
+        id: doc_1
+        body:
+          sparse_field: ["some test data", " ", "now with chunks"]
+        refresh: true
+
+  - do:
+      search:
+        index: test-index
+        body:
+          query:
+            match_all: { }
+
+  - match: { hits.total.value: 1 }
+
+  - length: { hits.hits.0._source.sparse_field.inference.chunks: 2 }
+  - match: { hits.hits.0._source.sparse_field.inference.chunks.0.text: "some test data" }
+  - exists: hits.hits.0._source.sparse_field.inference.chunks.0.embeddings
+  - match: { hits.hits.0._source.sparse_field.inference.chunks.1.text: "now with chunks" }
+  - exists: hits.hits.0._source.sparse_field.inference.chunks.1.embeddings
diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml
index 7765795ebfbdc..f1a33c262f9d7 100644
--- a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml
+++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml
@@ -291,3 +291,48 @@ setup:
 
   - match: { hits.hits.0._id: "doc_1" }
   - not_exists: hits.hits.0.highlight.title
+---
+"Highlighting and multi chunks with empty input":
+  - requires:
+      cluster_features: "semantic_text.handle_empty_input"
+      reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
+
+  - do:
+      indices.create:
+        index: test-multi-chunk-index
+        body:
+          settings:
+            index.mapping.semantic_text.use_legacy_format: false
+          mappings:
+            properties:
+              semantic_text_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+
+  - do:
+      index:
+        index: test-multi-chunk-index
+        id: doc_1
+        body:
+          semantic_text_field: ["some test data", " ", "now with chunks"]
+        refresh: true
+
+  - do:
+      search:
+        index: test-multi-chunk-index
+        body:
+          query:
+            semantic:
+              field: "semantic_text_field"
+              query: "test"
+          highlight:
+            fields:
+              semantic_text_field:
+                type: "semantic"
+                number_of_fragments: 3
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.semantic_text_field: 2 }
+  - match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
+  - match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }
diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter_bwc.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter_bwc.yml
index 3b2c8ba4f8082..52fbf4d2723ac 100644
--- a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter_bwc.yml
+++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter_bwc.yml
@@ -243,4 +243,48 @@
   - match: { hits.hits.0.highlight.body.0: "You Know, for Search!" }
   - match: { hits.hits.0.highlight.body.1: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
 
+---
+"Highlighting and multi chunks with empty input":
+  - requires:
+      cluster_features: "semantic_text.handle_empty_input"
+      reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
+  - do:
+      indices.create:
+        index: test-multi-chunk-index
+        body:
+          settings:
+            index.mapping.semantic_text.use_legacy_format: true
+          mappings:
+            properties:
+              semantic_text_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+
+  - do:
+      index:
+        index: test-multi-chunk-index
+        id: doc_1
+        body:
+          semantic_text_field: ["some test data", " ", "now with chunks"]
+        refresh: true
+
+  - do:
+      search:
+        index: test-multi-chunk-index
+        body:
+          query:
+            semantic:
+              field: "semantic_text_field"
+              query: "test"
+          highlight:
+            fields:
+              semantic_text_field:
+                type: "semantic"
+                number_of_fragments: 3
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.semantic_text_field: 2 }
+  - match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
+  - match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }