Skip to content

Commit f0d5220

Browse files
Samiul-TheSoccerFan, elasticmachine, and elasticsearchmachine
authored
Handle empty input inference (#123763)
* Added check for blank string to skip generating embeddings with unit test * Adding yaml tests for skipping embedding generation * dynamic update not required if model_settings stays null * Updating node feature for handling empty input name and description * Update yaml tests with refresh=true * Update unit test to follow more accurate behavior * Added yaml tests for multi chunks * [CI] Auto commit changes from spotless * Adding highlighter yaml tests for empty input * Update docs/changelog/123763.yaml * Update changelog and test reason to have more polished documentation * adding input value into the response source and fixing unit tests by reformatting * Adding highlighter test for backward compatibility and refactor existing test * Added bwc tests for empty input and multi chunks * Removed reindex for empty input from bwc * [CI] Auto commit changes from spotless * Fixing yaml test * Update unit tests helper function to support both formats * [CI] Auto commit changes from spotless * Adding cluster features for bwc * Centralize logic for assertInference helper --------- Co-authored-by: Elastic Machine <[email protected]> Co-authored-by: elasticsearchmachine <[email protected]>
1 parent 5752c71 commit f0d5220

File tree

9 files changed

+403
-12
lines changed

9 files changed

+403
-12
lines changed

docs/changelog/123763.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 123763
2+
summary: Skip semantic_text embedding generation when no content is provided.
3+
area: Relevance
4+
type: enhancement
5+
issues: []

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@ public Set<NodeFeature> getTestFeatures() {
5050
SEMANTIC_TEXT_HIGHLIGHTER_DEFAULT,
5151
SEMANTIC_KNN_FILTER_FIX,
5252
TEST_RERANKING_SERVICE_PARSE_TEXT_AS_SCORE,
53-
SemanticTextFieldMapper.SEMANTIC_TEXT_BIT_VECTOR_SUPPORT
53+
SemanticTextFieldMapper.SEMANTIC_TEXT_BIT_VECTOR_SUPPORT,
54+
SemanticTextFieldMapper.SEMANTIC_TEXT_HANDLE_EMPTY_INPUT
5455
);
5556
}
5657
}

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -563,7 +563,7 @@ private Map<String, List<FieldInferenceRequest>> createFieldInferenceRequests(Bu
563563
}
564564
continue;
565565
}
566-
ensureResponseAccumulatorSlot(itemIndex);
566+
var slot = ensureResponseAccumulatorSlot(itemIndex);
567567
final List<String> values;
568568
try {
569569
values = SemanticTextUtils.nodeStringValues(field, valueObj);
@@ -580,7 +580,13 @@ private Map<String, List<FieldInferenceRequest>> createFieldInferenceRequests(Bu
580580
List<FieldInferenceRequest> fieldRequests = fieldRequestsMap.computeIfAbsent(inferenceId, k -> new ArrayList<>());
581581
int offsetAdjustment = 0;
582582
for (String v : values) {
583-
fieldRequests.add(new FieldInferenceRequest(itemIndex, field, sourceField, v, order++, offsetAdjustment));
583+
if (v.isBlank()) {
584+
slot.addOrUpdateResponse(
585+
new FieldInferenceResponse(field, sourceField, v, order++, 0, null, EMPTY_CHUNKED_INFERENCE)
586+
);
587+
} else {
588+
fieldRequests.add(new FieldInferenceRequest(itemIndex, field, sourceField, v, order++, offsetAdjustment));
589+
}
584590

585591
// When using the inference metadata fields format, all the input values are concatenated so that the
586592
// chunk text offsets are expressed in the context of a single string. Calculate the offset adjustment

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
117117
public static final NodeFeature SEMANTIC_TEXT_ALWAYS_EMIT_INFERENCE_ID_FIX = new NodeFeature(
118118
"semantic_text.always_emit_inference_id_fix"
119119
);
120+
public static final NodeFeature SEMANTIC_TEXT_HANDLE_EMPTY_INPUT = new NodeFeature("semantic_text.handle_empty_input");
120121
public static final NodeFeature SEMANTIC_TEXT_SKIP_INFERENCE_FIELDS = new NodeFeature("semantic_text.skip_inference_fields");
121122
public static final NodeFeature SEMANTIC_TEXT_BIT_VECTOR_SUPPORT = new NodeFeature("semantic_text.bit_vector_support");
122123

@@ -403,7 +404,7 @@ void parseCreateFieldFromContext(DocumentParserContext context, SemanticTextFiel
403404
}
404405

405406
final SemanticTextFieldMapper mapper;
406-
if (fieldType().getModelSettings() == null) {
407+
if (fieldType().getModelSettings() == null && field.inference().modelSettings() != null) {
407408
mapper = addDynamicUpdate(context, field);
408409
} else {
409410
Conflicts conflicts = new Conflicts(fullFieldName);

x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java

Lines changed: 62 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,7 @@ public void testExplicitNull() throws Exception {
335335
// item 3
336336
assertNull(bulkShardRequest.items()[3].getPrimaryResponse());
337337
actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[3].request());
338-
assertInferenceResults(useLegacyFormat, actualRequest, "obj.field1", EXPLICIT_NULL, 0);
338+
assertInferenceResults(useLegacyFormat, actualRequest, "obj.field1", EXPLICIT_NULL, null);
339339

340340
// item 4
341341
assertNull(bulkShardRequest.items()[4].getPrimaryResponse());
@@ -368,6 +368,59 @@ public void testExplicitNull() throws Exception {
368368
awaitLatch(chainExecuted, 10, TimeUnit.SECONDS);
369369
}
370370

371+
/**
 * Sends a bulk shard request whose three items carry an empty or whitespace-only value for
 * {@code semantic_text_field} through the filter, and asserts inside the filter chain that each
 * item has no primary response (i.e. no failure was recorded) and that its source reports the
 * original value together with an expected chunk count of {@code 0}.
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
372+
public void testHandleEmptyInput() throws Exception {
373+
StaticModel model = StaticModel.createRandomInstance();
374+
ShardBulkInferenceActionFilter filter = createFilter(
375+
threadPool,
376+
Map.of(model.getInferenceEntityId(), model),
377+
randomIntBetween(1, 10),
378+
useLegacyFormat,
379+
true
380+
);
381+
382+
// Counted down once the chain below has run, so the test thread can wait for the assertions to finish.
CountDownLatch chainExecuted = new CountDownLatch(1);
383+
ActionFilterChain actionFilterChain = (task, action, request, listener) -> {
384+
try {
385+
BulkShardRequest bulkShardRequest = (BulkShardRequest) request;
386+
// The inference field map set on the request below is expected to be null by the time the chain sees it.
assertNull(bulkShardRequest.getInferenceFieldMap());
387+
assertThat(bulkShardRequest.items().length, equalTo(3));
388+
389+
// Create with empty string
390+
assertNull(bulkShardRequest.items()[0].getPrimaryResponse());
391+
IndexRequest actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[0].request());
392+
assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", "", 0);
393+
394+
// Create with whitespace only
395+
assertNull(bulkShardRequest.items()[1].getPrimaryResponse());
396+
actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[1].request());
397+
assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", " ", 0);
398+
399+
// Update with multiple whitespaces
400+
assertNull(bulkShardRequest.items()[2].getPrimaryResponse());
401+
actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[2].request());
402+
assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", " ", 0);
403+
} finally {
404+
// Always release the latch, even when an assertion above fails, so the test does not wait for the timeout.
chainExecuted.countDown();
405+
}
406+
};
407+
ActionListener actionListener = mock(ActionListener.class);
408+
Task task = mock(Task.class);
409+
// Map the single semantic text field to the model's inference endpoint, with itself as its only source field.
Map<String, InferenceFieldMetadata> inferenceFieldMap = Map.of(
410+
"semantic_text_field",
411+
new InferenceFieldMetadata("semantic_text_field", model.getInferenceEntityId(), new String[] { "semantic_text_field" })
412+
);
413+
414+
// Two index requests and one update request, each with a blank value for the semantic text field.
BulkItemRequest[] items = new BulkItemRequest[3];
415+
items[0] = new BulkItemRequest(0, new IndexRequest("index").source(Map.of("semantic_text_field", "")));
416+
items[1] = new BulkItemRequest(1, new IndexRequest("index").source(Map.of("semantic_text_field", " ")));
417+
items[2] = new BulkItemRequest(2, new UpdateRequest().doc(new IndexRequest("index").source(Map.of("semantic_text_field", " "))));
418+
BulkShardRequest request = new BulkShardRequest(new ShardId("test", "test", 0), WriteRequest.RefreshPolicy.NONE, items);
419+
request.setInferenceFieldMap(inferenceFieldMap);
420+
filter.apply(task, TransportShardBulkAction.ACTION_NAME, request, actionListener, actionFilterChain);
421+
awaitLatch(chainExecuted, 10, TimeUnit.SECONDS);
422+
}
423+
371424
@SuppressWarnings({ "unchecked", "rawtypes" })
372425
public void testManyRandomDocs() throws Exception {
373426
Map<String, StaticModel> inferenceModelMap = new HashMap<>();
@@ -591,7 +644,7 @@ private static void assertInferenceResults(
591644
IndexRequest request,
592645
String fieldName,
593646
Object expectedOriginalValue,
594-
int expectedChunkCount
647+
Integer expectedChunkCount
595648
) {
596649
final Map<String, Object> requestMap = request.sourceAsMap();
597650
if (useLegacyFormat) {
@@ -601,13 +654,11 @@ private static void assertInferenceResults(
601654
);
602655

603656
List<Object> chunks = (List<Object>) XContentMapValues.extractValue(getChunksFieldName(fieldName), requestMap);
604-
if (expectedChunkCount > 0) {
657+
if (expectedChunkCount == null) {
658+
assertNull(chunks);
659+
} else {
605660
assertNotNull(chunks);
606661
assertThat(chunks.size(), equalTo(expectedChunkCount));
607-
} else {
608-
// If the expected chunk count is 0, we expect that no inference has been performed. In this case, the source should not be
609-
// transformed, and thus the semantic text field structure should not be created.
610-
assertNull(chunks);
611662
}
612663
} else {
613664
assertThat(XContentMapValues.extractValue(fieldName, requestMap, EXPLICIT_NULL), equalTo(expectedOriginalValue));
@@ -627,8 +678,11 @@ private static void assertInferenceResults(
627678
inferenceMetadataFields,
628679
EXPLICIT_NULL
629680
);
681+
682+
// When using the new format, the chunks field should always exist
683+
int expectedSize = expectedChunkCount == null ? 0 : expectedChunkCount;
630684
assertNotNull(chunks);
631-
assertThat(chunks.size(), equalTo(expectedChunkCount));
685+
assertThat(chunks.size(), equalTo(expectedSize));
632686
}
633687
}
634688

x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference.yml

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1005,3 +1005,174 @@ setup:
10051005
- match: { hits.hits.0._source.dense_field: "another inference test" }
10061006
- match: { hits.hits.0._source.non_inference_field: "non inference test" }
10071007
- exists: hits.hits.0._source._inference_fields
1008+
1009+
---
1010+
"Empty semantic_text field skips embedding generation":
1011+
- requires:
1012+
cluster_features: "semantic_text.handle_empty_input"
1013+
reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
1014+
1015+
- do:
1016+
index:
1017+
index: test-index
1018+
id: doc_1
1019+
body:
1020+
sparse_field: ""
1021+
refresh: true
1022+
1023+
- do:
1024+
index:
1025+
index: test-index
1026+
id: doc_2
1027+
body:
1028+
sparse_field: " "
1029+
refresh: true
1030+
1031+
- do:
1032+
search:
1033+
index: test-index
1034+
body:
1035+
fields: [ _inference_fields ]
1036+
query:
1037+
match_all: { }
1038+
1039+
- match: { hits.total.value: 2 }
1040+
- match: { hits.hits.0._source.sparse_field: "" }
1041+
- match: { hits.hits.1._source.sparse_field: " " }
1042+
- not_exists: hits.hits.0._source._inference_fields
1043+
- not_exists: hits.hits.1._source._inference_fields
1044+
1045+
---
1046+
"Reindexing with empty or whitespace semantic_text skips embedding generation":
1047+
- requires:
1048+
cluster_features: "semantic_text.handle_empty_input"
1049+
reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
1050+
1051+
- do:
1052+
index:
1053+
index: test-index
1054+
id: doc_1
1055+
body:
1056+
sparse_field: " "
1057+
refresh: true
1058+
1059+
- do:
1060+
indices.create:
1061+
index: destination-index
1062+
body:
1063+
settings:
1064+
index:
1065+
mapping:
1066+
semantic_text:
1067+
use_legacy_format: false
1068+
mappings:
1069+
properties:
1070+
sparse_field:
1071+
type: semantic_text
1072+
inference_id: sparse-inference-id
1073+
1074+
- do:
1075+
reindex:
1076+
wait_for_completion: true
1077+
body:
1078+
source:
1079+
index: test-index
1080+
dest:
1081+
index: destination-index
1082+
refresh: true
1083+
1084+
- do:
1085+
get:
1086+
index: destination-index
1087+
id: doc_1
1088+
1089+
- match: { _source.sparse_field: " " }
1090+
1091+
- do:
1092+
search:
1093+
index: destination-index
1094+
body:
1095+
fields: [ _inference_fields ]
1096+
query:
1097+
match_all: { }
1098+
1099+
- not_exists: hits.hits.0._source._inference_fields
1100+
1101+
---
1102+
"Empty Multi-Field skips embedding generation":
1103+
- requires:
1104+
cluster_features: "semantic_text.handle_empty_input"
1105+
reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
1106+
1107+
- do:
1108+
indices.create:
1109+
index: test-multi-index
1110+
body:
1111+
settings:
1112+
index:
1113+
mapping:
1114+
semantic_text:
1115+
use_legacy_format: false
1116+
mappings:
1117+
properties:
1118+
field:
1119+
type: semantic_text
1120+
inference_id: sparse-inference-id
1121+
fields:
1122+
sparse:
1123+
type: semantic_text
1124+
inference_id: sparse-inference-id
1125+
1126+
- do:
1127+
bulk:
1128+
index: test-multi-index
1129+
refresh: true
1130+
body: |
1131+
{"index":{"_id": "1"}}
1132+
{"field": ["you know, for testing", "now with chunks"]}
1133+
{"index":{"_id": "2"}}
1134+
{"field": ["", " "]}
1135+
1136+
- do:
1137+
search:
1138+
index: test-multi-index
1139+
body:
1140+
fields: [ _inference_fields ]
1141+
query:
1142+
match_all: { }
1143+
1144+
- exists: hits.hits.0._source._inference_fields
1145+
- not_exists: hits.hits.1._source._inference_fields
1146+
1147+
---
1148+
"Multi chunks skips empty input embedding generation":
1149+
- requires:
1150+
cluster_features: "semantic_text.handle_empty_input"
1151+
reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
1152+
1153+
- do:
1154+
index:
1155+
index: test-index
1156+
id: doc_1
1157+
body:
1158+
sparse_field: ["some test data", " ", "now with chunks"]
1159+
refresh: true
1160+
1161+
- do:
1162+
search:
1163+
index: test-index
1164+
body:
1165+
fields: [ _inference_fields ]
1166+
query:
1167+
match_all: { }
1168+
1169+
- match: { hits.total.value: 1 }
1170+
1171+
- length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 }
1172+
- length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 2 }
1173+
- exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings
1174+
- match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 }
1175+
- match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 }
1176+
- exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.embeddings
1177+
- match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.start_offset: 20 }
1178+
- match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.end_offset: 35 }

0 commit comments

Comments
 (0)