Skip to content

Commit 2b20ac8

Browse files
Samiul-TheSoccerFan, elasticmachine, elasticsearchmachine
committed
Handle empty input inference (elastic#123763)
* Added check for blank string to skip generating embeddings with unit test * Adding yaml tests for skipping embedding generation * dynamic update not required if model_settings stays null * Updating node feature for handling empty input name and description * Update yaml tests with refresh=true * Update unit test to follow more accurate behavior * Added yaml tests for multu chunks * [CI] Auto commit changes from spotless * Adding highlighter yaml tests for empty input * Update docs/changelog/123763.yaml * Update changelog and test reason to have more polished documentation * adding input value into the response source and fixing unit tests by reformating * Adding highligher test for backward compatibility and refactor existing test * Added bwc tests for empty input and multi chunks * Removed reindex for empty input from bwc * [CI] Auto commit changes from spotless * Fixing yaml test * Update unit tests helper function to support both format * [CI] Auto commit changes from spotless * Adding cluster features for bwc * Centralize logic for assertInference helper --------- Co-authored-by: Elastic Machine <[email protected]> Co-authored-by: elasticsearchmachine <[email protected]>
1 parent a4b4ba1 commit 2b20ac8

File tree

9 files changed

+403
-12
lines changed

9 files changed

+403
-12
lines changed

docs/changelog/123763.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 123763
2+
summary: Skip semantic_text embedding generation when no content is provided.
3+
area: Relevance
4+
type: enhancement
5+
issues: []

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@ public Set<NodeFeature> getTestFeatures() {
6262
SEMANTIC_TEXT_HIGHLIGHTER_DEFAULT,
6363
SEMANTIC_KNN_FILTER_FIX,
6464
TEST_RERANKING_SERVICE_PARSE_TEXT_AS_SCORE,
65-
SemanticTextFieldMapper.SEMANTIC_TEXT_BIT_VECTOR_SUPPORT
65+
SemanticTextFieldMapper.SEMANTIC_TEXT_BIT_VECTOR_SUPPORT,
66+
SemanticTextFieldMapper.SEMANTIC_TEXT_HANDLE_EMPTY_INPUT
6667
);
6768
}
6869
}

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -561,7 +561,7 @@ private Map<String, List<FieldInferenceRequest>> createFieldInferenceRequests(Bu
561561
}
562562
continue;
563563
}
564-
ensureResponseAccumulatorSlot(itemIndex);
564+
var slot = ensureResponseAccumulatorSlot(itemIndex);
565565
final List<String> values;
566566
try {
567567
values = SemanticTextUtils.nodeStringValues(field, valueObj);
@@ -578,7 +578,13 @@ private Map<String, List<FieldInferenceRequest>> createFieldInferenceRequests(Bu
578578
List<FieldInferenceRequest> fieldRequests = fieldRequestsMap.computeIfAbsent(inferenceId, k -> new ArrayList<>());
579579
int offsetAdjustment = 0;
580580
for (String v : values) {
581-
fieldRequests.add(new FieldInferenceRequest(itemIndex, field, sourceField, v, order++, offsetAdjustment));
581+
if (v.isBlank()) {
582+
slot.addOrUpdateResponse(
583+
new FieldInferenceResponse(field, sourceField, v, order++, 0, null, EMPTY_CHUNKED_INFERENCE)
584+
);
585+
} else {
586+
fieldRequests.add(new FieldInferenceRequest(itemIndex, field, sourceField, v, order++, offsetAdjustment));
587+
}
582588

583589
// When using the inference metadata fields format, all the input values are concatenated so that the
584590
// chunk text offsets are expressed in the context of a single string. Calculate the offset adjustment

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
119119
public static final NodeFeature SEMANTIC_TEXT_ALWAYS_EMIT_INFERENCE_ID_FIX = new NodeFeature(
120120
"semantic_text.always_emit_inference_id_fix"
121121
);
122+
public static final NodeFeature SEMANTIC_TEXT_HANDLE_EMPTY_INPUT = new NodeFeature("semantic_text.handle_empty_input");
122123
public static final NodeFeature SEMANTIC_TEXT_SKIP_INFERENCE_FIELDS = new NodeFeature("semantic_text.skip_inference_fields");
123124
public static final NodeFeature SEMANTIC_TEXT_BIT_VECTOR_SUPPORT = new NodeFeature("semantic_text.bit_vector_support");
124125

@@ -405,7 +406,7 @@ void parseCreateFieldFromContext(DocumentParserContext context, SemanticTextFiel
405406
}
406407

407408
final SemanticTextFieldMapper mapper;
408-
if (fieldType().getModelSettings() == null) {
409+
if (fieldType().getModelSettings() == null && field.inference().modelSettings() != null) {
409410
mapper = addDynamicUpdate(context, field);
410411
} else {
411412
Conflicts conflicts = new Conflicts(fullFieldName);

x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java

Lines changed: 62 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,7 @@ public void testExplicitNull() throws Exception {
332332
// item 3
333333
assertNull(bulkShardRequest.items()[3].getPrimaryResponse());
334334
actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[3].request());
335-
assertInferenceResults(useLegacyFormat, actualRequest, "obj.field1", EXPLICIT_NULL, 0);
335+
assertInferenceResults(useLegacyFormat, actualRequest, "obj.field1", EXPLICIT_NULL, null);
336336

337337
// item 4
338338
assertNull(bulkShardRequest.items()[4].getPrimaryResponse());
@@ -365,6 +365,59 @@ public void testExplicitNull() throws Exception {
365365
awaitLatch(chainExecuted, 10, TimeUnit.SECONDS);
366366
}
367367

368+
@SuppressWarnings({ "unchecked", "rawtypes" })
369+
public void testHandleEmptyInput() throws Exception {
370+
StaticModel model = StaticModel.createRandomInstance();
371+
ShardBulkInferenceActionFilter filter = createFilter(
372+
threadPool,
373+
Map.of(model.getInferenceEntityId(), model),
374+
randomIntBetween(1, 10),
375+
useLegacyFormat,
376+
true
377+
);
378+
379+
CountDownLatch chainExecuted = new CountDownLatch(1);
380+
ActionFilterChain actionFilterChain = (task, action, request, listener) -> {
381+
try {
382+
BulkShardRequest bulkShardRequest = (BulkShardRequest) request;
383+
assertNull(bulkShardRequest.getInferenceFieldMap());
384+
assertThat(bulkShardRequest.items().length, equalTo(3));
385+
386+
// Create with Empty string
387+
assertNull(bulkShardRequest.items()[0].getPrimaryResponse());
388+
IndexRequest actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[0].request());
389+
assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", "", 0);
390+
391+
// Create with whitespace only
392+
assertNull(bulkShardRequest.items()[1].getPrimaryResponse());
393+
actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[1].request());
394+
assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", " ", 0);
395+
396+
// Update with multiple Whitespaces
397+
assertNull(bulkShardRequest.items()[2].getPrimaryResponse());
398+
actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[2].request());
399+
assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", " ", 0);
400+
} finally {
401+
chainExecuted.countDown();
402+
}
403+
};
404+
ActionListener actionListener = mock(ActionListener.class);
405+
Task task = mock(Task.class);
406+
Map<String, InferenceFieldMetadata> inferenceFieldMap = Map.of(
407+
"semantic_text_field",
408+
new InferenceFieldMetadata("semantic_text_field", model.getInferenceEntityId(), new String[] { "semantic_text_field" })
409+
);
410+
411+
BulkItemRequest[] items = new BulkItemRequest[3];
412+
items[0] = new BulkItemRequest(0, new IndexRequest("index").source(Map.of("semantic_text_field", "")));
413+
items[1] = new BulkItemRequest(1, new IndexRequest("index").source(Map.of("semantic_text_field", " ")));
414+
items[2] = new BulkItemRequest(2, new UpdateRequest().doc(new IndexRequest("index").source(Map.of("semantic_text_field", " "))));
415+
BulkShardRequest request = new BulkShardRequest(new ShardId("test", "test", 0), WriteRequest.RefreshPolicy.NONE, items);
416+
request.setInferenceFieldMap(inferenceFieldMap);
417+
filter.apply(task, TransportShardBulkAction.ACTION_NAME, request, actionListener, actionFilterChain);
418+
awaitLatch(chainExecuted, 10, TimeUnit.SECONDS);
419+
}
420+
368421
@SuppressWarnings({ "unchecked", "rawtypes" })
369422
public void testManyRandomDocs() throws Exception {
370423
Map<String, StaticModel> inferenceModelMap = new HashMap<>();
@@ -585,7 +638,7 @@ private static void assertInferenceResults(
585638
IndexRequest request,
586639
String fieldName,
587640
Object expectedOriginalValue,
588-
int expectedChunkCount
641+
Integer expectedChunkCount
589642
) {
590643
final Map<String, Object> requestMap = request.sourceAsMap();
591644
if (useLegacyFormat) {
@@ -595,13 +648,11 @@ private static void assertInferenceResults(
595648
);
596649

597650
List<Object> chunks = (List<Object>) XContentMapValues.extractValue(getChunksFieldName(fieldName), requestMap);
598-
if (expectedChunkCount > 0) {
651+
if (expectedChunkCount == null) {
652+
assertNull(chunks);
653+
} else {
599654
assertNotNull(chunks);
600655
assertThat(chunks.size(), equalTo(expectedChunkCount));
601-
} else {
602-
// If the expected chunk count is 0, we expect that no inference has been performed. In this case, the source should not be
603-
// transformed, and thus the semantic text field structure should not be created.
604-
assertNull(chunks);
605656
}
606657
} else {
607658
assertThat(XContentMapValues.extractValue(fieldName, requestMap, EXPLICIT_NULL), equalTo(expectedOriginalValue));
@@ -621,8 +672,11 @@ private static void assertInferenceResults(
621672
inferenceMetadataFields,
622673
EXPLICIT_NULL
623674
);
675+
676+
// When using the new format, the chunks field should always exist
677+
int expectedSize = expectedChunkCount == null ? 0 : expectedChunkCount;
624678
assertNotNull(chunks);
625-
assertThat(chunks.size(), equalTo(expectedChunkCount));
679+
assertThat(chunks.size(), equalTo(expectedSize));
626680
}
627681
}
628682

x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference.yml

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1005,3 +1005,174 @@ setup:
10051005
- match: { hits.hits.0._source.dense_field: "another inference test" }
10061006
- match: { hits.hits.0._source.non_inference_field: "non inference test" }
10071007
- exists: hits.hits.0._source._inference_fields
1008+
1009+
---
1010+
"Empty semantic_text field skips embedding generation":
1011+
- requires:
1012+
cluster_features: "semantic_text.handle_empty_input"
1013+
reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
1014+
1015+
- do:
1016+
index:
1017+
index: test-index
1018+
id: doc_1
1019+
body:
1020+
sparse_field: ""
1021+
refresh: true
1022+
1023+
- do:
1024+
index:
1025+
index: test-index
1026+
id: doc_2
1027+
body:
1028+
sparse_field: " "
1029+
refresh: true
1030+
1031+
- do:
1032+
search:
1033+
index: test-index
1034+
body:
1035+
fields: [ _inference_fields ]
1036+
query:
1037+
match_all: { }
1038+
1039+
- match: { hits.total.value: 2 }
1040+
- match: { hits.hits.0._source.sparse_field: "" }
1041+
- match: { hits.hits.1._source.sparse_field: " " }
1042+
- not_exists: hits.hits.0._source._inference_fields
1043+
- not_exists: hits.hits.1._source._inference_fields
1044+
1045+
---
1046+
"Reindexing with empty or whitespace semantic_text skips embedding generation":
1047+
- requires:
1048+
cluster_features: "semantic_text.handle_empty_input"
1049+
reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
1050+
1051+
- do:
1052+
index:
1053+
index: test-index
1054+
id: doc_1
1055+
body:
1056+
sparse_field: " "
1057+
refresh: true
1058+
1059+
- do:
1060+
indices.create:
1061+
index: destination-index
1062+
body:
1063+
settings:
1064+
index:
1065+
mapping:
1066+
semantic_text:
1067+
use_legacy_format: false
1068+
mappings:
1069+
properties:
1070+
sparse_field:
1071+
type: semantic_text
1072+
inference_id: sparse-inference-id
1073+
1074+
- do:
1075+
reindex:
1076+
wait_for_completion: true
1077+
body:
1078+
source:
1079+
index: test-index
1080+
dest:
1081+
index: destination-index
1082+
refresh: true
1083+
1084+
- do:
1085+
get:
1086+
index: destination-index
1087+
id: doc_1
1088+
1089+
- match: { _source.sparse_field: " " }
1090+
1091+
- do:
1092+
search:
1093+
index: destination-index
1094+
body:
1095+
fields: [ _inference_fields ]
1096+
query:
1097+
match_all: { }
1098+
1099+
- not_exists: hits.hits.0._source._inference_fields
1100+
1101+
---
1102+
"Empty Multi-Field skips embedding generation":
1103+
- requires:
1104+
cluster_features: "semantic_text.handle_empty_input"
1105+
reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
1106+
1107+
- do:
1108+
indices.create:
1109+
index: test-multi-index
1110+
body:
1111+
settings:
1112+
index:
1113+
mapping:
1114+
semantic_text:
1115+
use_legacy_format: false
1116+
mappings:
1117+
properties:
1118+
field:
1119+
type: semantic_text
1120+
inference_id: sparse-inference-id
1121+
fields:
1122+
sparse:
1123+
type: semantic_text
1124+
inference_id: sparse-inference-id
1125+
1126+
- do:
1127+
bulk:
1128+
index: test-multi-index
1129+
refresh: true
1130+
body: |
1131+
{"index":{"_id": "1"}}
1132+
{"field": ["you know, for testing", "now with chunks"]}
1133+
{"index":{"_id": "2"}}
1134+
{"field": ["", " "]}
1135+
1136+
- do:
1137+
search:
1138+
index: test-multi-index
1139+
body:
1140+
fields: [ _inference_fields ]
1141+
query:
1142+
match_all: { }
1143+
1144+
- exists: hits.hits.0._source._inference_fields
1145+
- not_exists: hits.hits.1._source._inference_fields
1146+
1147+
---
1148+
"Multi chunks skips empty input embedding generation":
1149+
- requires:
1150+
cluster_features: "semantic_text.handle_empty_input"
1151+
reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
1152+
1153+
- do:
1154+
index:
1155+
index: test-index
1156+
id: doc_1
1157+
body:
1158+
sparse_field: ["some test data", " ", "now with chunks"]
1159+
refresh: true
1160+
1161+
- do:
1162+
search:
1163+
index: test-index
1164+
body:
1165+
fields: [ _inference_fields ]
1166+
query:
1167+
match_all: { }
1168+
1169+
- match: { hits.total.value: 1 }
1170+
1171+
- length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 }
1172+
- length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 2 }
1173+
- exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings
1174+
- match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 }
1175+
- match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 }
1176+
- exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.embeddings
1177+
- match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.start_offset: 20 }
1178+
- match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.end_offset: 35 }

0 commit comments

Comments
 (0)