Merged

Commits (25)
9d6a32e
Added check for blank string to skip generating embeddings with unit …
Samiul-TheSoccerFan Feb 28, 2025
16f0b5a
Adding yaml tests for skipping embedding generation
Samiul-TheSoccerFan Feb 28, 2025
96605bb
Merge branch 'main' into handle-empty-input-inference
elasticmachine Mar 3, 2025
6403aa0
dynamic update not required if model_settings stays null
Samiul-TheSoccerFan Mar 5, 2025
6e0d484
Updating node feature for handling empty input name and description
Samiul-TheSoccerFan Mar 5, 2025
aeaf117
Update yaml tests with refresh=true
Samiul-TheSoccerFan Mar 5, 2025
bb99b3b
Update unit test to follow more accurate behavior
Samiul-TheSoccerFan Mar 5, 2025
6509870
Added yaml tests for multu chunks
Samiul-TheSoccerFan Mar 5, 2025
3c4c3ed
[CI] Auto commit changes from spotless
Mar 5, 2025
f085df3
Merge branch 'main' into handle-empty-input-inference
elasticmachine Mar 5, 2025
f7d9359
Adding highlighter yaml tests for empty input
Samiul-TheSoccerFan Mar 5, 2025
285226a
Update docs/changelog/123763.yaml
Samiul-TheSoccerFan Mar 5, 2025
43406db
Update changelog and test reason to have more polished documentation
Samiul-TheSoccerFan Mar 6, 2025
78c5e12
adding input value into the response source and fixing unit tests by …
Samiul-TheSoccerFan Mar 6, 2025
33a533a
Adding highligher test for backward compatibility and refactor existi…
Samiul-TheSoccerFan Mar 6, 2025
cd15c9e
Added bwc tests for empty input and multi chunks
Samiul-TheSoccerFan Mar 7, 2025
2fb0092
Removed reindex for empty input from bwc
Samiul-TheSoccerFan Mar 7, 2025
1a275db
[CI] Auto commit changes from spotless
Mar 7, 2025
7486fe8
Merge branch 'main' into handle-empty-input-inference
elasticmachine Mar 7, 2025
6123d1a
Fixing yaml test
Samiul-TheSoccerFan Mar 7, 2025
d31d281
Update unit tests helper function to support both format
Samiul-TheSoccerFan Mar 7, 2025
78a390c
[CI] Auto commit changes from spotless
Mar 7, 2025
09a298a
Adding cluster features for bwc
Samiul-TheSoccerFan Mar 7, 2025
72886bf
Centralize logic for assertInference helper
Samiul-TheSoccerFan Mar 7, 2025
1179f84
resolve conflicts from main
Samiul-TheSoccerFan Mar 7, 2025
5 changes: 5 additions & 0 deletions docs/changelog/123763.yaml
@@ -0,0 +1,5 @@
pr: 123763
summary: Handle empty input inference
area: Relevance
type: enhancement
issues: []
@@ -49,7 +49,8 @@ public Set<NodeFeature> getTestFeatures() {
SemanticInferenceMetadataFieldsMapper.INFERENCE_METADATA_FIELDS_ENABLED_BY_DEFAULT,
SEMANTIC_TEXT_HIGHLIGHTER_DEFAULT,
SEMANTIC_KNN_FILTER_FIX,
- TEST_RERANKING_SERVICE_PARSE_TEXT_AS_SCORE
+ TEST_RERANKING_SERVICE_PARSE_TEXT_AS_SCORE,
+ SemanticTextFieldMapper.SEMANTIC_TEXT_HANDLE_EMPTY_INPUT
);
}
}
@@ -563,7 +563,7 @@ private Map<String, List<FieldInferenceRequest>> createFieldInferenceRequests(Bu
}
continue;
}
- ensureResponseAccumulatorSlot(itemIndex);
+ var slot = ensureResponseAccumulatorSlot(itemIndex);
final List<String> values;
try {
values = SemanticTextUtils.nodeStringValues(field, valueObj);
@@ -580,7 +580,13 @@ private Map<String, List<FieldInferenceRequest>> createFieldInferenceRequests(Bu
List<FieldInferenceRequest> fieldRequests = fieldRequestsMap.computeIfAbsent(inferenceId, k -> new ArrayList<>());
int offsetAdjustment = 0;
for (String v : values) {
- fieldRequests.add(new FieldInferenceRequest(itemIndex, field, sourceField, v, order++, offsetAdjustment));
+ if (v.isBlank()) {
+     slot.addOrUpdateResponse(
+         new FieldInferenceResponse(field, sourceField, null, order++, 0, null, EMPTY_CHUNKED_INFERENCE)
+     );
+ } else {
+     fieldRequests.add(new FieldInferenceRequest(itemIndex, field, sourceField, v, order++, offsetAdjustment));
+ }

// When using the inference metadata fields format, all the input values are concatenated so that the
// chunk text offsets are expressed in the context of a single string. Calculate the offset adjustment
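The skip hinges on String.isBlank(), which treats empty and whitespace-only values the same way. A minimal, standalone sketch of that decision in plain Java (illustrative values only; it does not use the filter's actual types):

```java
import java.util.List;

public class BlankInputCheck {
    public static void main(String[] args) {
        // Mirrors the inputs exercised by the tests: empty, whitespace-only, and real text.
        List<String> values = List.of("", "   ", "you know, for testing");
        for (String v : values) {
            if (v.isBlank()) {
                // In the filter, this branch records an empty chunked inference result
                // instead of queueing a request to the inference service.
                System.out.println("skip inference for: \"" + v + "\"");
            } else {
                System.out.println("queue inference request for: \"" + v + "\"");
            }
        }
    }
}
```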
@@ -117,6 +117,7 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
public static final NodeFeature SEMANTIC_TEXT_ALWAYS_EMIT_INFERENCE_ID_FIX = new NodeFeature(
"semantic_text.always_emit_inference_id_fix"
);
public static final NodeFeature SEMANTIC_TEXT_HANDLE_EMPTY_INPUT = new NodeFeature("semantic_text.handle_empty_input");
public static final NodeFeature SEMANTIC_TEXT_SKIP_INFERENCE_FIELDS = new NodeFeature("semantic_text.skip_inference_fields");

public static final String CONTENT_TYPE = "semantic_text";
@@ -402,7 +403,7 @@ void parseCreateFieldFromContext(DocumentParserContext context, SemanticTextFiel
}

final SemanticTextFieldMapper mapper;
- if (fieldType().getModelSettings() == null) {
+ if (fieldType().getModelSettings() == null && field.inference().modelSettings() != null) {
mapper = addDynamicUpdate(context, field);
} else {
Conflicts conflicts = new Conflicts(fullFieldName);
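The tightened condition means a dynamic mapping update is only attempted when the parsed field actually carries model settings; blank inputs yield an empty inference result with null model settings, so the mapping stays unchanged. A hedged simplification of that guard (method and parameter names are illustrative, not the mapper's real API):

```java
public final class DynamicUpdateGuard {
    private DynamicUpdateGuard() {}

    // Illustrative reduction of the condition added in SemanticTextFieldMapper:
    // update the mapping only when no model settings are mapped yet AND the
    // incoming inference result actually provides settings.
    static boolean needsDynamicUpdate(Object mappedModelSettings, Object incomingModelSettings) {
        return mappedModelSettings == null && incomingModelSettings != null;
    }

    public static void main(String[] args) {
        System.out.println(needsDynamicUpdate(null, null));                 // false: blank input, nothing to map
        System.out.println(needsDynamicUpdate(null, new Object()));         // true: first real embeddings arrive
        System.out.println(needsDynamicUpdate(new Object(), new Object())); // false: settings already mapped
    }
}
```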
@@ -366,6 +366,54 @@ public void testExplicitNull() throws Exception {
awaitLatch(chainExecuted, 10, TimeUnit.SECONDS);
}

@SuppressWarnings({ "unchecked", "rawtypes" })
public void testHandleEmptyInput() throws Exception {
StaticModel model = StaticModel.createRandomInstance();
ShardBulkInferenceActionFilter filter = createFilter(
threadPool,
Map.of(model.getInferenceEntityId(), model),
randomIntBetween(1, 10),
useLegacyFormat,
true
);

CountDownLatch chainExecuted = new CountDownLatch(1);
ActionFilterChain actionFilterChain = (task, action, request, listener) -> {
try {
BulkShardRequest bulkShardRequest = (BulkShardRequest) request;
IndexRequest actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[0].request());

// Create with Empty string
assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", useLegacyFormat ? EXPLICIT_NULL : "", 0);

// Create with whitespace only
actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[1].request());
assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", useLegacyFormat ? EXPLICIT_NULL : " ", 0);

// Update with multiple Whitespaces
actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[2].request());
assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", useLegacyFormat ? EXPLICIT_NULL : " ", 0);
} finally {
chainExecuted.countDown();
}
};
ActionListener actionListener = mock(ActionListener.class);
Task task = mock(Task.class);
Map<String, InferenceFieldMetadata> inferenceFieldMap = Map.of(
"semantic_text_field",
new InferenceFieldMetadata("semantic_text_field", model.getInferenceEntityId(), new String[] { "semantic_text_field" })
);

BulkItemRequest[] items = new BulkItemRequest[3];
items[0] = new BulkItemRequest(0, new IndexRequest("index").source(Map.of("semantic_text_field", "")));
items[1] = new BulkItemRequest(1, new IndexRequest("index").source(Map.of("semantic_text_field", " ")));
items[2] = new BulkItemRequest(2, new UpdateRequest().doc(new IndexRequest("index").source(Map.of("semantic_text_field", " "))));
BulkShardRequest request = new BulkShardRequest(new ShardId("test", "test", 0), WriteRequest.RefreshPolicy.NONE, items);
request.setInferenceFieldMap(inferenceFieldMap);
filter.apply(task, TransportShardBulkAction.ACTION_NAME, request, actionListener, actionFilterChain);
awaitLatch(chainExecuted, 10, TimeUnit.SECONDS);
}

@SuppressWarnings({ "unchecked", "rawtypes" })
public void testManyRandomDocs() throws Exception {
Map<String, StaticModel> inferenceModelMap = new HashMap<>();
@@ -603,9 +651,8 @@ private static void assertInferenceResults(
assertNotNull(chunks);
assertThat(chunks.size(), equalTo(expectedChunkCount));
} else {
- // If the expected chunk count is 0, we expect that no inference has been performed. In this case, the source should not be
- // transformed, and thus the semantic text field structure should not be created.
- assertNull(chunks);
+ // If the expected chunk count is 0, we expect that no inference has been performed.
+ assertTrue(chunks == null || chunks.isEmpty());
}
} else {
assertThat(XContentMapValues.extractValue(fieldName, requestMap, EXPLICIT_NULL), equalTo(expectedOriginalValue));
@@ -1005,3 +1005,190 @@ setup:
  - match: { hits.hits.0._source.dense_field: "another inference test" }
  - match: { hits.hits.0._source.non_inference_field: "non inference test" }
  - exists: hits.hits.0._source._inference_fields

---
"Empty semantic_text field skips embedding generation":
- requires:
cluster_features: "semantic_text.handle_empty_input"
reason: skips generating embeddings when semantic_text field is contains empty or whitespace only input
Review comment (Member): Nitpick: We usually put the reason as when the fix was introduced, e.g. 8.19.

Reply (Author): Do we only mention 8.19, or should we mention 9.1.0 as well? Proposed wording: "Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0." How about this one?

Reply (Member): That's perfect!

  - do:
      index:
        index: test-index
        id: doc_1
        body:
          sparse_field: ""
        refresh: true

  - do:
      search:
        index: test-index
        body:
          fields: [ _inference_fields ]
          query:
            match_all: { }

  - match: { hits.total.value: 1 }
  - match: { hits.hits.0._source.sparse_field: "" }
  - not_exists: hits.hits.0._source._inference_fields

---
"Whitespace-Only semantic_text field skips embedding generation":
- requires:
cluster_features: "semantic_text.handle_empty_input"
reason: skips generating embeddings when semantic_text field is contains empty or whitespace only input

  - do:
      index:
        index: test-index
        id: doc_1
        body:
          sparse_field: " "
        refresh: true

  - do:
      search:
        index: test-index
        body:
          fields: [ _inference_fields ]
          query:
            match_all: { }

  - match: { hits.total.value: 1 }
  - match: { hits.hits.0._source.sparse_field: " " }
  - not_exists: hits.hits.0._source._inference_fields

---
"Reindexing with empty or whitespace semantic_text skips embedding generation":
- requires:
cluster_features: "semantic_text.handle_empty_input"
reason: skips generating embeddings when semantic_text field is contains empty or whitespace only input

  - do:
      index:
        index: test-index
        id: doc_1
        body:
          sparse_field: " "
        refresh: true

  - do:
      indices.create:
        index: destination-index
        body:
          settings:
            index:
              mapping:
                semantic_text:
                  use_legacy_format: false
          mappings:
            properties:
              sparse_field:
                type: semantic_text
                inference_id: sparse-inference-id

  - do:
      reindex:
        wait_for_completion: true
        body:
          source:
            index: test-index
          dest:
            index: destination-index
        refresh: true

  - do:
      get:
        index: destination-index
        id: doc_1

  - match: { _source.sparse_field: " " }

  - do:
      search:
        index: destination-index
        body:
          fields: [ _inference_fields ]
          query:
            match_all: { }

  - not_exists: hits.hits.0._source._inference_fields

---
"Empty Multi-Field skips embedding generation":
- requires:
cluster_features: "semantic_text.handle_empty_input"
reason: skips generating embeddings when semantic_text field is contains empty or whitespace only input

  - do:
      indices.create:
        index: test-multi-index
        body:
          settings:
            index:
              mapping:
                semantic_text:
                  use_legacy_format: false
          mappings:
            properties:
              field:
                type: semantic_text
                inference_id: sparse-inference-id
                fields:
                  sparse:
                    type: semantic_text
                    inference_id: sparse-inference-id

  - do:
      bulk:
        index: test-multi-index
        refresh: true
        body: |
          {"index":{"_id": "1"}}
          {"field": ["you know, for testing", "now with chunks"]}
          {"index":{"_id": "2"}}
          {"field": ["", " "]}

  - do:
      search:
        index: test-multi-index
        body:
          fields: [ _inference_fields ]
          query:
            match_all: { }

  - exists: hits.hits.0._source._inference_fields
  - not_exists: hits.hits.1._source._inference_fields

---
"Multi chunks skips empty input embedding generation":
- requires:
cluster_features: "semantic_text.handle_empty_input"
reason: skips generating embeddings when semantic_text field is contains empty or whitespace only input

  - do:
      index:
        index: test-index
        id: doc_1
        body:
          sparse_field: ["some test data", " ", "now with chunks"]
        refresh: true

  - do:
      search:
        index: test-index
        body:
          fields: [ _inference_fields ]
          query:
            match_all: { }

  - match: { hits.total.value: 1 }

  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 }
  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 2 }
  - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings
  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 }
  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 }
  - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.embeddings
  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.start_offset: 20 }
  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.end_offset: 35 }
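The expected offsets line up with the lengths of the two non-blank inputs: "some test data" is 14 characters (0 to 14) and "now with chunks" is 15 characters (20 to 35). A small sanity check of that arithmetic (the gap before the second chunk comes from how the filter concatenates input values and adjusts offsets, per the comment in the Java change above; the exact separator handling is not reproduced here):

```java
public class ChunkOffsetArithmetic {
    public static void main(String[] args) {
        String first = "some test data";    // expected start_offset 0, end_offset 14
        String second = "now with chunks";  // expected start_offset 20, end_offset 35
        System.out.println(first.length());   // 14 == 14 - 0
        System.out.println(second.length());  // 15 == 35 - 20
    }
}
```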