Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
9d6a32e
Added check for blank string to skip generating embeddings with unit …
Samiul-TheSoccerFan Feb 28, 2025
16f0b5a
Adding yaml tests for skipping embedding generation
Samiul-TheSoccerFan Feb 28, 2025
96605bb
Merge branch 'main' into handle-empty-input-inference
elasticmachine Mar 3, 2025
6403aa0
dynamic update not required if model_settings stays null
Samiul-TheSoccerFan Mar 5, 2025
6e0d484
Updating node feature for handling empty input name and description
Samiul-TheSoccerFan Mar 5, 2025
aeaf117
Update yaml tests with refresh=true
Samiul-TheSoccerFan Mar 5, 2025
bb99b3b
Update unit test to follow more accurate behavior
Samiul-TheSoccerFan Mar 5, 2025
6509870
Added yaml tests for multi chunks
Samiul-TheSoccerFan Mar 5, 2025
3c4c3ed
[CI] Auto commit changes from spotless
Mar 5, 2025
f085df3
Merge branch 'main' into handle-empty-input-inference
elasticmachine Mar 5, 2025
f7d9359
Adding highlighter yaml tests for empty input
Samiul-TheSoccerFan Mar 5, 2025
285226a
Update docs/changelog/123763.yaml
Samiul-TheSoccerFan Mar 5, 2025
43406db
Update changelog and test reason to have more polished documentation
Samiul-TheSoccerFan Mar 6, 2025
78c5e12
adding input value into the response source and fixing unit tests by …
Samiul-TheSoccerFan Mar 6, 2025
33a533a
Adding highlighter test for backward compatibility and refactor existi…
Samiul-TheSoccerFan Mar 6, 2025
cd15c9e
Added bwc tests for empty input and multi chunks
Samiul-TheSoccerFan Mar 7, 2025
2fb0092
Removed reindex for empty input from bwc
Samiul-TheSoccerFan Mar 7, 2025
1a275db
[CI] Auto commit changes from spotless
Mar 7, 2025
7486fe8
Merge branch 'main' into handle-empty-input-inference
elasticmachine Mar 7, 2025
6123d1a
Fixing yaml test
Samiul-TheSoccerFan Mar 7, 2025
d31d281
Update unit tests helper function to support both format
Samiul-TheSoccerFan Mar 7, 2025
78a390c
[CI] Auto commit changes from spotless
Mar 7, 2025
09a298a
Adding cluster features for bwc
Samiul-TheSoccerFan Mar 7, 2025
72886bf
Centralize logic for assertInference helper
Samiul-TheSoccerFan Mar 7, 2025
1179f84
resolve conflicts from main
Samiul-TheSoccerFan Mar 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ public Set<NodeFeature> getTestFeatures() {
SemanticInferenceMetadataFieldsMapper.INFERENCE_METADATA_FIELDS_ENABLED_BY_DEFAULT,
SEMANTIC_TEXT_HIGHLIGHTER_DEFAULT,
SEMANTIC_KNN_FILTER_FIX,
TEST_RERANKING_SERVICE_PARSE_TEXT_AS_SCORE
TEST_RERANKING_SERVICE_PARSE_TEXT_AS_SCORE,
SemanticTextFieldMapper.SEMANTIC_TEXT_SKIPS_GENERATING_EMBEDDINGS_FIX
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,7 @@ private Map<String, List<FieldInferenceRequest>> createFieldInferenceRequests(Bu
}
continue;
}
ensureResponseAccumulatorSlot(itemIndex);
var slot = ensureResponseAccumulatorSlot(itemIndex);
final List<String> values;
try {
values = SemanticTextUtils.nodeStringValues(field, valueObj);
Expand All @@ -580,7 +580,13 @@ private Map<String, List<FieldInferenceRequest>> createFieldInferenceRequests(Bu
List<FieldInferenceRequest> fieldRequests = fieldRequestsMap.computeIfAbsent(inferenceId, k -> new ArrayList<>());
int offsetAdjustment = 0;
for (String v : values) {
fieldRequests.add(new FieldInferenceRequest(itemIndex, field, sourceField, v, order++, offsetAdjustment));
if (v.isBlank()) {
slot.addOrUpdateResponse(
new FieldInferenceResponse(field, sourceField, null, order++, 0, null, EMPTY_CHUNKED_INFERENCE)
);
} else {
fieldRequests.add(new FieldInferenceRequest(itemIndex, field, sourceField, v, order++, offsetAdjustment));
}

// When using the inference metadata fields format, all the input values are concatenated so that the
// chunk text offsets are expressed in the context of a single string. Calculate the offset adjustment
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,9 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
public static final NodeFeature SEMANTIC_TEXT_ALWAYS_EMIT_INFERENCE_ID_FIX = new NodeFeature(
"semantic_text.always_emit_inference_id_fix"
);
/**
 * Node feature registered under the id {@code semantic_text.skips_generating_embeddings_fix}.
 * REST tests gate on this id to check that embeddings are not generated when a {@code semantic_text}
 * field value is an empty or whitespace-only string.
 */
public static final NodeFeature SEMANTIC_TEXT_SKIPS_GENERATING_EMBEDDINGS_FIX = new NodeFeature(
"semantic_text.skips_generating_embeddings_fix"
);
public static final NodeFeature SEMANTIC_TEXT_SKIP_INFERENCE_FIELDS = new NodeFeature("semantic_text.skip_inference_fields");

public static final String CONTENT_TYPE = "semantic_text";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,58 @@ public void testExplicitNull() throws Exception {
awaitLatch(chainExecuted, 10, TimeUnit.SECONDS);
}

/**
 * Sends four bulk items through the filter, each carrying only an empty or whitespace-only string
 * (two index requests followed by two update requests). Inside the downstream chain it checks that every
 * forwarded source still holds the original value and that looking up
 * {@link InferenceMetadataFieldsMapper#NAME} in that source yields {@code null}.
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public void testSkipGeneratingInference() throws Exception {
    StaticModel model = StaticModel.createRandomInstance();
    ShardBulkInferenceActionFilter filter = createFilter(
        threadPool,
        Map.of(model.getInferenceEntityId(), model),
        randomIntBetween(1, 10),
        useLegacyFormat,
        true
    );

    // One entry per bulk item, in item order: where to read the value from and what it must equal.
    //   0: index request,  top-level field, empty string
    //   1: index request,  nested field,    empty string
    //   2: update request, top-level field, whitespace only
    //   3: update request, nested field,    whitespace only
    final String[] valuePaths = { "obj", "obj.field", "obj", "obj.field" };
    final String[] expectedValues = { "", "", " ", " " };

    CountDownLatch chainExecuted = new CountDownLatch(1);
    ActionFilterChain actionFilterChain = (chainTask, chainAction, chainRequest, chainListener) -> {
        try {
            BulkItemRequest[] forwardedItems = ((BulkShardRequest) chainRequest).items();
            for (int i = 0; i < valuePaths.length; i++) {
                // Parse the forwarded source once and run both checks against the same map.
                Map<String, Object> source = getIndexRequestOrNull(forwardedItems[i].request()).sourceAsMap();
                assertThat(XContentMapValues.extractValue(valuePaths[i], source, EXPLICIT_NULL), equalTo(expectedValues[i]));
                assertNull(XContentMapValues.extractValue(InferenceMetadataFieldsMapper.NAME, source, EXPLICIT_NULL));
            }
        } finally {
            // Always release the latch so the test thread is not left waiting when an assertion fails.
            chainExecuted.countDown();
        }
    };
    ActionListener actionListener = mock(ActionListener.class);
    Task task = mock(Task.class);

    BulkItemRequest[] items = {
        new BulkItemRequest(0, new IndexRequest("index").source(Map.of("obj", ""))),
        new BulkItemRequest(1, new IndexRequest("index").source(Map.of("obj", Map.of("field", "")))),
        new BulkItemRequest(2, new UpdateRequest().doc(new IndexRequest("index").source(Map.of("obj", " ")))),
        new BulkItemRequest(3, new UpdateRequest().doc(new IndexRequest("index").source(Map.of("obj", Map.of("field", " ")))))
    };
    BulkShardRequest request = new BulkShardRequest(new ShardId("test", "test", 0), WriteRequest.RefreshPolicy.NONE, items);

    filter.apply(task, TransportShardBulkAction.ACTION_NAME, request, actionListener, actionFilterChain);
    awaitLatch(chainExecuted, 10, TimeUnit.SECONDS);
}

@SuppressWarnings({ "unchecked", "rawtypes" })
public void testManyRandomDocs() throws Exception {
Map<String, StaticModel> inferenceModelMap = new HashMap<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1005,3 +1005,153 @@ setup:
- match: { hits.hits.0._source.dense_field: "another inference test" }
- match: { hits.hits.0._source.non_inference_field: "non inference test" }
- exists: hits.hits.0._source._inference_fields

---
"Empty semantic_text field skips embedding generation":
  - requires:
      cluster_features: "semantic_text.skips_generating_embeddings_fix"
      reason: skips generating embeddings when field value is whitespace and/or empty string

  # Index a document whose semantic_text value is the empty string.
  # refresh: true makes the document visible to the search below; without it
  # the search can run before the document is searchable and hits.hits.0 would be missing.
  - do:
      index:
        index: test-index
        id: doc_1
        body:
          sparse_field: ""
        refresh: true

  - do:
      search:
        index: test-index
        body:
          fields: [ _inference_fields ]
          query:
            match_all: { }

  # The original value is returned in _source and no _inference_fields entry is present.
  - match: { hits.hits.0._source.sparse_field: "" }
  - not_exists: hits.hits.0._source._inference_fields

---
"Whitespace-Only semantic_text field skips embedding generation":
  - requires:
      cluster_features: "semantic_text.skips_generating_embeddings_fix"
      reason: skips generating embeddings when field value is whitespace and/or empty string

  # Index a document whose semantic_text value is a single space.
  # refresh: true makes the document visible to the search below; without it
  # the search can run before the document is searchable and hits.hits.0 would be missing.
  - do:
      index:
        index: test-index
        id: doc_1
        body:
          sparse_field: " "
        refresh: true

  - do:
      search:
        index: test-index
        body:
          fields: [ _inference_fields ]
          query:
            match_all: { }

  # The whitespace value is returned unchanged in _source and no _inference_fields entry is present.
  - match: { hits.hits.0._source.sparse_field: " " }
  - not_exists: hits.hits.0._source._inference_fields

---
"Reindexing with empty or whitespace semantic_text skips embedding generation":
  - requires:
      cluster_features: "semantic_text.skips_generating_embeddings_fix"
      reason: skips generating embeddings when field value is whitespace and/or empty string

  # Source document: semantic_text value is a single space.
  - do:
      index:
        index: test-index
        id: doc_1
        body:
          sparse_field: " "
        refresh: true

  # Destination index with the same semantic_text field, using the non-legacy format.
  - do:
      indices.create:
        index: destination-index
        body:
          settings:
            index:
              mapping:
                semantic_text:
                  use_legacy_format: false
          mappings:
            properties:
              sparse_field:
                type: semantic_text
                inference_id: sparse-inference-id

  # Copy the document across and refresh so it is searchable afterwards.
  - do:
      reindex:
        wait_for_completion: true
        body:
          source:
            index: test-index
          dest:
            index: destination-index
        refresh: true

  # The reindexed document keeps the whitespace value.
  - do:
      get:
        index: destination-index
        id: doc_1

  - match:
      _source.sparse_field: " "

  # No _inference_fields entry is present on the reindexed document.
  - do:
      search:
        index: destination-index
        body:
          fields:
            - _inference_fields
          query:
            match_all: {}

  - not_exists: hits.hits.0._source._inference_fields

---
"Empty Multi-Field skips embedding generation":
  - requires:
      cluster_features: "semantic_text.skips_generating_embeddings_fix"
      reason: skips generating embeddings when field value is whitespace and/or empty string

  # semantic_text field with a semantic_text sub-field, using the non-legacy format.
  - do:
      indices.create:
        index: test-multi-index
        body:
          settings:
            index:
              mapping:
                semantic_text:
                  use_legacy_format: false
          mappings:
            properties:
              field:
                type: semantic_text
                inference_id: sparse-inference-id
                fields:
                  sparse:
                    type: semantic_text
                    inference_id: sparse-inference-id

  # Document 1 carries two non-blank values; document 2 carries only an empty and a whitespace-only value.
  - do:
      bulk:
        index: test-multi-index
        refresh: true
        body: |
          {"index":{"_id": "1"}}
          {"field": ["you know, for testing", "now with chunks"]}
          {"index":{"_id": "2"}}
          {"field": ["", " "]}

  - do:
      search:
        index: test-multi-index
        body:
          fields:
            - _inference_fields
          query:
            match_all: {}

  # NOTE(review): these assertions assume hits come back in insertion order (doc 1, then doc 2);
  # the search has no explicit sort — confirm that ordering is stable for this index.
  - exists: hits.hits.0._source._inference_fields
  - not_exists: hits.hits.1._source._inference_fields