Skip to content

Commit cabc063

Browse files
kderusso authored and smalyshev committed
Support returning default index_options for semantic_text fields when include_defaults is true (elastic#129967)
1 parent 606cf66 commit cabc063

File tree

6 files changed

+394
-39
lines changed

6 files changed

+394
-39
lines changed

docs/changelog/129967.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 129967
2+
summary: Support returning default `index_options` for `semantic_text` fields when
3+
`include_defaults` is true
4+
area: Search
5+
type: bug
6+
issues: []

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper.SEMANTIC_TEXT_EXCLUDE_SUB_FIELDS_FROM_FIELD_CAPS;
1919
import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper.SEMANTIC_TEXT_INDEX_OPTIONS;
20+
import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper.SEMANTIC_TEXT_INDEX_OPTIONS_WITH_DEFAULTS;
2021
import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper.SEMANTIC_TEXT_SUPPORT_CHUNKING_CONFIG;
2122
import static org.elasticsearch.xpack.inference.queries.SemanticKnnVectorQueryRewriteInterceptor.SEMANTIC_KNN_FILTER_FIX;
2223
import static org.elasticsearch.xpack.inference.queries.SemanticKnnVectorQueryRewriteInterceptor.SEMANTIC_KNN_VECTOR_QUERY_REWRITE_INTERCEPTION_SUPPORTED;
@@ -66,7 +67,8 @@ public Set<NodeFeature> getTestFeatures() {
6667
SEMANTIC_TEXT_MATCH_ALL_HIGHLIGHTER,
6768
SEMANTIC_TEXT_EXCLUDE_SUB_FIELDS_FROM_FIELD_CAPS,
6869
SEMANTIC_TEXT_INDEX_OPTIONS,
69-
COHERE_V2_API
70+
COHERE_V2_API,
71+
SEMANTIC_TEXT_INDEX_OPTIONS_WITH_DEFAULTS
7072
);
7173
}
7274
}

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java

Lines changed: 78 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969
import org.elasticsearch.inference.InferenceResults;
7070
import org.elasticsearch.inference.MinimalServiceSettings;
7171
import org.elasticsearch.inference.SimilarityMeasure;
72+
import org.elasticsearch.inference.TaskType;
7273
import org.elasticsearch.search.fetch.StoredFieldsSpec;
7374
import org.elasticsearch.search.lookup.Source;
7475
import org.elasticsearch.search.vectors.KnnVectorQueryBuilder;
@@ -139,6 +140,9 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
139140
"semantic_text.exclude_sub_fields_from_field_caps"
140141
);
141142
public static final NodeFeature SEMANTIC_TEXT_INDEX_OPTIONS = new NodeFeature("semantic_text.index_options");
143+
public static final NodeFeature SEMANTIC_TEXT_INDEX_OPTIONS_WITH_DEFAULTS = new NodeFeature(
144+
"semantic_text.index_options_with_defaults"
145+
);
142146

143147
public static final String CONTENT_TYPE = "semantic_text";
144148
public static final String DEFAULT_ELSER_2_INFERENCE_ID = DEFAULT_ELSER_ID;
@@ -166,19 +170,9 @@ public static BiConsumer<String, MappingParserContext> validateParserContext(Str
166170
public static class Builder extends FieldMapper.Builder {
167171
private final ModelRegistry modelRegistry;
168172
private final boolean useLegacyFormat;
173+
private final IndexVersion indexVersionCreated;
169174

170-
private final Parameter<String> inferenceId = Parameter.stringParam(
171-
INFERENCE_ID_FIELD,
172-
false,
173-
mapper -> ((SemanticTextFieldType) mapper.fieldType()).inferenceId,
174-
DEFAULT_ELSER_2_INFERENCE_ID
175-
).addValidator(v -> {
176-
if (Strings.isEmpty(v)) {
177-
throw new IllegalArgumentException(
178-
"[" + INFERENCE_ID_FIELD + "] on mapper [" + leafName() + "] of type [" + CONTENT_TYPE + "] must not be empty"
179-
);
180-
}
181-
}).alwaysSerialize();
175+
private final Parameter<String> inferenceId;
182176

183177
private final Parameter<String> searchInferenceId = Parameter.stringParam(
184178
SEARCH_INFERENCE_ID_FIELD,
@@ -193,25 +187,9 @@ public static class Builder extends FieldMapper.Builder {
193187
}
194188
});
195189

196-
private final Parameter<MinimalServiceSettings> modelSettings = new Parameter<>(
197-
MODEL_SETTINGS_FIELD,
198-
true,
199-
() -> null,
200-
(n, c, o) -> SemanticTextField.parseModelSettingsFromMap(o),
201-
mapper -> ((SemanticTextFieldType) mapper.fieldType()).modelSettings,
202-
XContentBuilder::field,
203-
Objects::toString
204-
).acceptsNull().setMergeValidator(SemanticTextFieldMapper::canMergeModelSettings);
190+
private final Parameter<MinimalServiceSettings> modelSettings;
205191

206-
private final Parameter<SemanticTextIndexOptions> indexOptions = new Parameter<>(
207-
INDEX_OPTIONS_FIELD,
208-
true,
209-
() -> null,
210-
(n, c, o) -> parseIndexOptionsFromMap(n, o, c.indexVersionCreated()),
211-
mapper -> ((SemanticTextFieldType) mapper.fieldType()).indexOptions,
212-
XContentBuilder::field,
213-
Objects::toString
214-
).acceptsNull();
192+
private final Parameter<SemanticTextIndexOptions> indexOptions;
215193

216194
@SuppressWarnings("unchecked")
217195
private final Parameter<ChunkingSettings> chunkingSettings = new Parameter<>(
@@ -248,6 +226,50 @@ public Builder(
248226
super(name);
249227
this.modelRegistry = modelRegistry;
250228
this.useLegacyFormat = InferenceMetadataFieldsMapper.isEnabled(indexSettings.getSettings()) == false;
229+
this.indexVersionCreated = indexSettings.getIndexVersionCreated();
230+
231+
this.inferenceId = Parameter.stringParam(
232+
INFERENCE_ID_FIELD,
233+
false,
234+
mapper -> ((SemanticTextFieldType) mapper.fieldType()).inferenceId,
235+
DEFAULT_ELSER_2_INFERENCE_ID
236+
).addValidator(v -> {
237+
if (Strings.isEmpty(v)) {
238+
throw new IllegalArgumentException(
239+
"[" + INFERENCE_ID_FIELD + "] on mapper [" + leafName() + "] of type [" + CONTENT_TYPE + "] must not be empty"
240+
);
241+
}
242+
}).alwaysSerialize();
243+
244+
this.modelSettings = new Parameter<>(
245+
MODEL_SETTINGS_FIELD,
246+
true,
247+
() -> null,
248+
(n, c, o) -> SemanticTextField.parseModelSettingsFromMap(o),
249+
mapper -> ((SemanticTextFieldType) mapper.fieldType()).modelSettings,
250+
XContentBuilder::field,
251+
Objects::toString
252+
).acceptsNull().setMergeValidator(SemanticTextFieldMapper::canMergeModelSettings);
253+
254+
this.indexOptions = new Parameter<>(
255+
INDEX_OPTIONS_FIELD,
256+
true,
257+
() -> null,
258+
(n, c, o) -> parseIndexOptionsFromMap(n, o, c.indexVersionCreated()),
259+
mapper -> ((SemanticTextFieldType) mapper.fieldType()).indexOptions,
260+
(b, n, v) -> {
261+
if (v == null) {
262+
MinimalServiceSettings resolvedModelSettings = modelSettings.get() != null
263+
? modelSettings.get()
264+
: modelRegistry.getMinimalServiceSettings(inferenceId.get());
265+
b.field(INDEX_OPTIONS_FIELD, defaultIndexOptions(indexVersionCreated, resolvedModelSettings));
266+
} else {
267+
b.field(INDEX_OPTIONS_FIELD, v);
268+
}
269+
},
270+
Objects::toString
271+
).acceptsNull();
272+
251273
this.inferenceFieldBuilder = c -> {
252274
// Resolve the model setting from the registry if it has not been set yet.
253275
var resolvedModelSettings = modelSettings.get() != null ? modelSettings.get() : getResolvedModelSettings(c, false);
@@ -365,8 +387,11 @@ public SemanticTextFieldMapper build(MapperBuilderContext context) {
365387
validateServiceSettings(modelSettings.get(), resolvedModelSettings);
366388
}
367389

368-
if (context.getMergeReason() != MapperService.MergeReason.MAPPING_RECOVERY && indexOptions.get() != null) {
369-
validateIndexOptions(indexOptions.get(), inferenceId.getValue(), resolvedModelSettings);
390+
// If index_options are specified by the user, we will validate them against the model settings to ensure compatibility.
391+
// We do not serialize or otherwise store model settings at this time; this happens when the underlying vector field is created.
392+
SemanticTextIndexOptions builderIndexOptions = indexOptions.get();
393+
if (context.getMergeReason() != MapperService.MergeReason.MAPPING_RECOVERY && builderIndexOptions != null) {
394+
validateIndexOptions(builderIndexOptions, inferenceId.getValue(), resolvedModelSettings);
370395
}
371396

372397
final String fullName = context.buildFullName(leafName());
@@ -1166,6 +1191,9 @@ private static Mapper.Builder createEmbeddingsField(
11661191
}
11671192
denseVectorMapperBuilder.dimensions(modelSettings.dimensions());
11681193
denseVectorMapperBuilder.elementType(modelSettings.elementType());
1194+
// Here is where we persist index_options. If they are specified by the user, we will use those index_options,
1195+
// otherwise we will determine if we can set default index options. If we can't, we won't persist any index_options
1196+
// and the field will use the defaults for the dense_vector field.
11691197
if (indexOptions != null) {
11701198
DenseVectorFieldMapper.DenseVectorIndexOptions denseVectorIndexOptions =
11711199
(DenseVectorFieldMapper.DenseVectorIndexOptions) indexOptions.indexOptions();
@@ -1208,7 +1236,6 @@ static DenseVectorFieldMapper.DenseVectorIndexOptions defaultDenseVectorIndexOpt
12081236
// As embedding models for text perform better with BBQ, we aggressively default semantic_text fields to use optimized index
12091237
// options
12101238
if (indexVersionDefaultsToBbqHnsw(indexVersionCreated)) {
1211-
12121239
DenseVectorFieldMapper.DenseVectorIndexOptions defaultBbqHnswIndexOptions = defaultBbqHnswDenseVectorIndexOptions();
12131240
return defaultBbqHnswIndexOptions.validate(modelSettings.elementType(), modelSettings.dimensions(), false)
12141241
? defaultBbqHnswIndexOptions
@@ -1230,11 +1257,24 @@ static DenseVectorFieldMapper.DenseVectorIndexOptions defaultBbqHnswDenseVectorI
12301257
return new DenseVectorFieldMapper.BBQHnswIndexOptions(m, efConstruction, rescoreVector);
12311258
}
12321259

1233-
static SemanticTextIndexOptions defaultBbqHnswSemanticTextIndexOptions() {
1234-
return new SemanticTextIndexOptions(
1235-
SemanticTextIndexOptions.SupportedIndexOptions.DENSE_VECTOR,
1236-
defaultBbqHnswDenseVectorIndexOptions()
1237-
);
1260+
static SemanticTextIndexOptions defaultIndexOptions(IndexVersion indexVersionCreated, MinimalServiceSettings modelSettings) {
1261+
1262+
if (modelSettings == null) {
1263+
return null;
1264+
}
1265+
1266+
SemanticTextIndexOptions defaultIndexOptions = null;
1267+
if (modelSettings.taskType() == TaskType.TEXT_EMBEDDING) {
1268+
DenseVectorFieldMapper.DenseVectorIndexOptions denseVectorIndexOptions = defaultDenseVectorIndexOptions(
1269+
indexVersionCreated,
1270+
modelSettings
1271+
);
1272+
defaultIndexOptions = denseVectorIndexOptions == null
1273+
? null
1274+
: new SemanticTextIndexOptions(SemanticTextIndexOptions.SupportedIndexOptions.DENSE_VECTOR, denseVectorIndexOptions);
1275+
}
1276+
1277+
return defaultIndexOptions;
12381278
}
12391279

12401280
private static boolean canMergeModelSettings(MinimalServiceSettings previous, MinimalServiceSettings current, Conflicts conflicts) {

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextIndexOptions.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import java.util.Arrays;
2121
import java.util.Locale;
2222
import java.util.Map;
23+
import java.util.Objects;
2324

2425
/**
2526
* Represents index options for a semantic_text field.
@@ -50,6 +51,25 @@ public IndexOptions indexOptions() {
5051
return indexOptions;
5152
}
5253

54+
@Override
55+
public boolean equals(Object other) {
56+
if (other == this) {
57+
return true;
58+
}
59+
60+
if (other == null || getClass() != other.getClass()) {
61+
return false;
62+
}
63+
64+
SemanticTextIndexOptions otherSemanticTextIndexOptions = (SemanticTextIndexOptions) other;
65+
return type == otherSemanticTextIndexOptions.type && Objects.equals(indexOptions, otherSemanticTextIndexOptions.indexOptions);
66+
}
67+
68+
@Override
69+
public int hashCode() {
70+
return Objects.hash(type, indexOptions);
71+
}
72+
5373
public enum SupportedIndexOptions {
5474
DENSE_VECTOR("dense_vector") {
5575
@Override

x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/10_semantic_text_field_mapping.yml

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -833,3 +833,147 @@ setup:
833833
type: int8_flat
834834

835835
- match: { status: 400 }
836+
837+
838+
---
839+
"Displaying default index_options with and without include_defaults":
840+
- requires:
841+
cluster_features: "semantic_text.index_options_with_defaults"
842+
reason: Index options defaults support introduced in 9.2.0
843+
844+
# Semantic text defaults to BBQ HNSW starting in 8.19.0/9.1.0
845+
- do:
846+
indices.create:
847+
index: test-index-options-dense
848+
body:
849+
settings:
850+
index:
851+
mapping:
852+
semantic_text:
853+
use_legacy_format: false
854+
mappings:
855+
properties:
856+
semantic_field:
857+
type: semantic_text
858+
inference_id: dense-inference-id-compatible-with-bbq
859+
860+
- do:
861+
indices.get_mapping:
862+
index: test-index-options-dense
863+
864+
- not_exists: test-index-options-dense.mappings.properties.semantic_field.index_options
865+
866+
- do:
867+
indices.get_field_mapping:
868+
index: test-index-options-dense
869+
fields: semantic_field
870+
include_defaults: true
871+
872+
- match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.type": "bbq_hnsw" }
873+
- match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.m": 16 }
874+
- match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.ef_construction": 100 }
875+
- match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.rescore_vector.oversample": 3 }
876+
877+
# Validate that actually specifying the same values as our defaults will still serialize the user provided index_options
878+
- do:
879+
indices.create:
880+
index: test-index-options-dense2
881+
body:
882+
settings:
883+
index:
884+
mapping:
885+
semantic_text:
886+
use_legacy_format: false
887+
mappings:
888+
properties:
889+
semantic_field:
890+
type: semantic_text
891+
inference_id: dense-inference-id-compatible-with-bbq
892+
index_options:
893+
dense_vector:
894+
type: bbq_hnsw
895+
m: 16
896+
ef_construction: 100
897+
rescore_vector:
898+
oversample: 3
899+
900+
- do:
901+
indices.get_mapping:
902+
index: test-index-options-dense2
903+
904+
- match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.type": "bbq_hnsw" }
905+
- match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.m": 16 }
906+
- match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.ef_construction": 100 }
907+
- match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.rescore_vector.oversample": 3 }
908+
909+
- do:
910+
indices.get_field_mapping:
911+
index: test-index-options-dense2
912+
fields: semantic_field
913+
include_defaults: true
914+
915+
- match: { "test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.type": "bbq_hnsw" }
916+
- match: { "test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.m": 16 }
917+
- match: { "test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.ef_construction": 100 }
918+
- match: { "test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.rescore_vector.oversample": 3 }
919+
920+
# Indices not compatible with BBQ for whatever reason will fall back to whatever `dense_vector` defaults are.
921+
- do:
922+
indices.create:
923+
index: test-index-options-dense-no-bbq
924+
body:
925+
settings:
926+
index:
927+
mapping:
928+
semantic_text:
929+
use_legacy_format: false
930+
mappings:
931+
properties:
932+
semantic_field:
933+
type: semantic_text
934+
inference_id: dense-inference-id
935+
936+
- do:
937+
indices.get_mapping:
938+
index: test-index-options-dense-no-bbq
939+
940+
- not_exists: test-index-options-dense-no-bbq.mappings.properties.semantic_field.index_options
941+
942+
- do:
943+
indices.get_field_mapping:
944+
index: test-index-options-dense-no-bbq
945+
fields: semantic_field
946+
include_defaults: true
947+
948+
- not_exists: test-index-options-dense-no-bbq.mappings.semantic_field.mapping.semantic_field.index_options
949+
950+
# Sparse embeddings models do not have index options for semantic_text in 8.19/9.1.
951+
- do:
952+
indices.create:
953+
index: test-index-options-sparse
954+
body:
955+
settings:
956+
index:
957+
mapping:
958+
semantic_text:
959+
use_legacy_format: false
960+
mappings:
961+
properties:
962+
semantic_field:
963+
type: semantic_text
964+
inference_id: sparse-inference-id
965+
966+
- do:
967+
indices.get_mapping:
968+
index: test-index-options-sparse
969+
970+
- not_exists: test-index-options-sparse.mappings.properties.semantic_field.index_options
971+
972+
- do:
973+
indices.get_field_mapping:
974+
index: test-index-options-sparse
975+
fields: semantic_field
976+
include_defaults: true
977+
978+
- not_exists: test-index-options-sparse.mappings.semantic_field.mapping.semantic_field.index_options
979+

0 commit comments

Comments
 (0)