diff --git a/docs/changelog/129967.yaml b/docs/changelog/129967.yaml new file mode 100644 index 0000000000000..8a7ea868aaebb --- /dev/null +++ b/docs/changelog/129967.yaml @@ -0,0 +1,6 @@ +pr: 129967 +summary: Support returning default `index_options` for `semantic_text` fields when + `include_defaults` is true +area: Search +type: bug +issues: [] diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java index 2bc481cc484d6..3d05600709b23 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java @@ -17,6 +17,7 @@ import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper.SEMANTIC_TEXT_EXCLUDE_SUB_FIELDS_FROM_FIELD_CAPS; import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper.SEMANTIC_TEXT_INDEX_OPTIONS; +import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper.SEMANTIC_TEXT_INDEX_OPTIONS_WITH_DEFAULTS; import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper.SEMANTIC_TEXT_SUPPORT_CHUNKING_CONFIG; import static org.elasticsearch.xpack.inference.queries.SemanticKnnVectorQueryRewriteInterceptor.SEMANTIC_KNN_FILTER_FIX; import static org.elasticsearch.xpack.inference.queries.SemanticKnnVectorQueryRewriteInterceptor.SEMANTIC_KNN_VECTOR_QUERY_REWRITE_INTERCEPTION_SUPPORTED; @@ -66,7 +67,8 @@ public Set getTestFeatures() { SEMANTIC_TEXT_MATCH_ALL_HIGHLIGHTER, SEMANTIC_TEXT_EXCLUDE_SUB_FIELDS_FROM_FIELD_CAPS, SEMANTIC_TEXT_INDEX_OPTIONS, - COHERE_V2_API + COHERE_V2_API, + SEMANTIC_TEXT_INDEX_OPTIONS_WITH_DEFAULTS ); } } diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java 
b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java index 5400bf6acc673..fd5f1ce2735a9 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java @@ -69,6 +69,7 @@ import org.elasticsearch.inference.InferenceResults; import org.elasticsearch.inference.MinimalServiceSettings; import org.elasticsearch.inference.SimilarityMeasure; +import org.elasticsearch.inference.TaskType; import org.elasticsearch.search.fetch.StoredFieldsSpec; import org.elasticsearch.search.lookup.Source; import org.elasticsearch.search.vectors.KnnVectorQueryBuilder; @@ -139,6 +140,9 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie "semantic_text.exclude_sub_fields_from_field_caps" ); public static final NodeFeature SEMANTIC_TEXT_INDEX_OPTIONS = new NodeFeature("semantic_text.index_options"); + public static final NodeFeature SEMANTIC_TEXT_INDEX_OPTIONS_WITH_DEFAULTS = new NodeFeature( + "semantic_text.index_options_with_defaults" + ); public static final String CONTENT_TYPE = "semantic_text"; public static final String DEFAULT_ELSER_2_INFERENCE_ID = DEFAULT_ELSER_ID; @@ -166,19 +170,9 @@ public static BiConsumer validateParserContext(Str public static class Builder extends FieldMapper.Builder { private final ModelRegistry modelRegistry; private final boolean useLegacyFormat; + private final IndexVersion indexVersionCreated; - private final Parameter inferenceId = Parameter.stringParam( - INFERENCE_ID_FIELD, - false, - mapper -> ((SemanticTextFieldType) mapper.fieldType()).inferenceId, - DEFAULT_ELSER_2_INFERENCE_ID - ).addValidator(v -> { - if (Strings.isEmpty(v)) { - throw new IllegalArgumentException( - "[" + INFERENCE_ID_FIELD + "] on mapper [" + leafName() + "] of type [" + CONTENT_TYPE + "] must not be empty" - ); - } - 
}).alwaysSerialize(); + private final Parameter inferenceId; private final Parameter searchInferenceId = Parameter.stringParam( SEARCH_INFERENCE_ID_FIELD, @@ -193,25 +187,9 @@ public static class Builder extends FieldMapper.Builder { } }); - private final Parameter modelSettings = new Parameter<>( - MODEL_SETTINGS_FIELD, - true, - () -> null, - (n, c, o) -> SemanticTextField.parseModelSettingsFromMap(o), - mapper -> ((SemanticTextFieldType) mapper.fieldType()).modelSettings, - XContentBuilder::field, - Objects::toString - ).acceptsNull().setMergeValidator(SemanticTextFieldMapper::canMergeModelSettings); + private final Parameter modelSettings; - private final Parameter indexOptions = new Parameter<>( - INDEX_OPTIONS_FIELD, - true, - () -> null, - (n, c, o) -> parseIndexOptionsFromMap(n, o, c.indexVersionCreated()), - mapper -> ((SemanticTextFieldType) mapper.fieldType()).indexOptions, - XContentBuilder::field, - Objects::toString - ).acceptsNull(); + private final Parameter indexOptions; @SuppressWarnings("unchecked") private final Parameter chunkingSettings = new Parameter<>( @@ -248,6 +226,50 @@ public Builder( super(name); this.modelRegistry = modelRegistry; this.useLegacyFormat = InferenceMetadataFieldsMapper.isEnabled(indexSettings.getSettings()) == false; + this.indexVersionCreated = indexSettings.getIndexVersionCreated(); + + this.inferenceId = Parameter.stringParam( + INFERENCE_ID_FIELD, + false, + mapper -> ((SemanticTextFieldType) mapper.fieldType()).inferenceId, + DEFAULT_ELSER_2_INFERENCE_ID + ).addValidator(v -> { + if (Strings.isEmpty(v)) { + throw new IllegalArgumentException( + "[" + INFERENCE_ID_FIELD + "] on mapper [" + leafName() + "] of type [" + CONTENT_TYPE + "] must not be empty" + ); + } + }).alwaysSerialize(); + + this.modelSettings = new Parameter<>( + MODEL_SETTINGS_FIELD, + true, + () -> null, + (n, c, o) -> SemanticTextField.parseModelSettingsFromMap(o), + mapper -> ((SemanticTextFieldType) mapper.fieldType()).modelSettings, + 
XContentBuilder::field, + Objects::toString + ).acceptsNull().setMergeValidator(SemanticTextFieldMapper::canMergeModelSettings); + + this.indexOptions = new Parameter<>( + INDEX_OPTIONS_FIELD, + true, + () -> null, + (n, c, o) -> parseIndexOptionsFromMap(n, o, c.indexVersionCreated()), + mapper -> ((SemanticTextFieldType) mapper.fieldType()).indexOptions, + (b, n, v) -> { + if (v == null) { + MinimalServiceSettings resolvedModelSettings = modelSettings.get() != null + ? modelSettings.get() + : modelRegistry.getMinimalServiceSettings(inferenceId.get()); + b.field(INDEX_OPTIONS_FIELD, defaultIndexOptions(indexVersionCreated, resolvedModelSettings)); + } else { + b.field(INDEX_OPTIONS_FIELD, v); + } + }, + Objects::toString + ).acceptsNull(); + this.inferenceFieldBuilder = c -> { // Resolve the model setting from the registry if it has not been set yet. var resolvedModelSettings = modelSettings.get() != null ? modelSettings.get() : getResolvedModelSettings(c, false); @@ -365,8 +387,11 @@ public SemanticTextFieldMapper build(MapperBuilderContext context) { validateServiceSettings(modelSettings.get(), resolvedModelSettings); } - if (context.getMergeReason() != MapperService.MergeReason.MAPPING_RECOVERY && indexOptions.get() != null) { - validateIndexOptions(indexOptions.get(), inferenceId.getValue(), resolvedModelSettings); + // If index_options are specified by the user, we will validate them against the model settings to ensure compatibility. + // We do not serialize or otherwise store model settings at this time, this happens when the underlying vector field is created. 
+ SemanticTextIndexOptions builderIndexOptions = indexOptions.get(); + if (context.getMergeReason() != MapperService.MergeReason.MAPPING_RECOVERY && builderIndexOptions != null) { + validateIndexOptions(builderIndexOptions, inferenceId.getValue(), resolvedModelSettings); } final String fullName = context.buildFullName(leafName()); @@ -1166,6 +1191,9 @@ private static Mapper.Builder createEmbeddingsField( } denseVectorMapperBuilder.dimensions(modelSettings.dimensions()); denseVectorMapperBuilder.elementType(modelSettings.elementType()); + // Here is where we persist index_options. If they are specified by the user, we will use those index_options, + // otherwise we will determine if we can set default index options. If we can't, we won't persist any index_options + // and the field will use the defaults for the dense_vector field. if (indexOptions != null) { DenseVectorFieldMapper.DenseVectorIndexOptions denseVectorIndexOptions = (DenseVectorFieldMapper.DenseVectorIndexOptions) indexOptions.indexOptions(); @@ -1208,7 +1236,6 @@ static DenseVectorFieldMapper.DenseVectorIndexOptions defaultDenseVectorIndexOpt // As embedding models for text perform better with BBQ, we aggressively default semantic_text fields to use optimized index // options if (indexVersionDefaultsToBbqHnsw(indexVersionCreated)) { - DenseVectorFieldMapper.DenseVectorIndexOptions defaultBbqHnswIndexOptions = defaultBbqHnswDenseVectorIndexOptions(); return defaultBbqHnswIndexOptions.validate(modelSettings.elementType(), modelSettings.dimensions(), false) ? 
defaultBbqHnswIndexOptions @@ -1230,11 +1257,24 @@ static DenseVectorFieldMapper.DenseVectorIndexOptions defaultBbqHnswDenseVectorI return new DenseVectorFieldMapper.BBQHnswIndexOptions(m, efConstruction, rescoreVector); } - static SemanticTextIndexOptions defaultBbqHnswSemanticTextIndexOptions() { - return new SemanticTextIndexOptions( - SemanticTextIndexOptions.SupportedIndexOptions.DENSE_VECTOR, - defaultBbqHnswDenseVectorIndexOptions() - ); + static SemanticTextIndexOptions defaultIndexOptions(IndexVersion indexVersionCreated, MinimalServiceSettings modelSettings) { + + if (modelSettings == null) { + return null; + } + + SemanticTextIndexOptions defaultIndexOptions = null; + if (modelSettings.taskType() == TaskType.TEXT_EMBEDDING) { + DenseVectorFieldMapper.DenseVectorIndexOptions denseVectorIndexOptions = defaultDenseVectorIndexOptions( + indexVersionCreated, + modelSettings + ); + defaultIndexOptions = denseVectorIndexOptions == null + ? null + : new SemanticTextIndexOptions(SemanticTextIndexOptions.SupportedIndexOptions.DENSE_VECTOR, denseVectorIndexOptions); + } + + return defaultIndexOptions; } private static boolean canMergeModelSettings(MinimalServiceSettings previous, MinimalServiceSettings current, Conflicts conflicts) { diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextIndexOptions.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextIndexOptions.java index c062adad2f551..db647499f446f 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextIndexOptions.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextIndexOptions.java @@ -20,6 +20,7 @@ import java.util.Arrays; import java.util.Locale; import java.util.Map; +import java.util.Objects; /** * Represents index options for a semantic_text field. 
@@ -50,6 +51,25 @@ public IndexOptions indexOptions() { return indexOptions; } + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other == null || getClass() != other.getClass()) { + return false; + } + + SemanticTextIndexOptions otherSemanticTextIndexOptions = (SemanticTextIndexOptions) other; + return type == otherSemanticTextIndexOptions.type && Objects.equals(indexOptions, otherSemanticTextIndexOptions.indexOptions); + } + + @Override + public int hashCode() { + return Objects.hash(type, indexOptions); + } + public enum SupportedIndexOptions { DENSE_VECTOR("dense_vector") { @Override diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/10_semantic_text_field_mapping.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/10_semantic_text_field_mapping.yml index 5cc0d83685169..637087071b8c1 100644 --- a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/10_semantic_text_field_mapping.yml +++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/10_semantic_text_field_mapping.yml @@ -833,3 +833,147 @@ setup: type: int8_flat - match: { status: 400 } + + +--- +"Displaying default index_options with and without include_defaults": + - requires: + cluster_features: "semantic_text.index_options_with_defaults" + reason: Index options defaults support introduced in 9.2.0 + + # Semantic text defaults to BBQ HNSW starting in 8.19.0/9.1.0 + - do: + indices.create: + index: test-index-options-dense + body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: false + mappings: + properties: + semantic_field: + type: semantic_text + inference_id: dense-inference-id-compatible-with-bbq + + - do: + indices.get_mapping: + index: test-index-options-dense + + - not_exists: test-index-options-dense.mappings.properties.semantic_field.index_options + + - do: + indices.get_field_mapping: + 
index: test-index-options-dense + fields: semantic_field + include_defaults: true + + - match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.type": "bbq_hnsw" } + - match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.m": 16 } + - match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.ef_construction": 100 } + - match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.rescore_vector.oversample": 3 } + + # Validate that actually specifying the same values as our defaults will still serialize the user provided index_options + - do: + indices.create: + index: test-index-options-dense2 + body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: false + mappings: + properties: + semantic_field: + type: semantic_text + inference_id: dense-inference-id-compatible-with-bbq + index_options: + dense_vector: + type: bbq_hnsw + m: 16 + ef_construction: 100 + rescore_vector: + oversample: 3 + + - do: + indices.get_mapping: + index: test-index-options-dense2 + + - match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.type": "bbq_hnsw" } + - match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.m": 16 } + - match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.ef_construction": 100 } + - match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.rescore_vector.oversample": 3 } + + - do: + indices.get_field_mapping: + index: test-index-options-dense2 + fields: semantic_field + include_defaults: true + + - match: { "test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.type": "bbq_hnsw" } + - match: { 
"test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.m": 16 } + - match: { "test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.ef_construction": 100 } + - match: { "test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.rescore_vector.oversample": 3 } + + # Indices not compatible with BBQ for whatever reason will fall back to whatever `dense_vector` defaults are. + - do: + indices.create: + index: test-index-options-dense-no-bbq + body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: false + mappings: + properties: + semantic_field: + type: semantic_text + inference_id: dense-inference-id + + - do: + indices.get_mapping: + index: test-index-options-dense-no-bbq + + - not_exists: test-index-options-dense-no-bbq.mappings.properties.semantic_field.index_options + + - do: + indices.get_field_mapping: + index: test-index-options-dense-no-bbq + fields: semantic_field + include_defaults: true + + - not_exists: test-index-options-dense-no-bbq.mappings.semantic_field.mapping.semantic_field.index_options + + # Sparse embeddings models do not have index options for semantic_text in 8.19/9.1. 
+ - do: + indices.create: + index: test-index-options-sparse + body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: false + mappings: + properties: + semantic_field: + type: semantic_text + inference_id: sparse-inference-id + + - do: + indices.get_mapping: + index: test-index-options-sparse + + - not_exists: test-index-options-sparse.mappings.properties.semantic_field.index_options + + - do: + indices.get_field_mapping: + index: test-index-options-sparse + fields: semantic_field + include_defaults: true + + - not_exists: test-index-options-sparse.mappings.semantic_field.mapping.semantic_field.index_options + diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/10_semantic_text_field_mapping_bwc.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/10_semantic_text_field_mapping_bwc.yml index b089d8c439330..1121958b39ed5 100644 --- a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/10_semantic_text_field_mapping_bwc.yml +++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/10_semantic_text_field_mapping_bwc.yml @@ -736,3 +736,146 @@ setup: type: int8_flat - match: { status: 400 } + +--- +"Displaying default index_options with and without include_defaults": + - requires: + cluster_features: "semantic_text.index_options_with_defaults" + reason: Index options defaults support introduced in 9.2.0 + + # Semantic text defaults to BBQ HNSW starting in 8.19.0/9.1.0 + - do: + indices.create: + index: test-index-options-dense + body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: true + mappings: + properties: + semantic_field: + type: semantic_text + inference_id: dense-inference-id-compatible-with-bbq + + - do: + indices.get_mapping: + index: test-index-options-dense + + - not_exists: test-index-options-dense.mappings.properties.semantic_field.index_options + + - do: + indices.get_field_mapping: + index: 
test-index-options-dense + fields: semantic_field + include_defaults: true + + - match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.type": "bbq_hnsw" } + - match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.m": 16 } + - match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.ef_construction": 100 } + - match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.rescore_vector.oversample": 3 } + + # Validate that actually specifying the same values as our defaults will still serialize the user provided index_options + - do: + indices.create: + index: test-index-options-dense2 + body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: true + mappings: + properties: + semantic_field: + type: semantic_text + inference_id: dense-inference-id-compatible-with-bbq + index_options: + dense_vector: + type: bbq_hnsw + m: 16 + ef_construction: 100 + rescore_vector: + oversample: 3 + + - do: + indices.get_mapping: + index: test-index-options-dense2 + + - match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.type": "bbq_hnsw" } + - match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.m": 16 } + - match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.ef_construction": 100 } + - match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.rescore_vector.oversample": 3 } + + - do: + indices.get_field_mapping: + index: test-index-options-dense2 + fields: semantic_field + include_defaults: true + + - match: { "test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.type": "bbq_hnsw" } + - match: { 
"test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.m": 16 } + - match: { "test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.ef_construction": 100 } + - match: { "test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.rescore_vector.oversample": 3 } + + # Indices not compatible with BBQ for whatever reason will fall back to whatever `dense_vector` defaults are. + - do: + indices.create: + index: test-index-options-dense-no-bbq + body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: true + mappings: + properties: + semantic_field: + type: semantic_text + inference_id: dense-inference-id + + - do: + indices.get_mapping: + index: test-index-options-dense-no-bbq + + - not_exists: test-index-options-dense-no-bbq.mappings.properties.semantic_field.index_options + + - do: + indices.get_field_mapping: + index: test-index-options-dense-no-bbq + fields: semantic_field + include_defaults: true + + - not_exists: test-index-options-dense-no-bbq.mappings.semantic_field.mapping.semantic_field.index_options + + # Sparse embeddings models do not have index options for semantic_text in 8.19/9.1. + - do: + indices.create: + index: test-index-options-sparse + body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: true + mappings: + properties: + semantic_field: + type: semantic_text + inference_id: sparse-inference-id + + - do: + indices.get_mapping: + index: test-index-options-sparse + + - not_exists: test-index-options-sparse.mappings.properties.semantic_field.index_options + + - do: + indices.get_field_mapping: + index: test-index-options-sparse + fields: semantic_field + include_defaults: true + + - not_exists: test-index-options-sparse.mappings.semantic_field.mapping.semantic_field.index_options +