diff --git a/docs/changelog/135845.yaml b/docs/changelog/135845.yaml new file mode 100644 index 0000000000000..032a01a5b3542 --- /dev/null +++ b/docs/changelog/135845.yaml @@ -0,0 +1,5 @@ +pr: 135845 +summary: Fix for creating semantic_text fields on pre-8.11 indices crashing Elasticsearch +area: Mapping +type: bug +issues: [] diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapperTests.java index eed68d4c3ac0c..66b66c62d8e60 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapperTests.java @@ -67,7 +67,6 @@ import static org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase.randomNormalizedVector; import static org.elasticsearch.index.codec.vectors.diskbbq.ES920DiskBBQVectorsFormat.DYNAMIC_VISIT_RATIO; import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.DEFAULT_OVERSAMPLE; -import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.INDEXED_BY_DEFAULT_INDEX_VERSION; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.instanceOf; @@ -106,7 +105,7 @@ private void indexMapping(XContentBuilder b, IndexVersion indexVersion) throws I if (elementType != ElementType.FLOAT) { b.field("element_type", elementType.toString()); } - if (indexVersion.onOrAfter(INDEXED_BY_DEFAULT_INDEX_VERSION) || indexed) { + if (indexVersion.onOrAfter(DenseVectorFieldMapper.INDEXED_BY_DEFAULT_INDEX_VERSION) || indexed) { // Serialize if it's new index version, or it was not the default for previous indices b.field("index", indexed); } diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java index ddebc3938ea6a..1a6c455258f94 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java @@ -127,7 +127,7 @@ */ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFieldMapper { private static final Logger logger = LogManager.getLogger(SemanticTextFieldMapper.class); - public static final String UNSUPPORTED_INDEX_MESSAGE = "[semantic_text] is available on indices created with 8.11 or higher."; + public static final NodeFeature SEMANTIC_TEXT_IN_OBJECT_FIELD_FIX = new NodeFeature("semantic_text.in_object_field_fix"); public static final NodeFeature SEMANTIC_TEXT_SINGLE_FIELD_UPDATE_FIX = new NodeFeature("semantic_text.single_field_update_fix"); public static final NodeFeature SEMANTIC_TEXT_DELETE_FIX = new NodeFeature("semantic_text.delete_fix"); @@ -153,6 +153,12 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie public static final String CONTENT_TYPE = "semantic_text"; public static final String DEFAULT_ELSER_2_INFERENCE_ID = DEFAULT_ELSER_ID; + public static final String UNSUPPORTED_INDEX_MESSAGE = "[" + + CONTENT_TYPE + + "] is available on indices created with 8.11 or higher. Please create a new index to use [" + + CONTENT_TYPE + + "]"; + public static final float DEFAULT_RESCORE_OVERSAMPLE = 3.0f; static final String INDEX_OPTIONS_FIELD = "index_options"; @@ -166,9 +172,6 @@ public static final TypeParser parser(Supplier modelRegistry) { public static BiConsumer validateParserContext(String type) { return (n, c) -> { - if (c.getIndexSettings().getIndexVersionCreated().before(NEW_SPARSE_VECTOR)) { - throw new UnsupportedOperationException(UNSUPPORTED_INDEX_MESSAGE); - } if (InferenceMetadataFieldsMapper.isEnabled(c.getIndexSettings().getSettings()) == false) { notInMultiFields(type).accept(n, c); } @@ -588,16 +591,33 @@ SemanticTextField parseSemanticTextField(DocumentParserContext context) throws I if (parser.currentToken() == XContentParser.Token.VALUE_NULL) { return null; } + + SemanticTextField semanticTextField; boolean isWithinLeaf = context.path().isWithinLeafObject(); try { context.path().setWithinLeafObject(true); - return SemanticTextField.parse( + semanticTextField = SemanticTextField.parse( context.parser(), new SemanticTextField.ParserContext(fieldType().useLegacyFormat, fullPath(), context.parser().contentType()) ); } finally { context.path().setWithinLeafObject(isWithinLeaf); } + + IndexVersion indexCreatedVersion = context.indexSettings().getIndexVersionCreated(); + if (semanticTextField != null + && semanticTextField.inference().modelSettings() != null + && indexCreatedVersion.before(NEW_SPARSE_VECTOR)) { + // Explicitly fail to parse semantic text fields that meet the following criteria: + // - Are in pre 8.11 indices + // - Have model settings, indicating that they have embeddings to be indexed + // + // We can't fail earlier than this because it causes pre 8.11 indices with semantic text fields to either be in red state or + // cause Elasticsearch to not launch. + throw new UnsupportedOperationException(UNSUPPORTED_INDEX_MESSAGE); + } + + return semanticTextField; } void parseCreateFieldFromContext(DocumentParserContext context, SemanticTextField field, XContentLocation xContentLocation) @@ -1301,13 +1321,20 @@ private static void configureDenseVectorMapperBuilder( MinimalServiceSettings modelSettings, SemanticTextIndexOptions indexOptions ) { - SimilarityMeasure similarity = modelSettings.similarity(); - if (similarity != null) { - switch (similarity) { - case COSINE -> denseVectorMapperBuilder.similarity(DenseVectorFieldMapper.VectorSimilarity.COSINE); - case DOT_PRODUCT -> denseVectorMapperBuilder.similarity(DenseVectorFieldMapper.VectorSimilarity.DOT_PRODUCT); - case L2_NORM -> denseVectorMapperBuilder.similarity(DenseVectorFieldMapper.VectorSimilarity.L2_NORM); - default -> throw new IllegalArgumentException("Unknown similarity measure in model_settings [" + similarity.name() + "]"); + // Skip setting similarity on pre 8.11 indices. It causes dense vector field creation to fail because similarity can only be set + // on indexed fields, which is not done by default prior to 8.11. The fact that the dense vector field is partially configured is + // moot because we will explicitly fail to index docs into this semantic text field anyways. + if (indexVersionCreated.onOrAfter(NEW_SPARSE_VECTOR)) { + SimilarityMeasure similarity = modelSettings.similarity(); + if (similarity != null) { + switch (similarity) { + case COSINE -> denseVectorMapperBuilder.similarity(DenseVectorFieldMapper.VectorSimilarity.COSINE); + case DOT_PRODUCT -> denseVectorMapperBuilder.similarity(DenseVectorFieldMapper.VectorSimilarity.DOT_PRODUCT); + case L2_NORM -> denseVectorMapperBuilder.similarity(DenseVectorFieldMapper.VectorSimilarity.L2_NORM); + default -> throw new IllegalArgumentException( + "Unknown similarity measure in model_settings [" + similarity.name() + "]" + ); + } } } diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java index 838e4576716ff..98a5d93b1c85f 100644 --- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java +++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java @@ -415,57 +415,6 @@ public void testInvalidTaskTypes() { } } - @Override - protected IndexVersion boostNotAllowedIndexVersion() { - return IndexVersions.NEW_SPARSE_VECTOR; - } - - public void testOldIndexSemanticTextDenseVectorRaisesError() throws IOException { - final String fieldName = "field"; - final XContentBuilder fieldMapping = fieldMapping(b -> { - b.field("type", "semantic_text"); - b.field(INFERENCE_ID_FIELD, "test_inference_id"); - b.startObject("model_settings"); - b.field("task_type", "text_embedding"); - b.field("dimensions", 384); - b.field("similarity", "cosine"); - b.field("element_type", "float"); - b.endObject(); - }); - assertOldIndexUnsupported(fieldMapping); - } - - public void testOldIndexSemanticTextMinimalMappingRaisesError() throws IOException { - final XContentBuilder fieldMapping = fieldMapping(this::minimalMapping); - assertOldIndexUnsupported(fieldMapping); - } - - public void testOldIndexSemanticTextSparseVersionRaisesError() throws IOException { - final XContentBuilder fieldMapping = fieldMapping(b -> { - b.field("type", "semantic_text"); - b.field("inference_id", "another_inference_id"); - b.startObject("model_settings"); - b.field("task_type", "sparse_embedding"); - b.endObject(); - }); - assertOldIndexUnsupported(fieldMapping); - } - - private void assertOldIndexUnsupported(XContentBuilder fieldMapping) { - - MapperParsingException exception = assertThrows( - MapperParsingException.class, - () -> createMapperService( - fieldMapping, - true, - IndexVersions.V_8_0_0, - IndexVersionUtils.getPreviousVersion(IndexVersions.NEW_SPARSE_VECTOR) - ) - ); - assertTrue(exception.getMessage().contains(UNSUPPORTED_INDEX_MESSAGE)); - assertTrue(exception.getRootCause() instanceof UnsupportedOperationException); - } - public void testMultiFieldsSupport() throws IOException { if (useLegacyFormat) { Exception e = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { @@ -1265,6 +1214,99 @@ public void testModelSettingsRequiredWithChunks() throws IOException { assertThat(ex.getMessage(), containsString("[model_settings] must be set for field [field] when chunks are provided")); } + public void testPre811IndexSemanticTextDenseVectorRaisesError() throws IOException { + Model model = TestModel.createRandomInstance(TaskType.TEXT_EMBEDDING); + String fieldName = randomAlphaOfLength(8); + + MapperService mapperService = createMapperService( + mapping( + b -> b.startObject(fieldName).field("type", "semantic_text").field("inference_id", model.getInferenceEntityId()).endObject() + ), + true, + IndexVersions.V_8_0_0, + IndexVersionUtils.getPreviousVersion(IndexVersions.NEW_SPARSE_VECTOR) + ); + assertSemanticTextField(mapperService, fieldName, false, null, null); + + merge( + mapperService, + mapping( + b -> b.startObject(fieldName) + .field("type", "semantic_text") + .field("inference_id", model.getInferenceEntityId()) + .startObject("model_settings") + .field("task_type", TaskType.TEXT_EMBEDDING.toString()) + .field("dimensions", model.getServiceSettings().dimensions()) + .field("similarity", model.getServiceSettings().similarity()) + .field("element_type", model.getServiceSettings().elementType()) + .endObject() + .endObject() + ) + ); + assertSemanticTextField(mapperService, fieldName, true, null, null); + + DocumentMapper documentMapper = mapperService.documentMapper(); + DocumentParsingException e = assertThrows( + DocumentParsingException.class, + () -> documentMapper.parse( + source( + b -> addSemanticTextInferenceResults( + true, + b, + List.of(randomSemanticText(true, fieldName, model, null, List.of("foo", "bar"), XContentType.JSON)) + ) + ) + ) + ); + assertThat(e.getCause(), instanceOf(UnsupportedOperationException.class)); + assertThat(e.getCause().getMessage(), equalTo(UNSUPPORTED_INDEX_MESSAGE)); + } + + public void testPre811IndexSemanticTextSparseVectorRaisesError() throws IOException { + Model model = TestModel.createRandomInstance(TaskType.SPARSE_EMBEDDING); + String fieldName = randomAlphaOfLength(8); + + MapperService mapperService = createMapperService( + mapping( + b -> b.startObject(fieldName).field("type", "semantic_text").field("inference_id", model.getInferenceEntityId()).endObject() + ), + true, + IndexVersions.V_8_0_0, + IndexVersionUtils.getPreviousVersion(IndexVersions.NEW_SPARSE_VECTOR) + ); + assertSemanticTextField(mapperService, fieldName, false, null, null); + + merge( + mapperService, + mapping( + b -> b.startObject(fieldName) + .field("type", "semantic_text") + .field("inference_id", model.getInferenceEntityId()) + .startObject("model_settings") + .field("task_type", TaskType.SPARSE_EMBEDDING.toString()) + .endObject() + .endObject() + ) + ); + assertSemanticTextField(mapperService, fieldName, true, null, null); + + DocumentMapper documentMapper = mapperService.documentMapper(); + DocumentParsingException e = assertThrows( + DocumentParsingException.class, + () -> documentMapper.parse( + source( + b -> addSemanticTextInferenceResults( + true, + b, + List.of(randomSemanticText(true, fieldName, model, null, List.of("foo", "bar"), XContentType.JSON)) + ) + ) + ) + ); + assertThat(e.getCause(), instanceOf(UnsupportedOperationException.class)); + assertThat(e.getCause().getMessage(), equalTo(UNSUPPORTED_INDEX_MESSAGE)); + } + private MapperService mapperServiceForFieldWithModelSettings(String fieldName, String inferenceId, MinimalServiceSettings modelSettings) throws IOException { return mapperServiceForFieldWithModelSettings(fieldName, inferenceId, null, modelSettings);