From 2a878cfb200b37e258daf0bdc3b0a740d73a8c82 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Mon, 7 Apr 2025 14:59:54 -0400 Subject: [PATCH 01/32] add tests with normalizer mappings --- .../elasticsearch/index/mapper/MapperServiceTestCase.java | 7 ++++++- .../datasource/DefaultMappingParametersHandler.java | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/test/framework/src/main/java/org/elasticsearch/index/mapper/MapperServiceTestCase.java b/test/framework/src/main/java/org/elasticsearch/index/mapper/MapperServiceTestCase.java index b62e400826836..b197cad2e44a8 100644 --- a/test/framework/src/main/java/org/elasticsearch/index/mapper/MapperServiceTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/index/mapper/MapperServiceTestCase.java @@ -41,6 +41,7 @@ import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.analysis.AnalyzerScope; import org.elasticsearch.index.analysis.IndexAnalyzers; +import org.elasticsearch.index.analysis.LowercaseNormalizer; import org.elasticsearch.index.analysis.NameOrDefinition; import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.cache.bitset.BitsetFilterCache; @@ -132,7 +133,11 @@ protected IndexAnalyzers createIndexAnalyzers(IndexSettings indexSettings) { } protected static IndexAnalyzers createIndexAnalyzers() { - return IndexAnalyzers.of(Map.of("default", new NamedAnalyzer("default", AnalyzerScope.INDEX, new StandardAnalyzer()))); + return IndexAnalyzers.of( + Map.of("default", new NamedAnalyzer("default", AnalyzerScope.INDEX, new StandardAnalyzer())), + Map.of("lowercase", new NamedAnalyzer("lowercase", AnalyzerScope.INDEX, new LowercaseNormalizer())), + Map.of() + ); } protected static String randomIndexOptions() { diff --git a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/datasource/DefaultMappingParametersHandler.java b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/datasource/DefaultMappingParametersHandler.java index bf87285b9c730..1d35306413e6f 100644 --- a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/datasource/DefaultMappingParametersHandler.java +++ b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/datasource/DefaultMappingParametersHandler.java @@ -108,7 +108,8 @@ private Supplier> keywordMapping( if (ESTestCase.randomDouble() <= 0.2) { injected.put("null_value", ESTestCase.randomAlphaOfLengthBetween(0, 10)); } - + // NOCOMMIT - randomize this + injected.put("normalizer", "lowercase"); return injected; }; } From 0720fe732d90bf6eb3299d4c30532a424bd8795e Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Thu, 10 Apr 2025 09:34:53 -0400 Subject: [PATCH 02/32] fix KeywordFieldBlockLoaderTests --- .../index/mapper/KeywordFieldMapper.java | 3 +++ .../KeywordFieldBlockLoaderTests.java | 22 +++++++++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java index e31e291a425cd..562ed539ce357 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java @@ -1263,12 +1263,15 @@ private String originalName() { @Override protected SyntheticSourceSupport syntheticSourceSupport() { + /* if (hasNormalizer()) { // NOTE: no matter if we have doc values or not we use fallback synthetic source // to store the original value whose doc values would be altered by the normalizer return SyntheticSourceSupport.FALLBACK; } + */ + if (fieldType.stored() || hasDocValues) { return new SyntheticSourceSupport.Native(() -> syntheticFieldLoader(fullPath(), leafName())); } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/KeywordFieldBlockLoaderTests.java b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/KeywordFieldBlockLoaderTests.java index c183f5a13fb19..fbb784e3cfe7c 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/KeywordFieldBlockLoaderTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/KeywordFieldBlockLoaderTests.java @@ -28,21 +28,23 @@ public KeywordFieldBlockLoaderTests(Params params) { @SuppressWarnings("unchecked") @Override protected Object expected(Map fieldMapping, Object value, TestContext testContext) { - var nullValue = (String) fieldMapping.get("null_value"); + String nullValue = (String) fieldMapping.get("null_value"); - var ignoreAbove = fieldMapping.get("ignore_above") == null + int ignoreAbove = fieldMapping.get("ignore_above") == null ? Integer.MAX_VALUE : ((Number) fieldMapping.get("ignore_above")).intValue(); + String normalizerName = (String) fieldMapping.get("normalizer"); + if (value == null) { - return convert(null, nullValue, ignoreAbove); + return convert(null, nullValue, ignoreAbove, normalizerName); } if (value instanceof String s) { - return convert(s, nullValue, ignoreAbove); + return convert(s, nullValue, ignoreAbove, normalizerName); } - Function, Stream> convertValues = s -> s.map(v -> convert(v, nullValue, ignoreAbove)) + Function, Stream> convertValues = s -> s.map(v -> convert(v, nullValue, ignoreAbove, normalizerName)) .filter(Objects::nonNull); boolean hasDocValues = hasDocValues(fieldMapping, false); @@ -63,7 +65,7 @@ protected Object expected(Map fieldMapping, Object value, TestCo return maybeFoldList(resultList); } - private BytesRef convert(String value, String nullValue, int ignoreAbove) { + private BytesRef convert(String value, String nullValue, int ignoreAbove, String normalizer) { if (value == null) { if (nullValue != null) { value = nullValue; @@ -71,7 +73,13 @@ private BytesRef convert(String value, String nullValue, int ignoreAbove) { return null; } } - + if (Objects.equals(normalizer, "lowercase")) { + // hopefully not Turkish... + value = value.toLowerCase(); + } else if (normalizer != null) { + // we probably can't get here anyway, since MapperServiceTestCase only initializes the lowercase normalizer + throw new IllegalArgumentException("normalizer [" + normalizer + "] not supported for block loader tests"); + } return value.length() <= ignoreAbove ? new BytesRef(value) : null; } } From 625938f13b82d828caadfd091c5d7d5f96043445 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Thu, 10 Apr 2025 12:12:11 -0400 Subject: [PATCH 03/32] Update docs/changelog/126623.yaml --- docs/changelog/126623.yaml | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 docs/changelog/126623.yaml diff --git a/docs/changelog/126623.yaml b/docs/changelog/126623.yaml new file mode 100644 index 0000000000000..b659b46a955d5 --- /dev/null +++ b/docs/changelog/126623.yaml @@ -0,0 +1,7 @@ +pr: 126623 +summary: Enable synthetic source on normalized keyword mappings +area: Mapping +type: enhancement +issues: + - 124369 + - 121358 From e26e543cc97739d9af7f3bd2f19abfda55758ede Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Thu, 10 Apr 2025 16:27:45 -0400 Subject: [PATCH 04/32] proof of concept --- .../matchers/source/FieldSpecificMatcher.java | 10 +++++--- .../matchers/source/SourceMatcher.java | 24 +++++++++---------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java index 29d5c9cce6d47..f2f9b0d57aecc 100644 --- a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java +++ b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java @@ -341,6 +341,10 @@ Object convert(Object value, Object nullValue) { if (value == null) { return nullValue; } + // NOCOMMIT - this is excessively crude + if (value instanceof String s) { + value = s.toLowerCase(Locale.ROOT); + } return value; } @@ -631,10 +635,10 @@ public MatchResult match( Map actualMapping, Map expectedMapping ) { - var nullValue = getNullValue(actualMapping, expectedMapping); + Object nullValue = getNullValue(actualMapping, expectedMapping); - var expectedNormalized = normalize(expected, nullValue); - var actualNormalized = normalize(actual, nullValue); + Set expectedNormalized = normalize(expected, nullValue); + Set actualNormalized = normalize(actual, nullValue); return actualNormalized.equals(expectedNormalized) ? MatchResult.match() diff --git a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/SourceMatcher.java b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/SourceMatcher.java index 7e5b63e82525b..65142610145fb 100644 --- a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/SourceMatcher.java +++ b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/SourceMatcher.java @@ -76,10 +76,10 @@ public MatchResult match() { var sortedAndFlattenedExpected = expected.stream().map(s -> SourceTransforms.normalize(s, mappingLookup)).toList(); for (int i = 0; i < sortedAndFlattenedActual.size(); i++) { - var actual = sortedAndFlattenedActual.get(i); - var expected = sortedAndFlattenedExpected.get(i); + Map> actual = sortedAndFlattenedActual.get(i); + Map> expected = sortedAndFlattenedExpected.get(i); - var result = compareSource(actual, expected); + MatchResult result = compareSource(actual, expected); if (result.isMatch() == false) { var message = "Source matching failed at document id [" + i + "]. " + result.getMessage(); return MatchResult.noMatch(message); @@ -90,15 +90,15 @@ public MatchResult match() { } private MatchResult compareSource(Map> actual, Map> expected) { - for (var expectedFieldEntry : expected.entrySet()) { - var name = expectedFieldEntry.getKey(); + for (Map.Entry> expectedFieldEntry : expected.entrySet()) { + String name = expectedFieldEntry.getKey(); - var actualValues = actual.get(name); - var expectedValues = expectedFieldEntry.getValue(); + List actualValues = actual.get(name); + List expectedValues = expectedFieldEntry.getValue(); - var matchIncludingFieldSpecificMatchers = matchWithFieldSpecificMatcher(name, actualValues, expectedValues); + MatchResult matchIncludingFieldSpecificMatchers = matchWithFieldSpecificMatcher(name, actualValues, expectedValues); if (matchIncludingFieldSpecificMatchers.isMatch() == false) { - var message = "Source documents don't match for field [" + name + "]: " + matchIncludingFieldSpecificMatchers.getMessage(); + String message = "Source documents don't match for field [" + name + "]: " + matchIncludingFieldSpecificMatchers.getMessage(); return MatchResult.noMatch(message); } } @@ -106,7 +106,7 @@ private MatchResult compareSource(Map> actual, Map actualValues, List expectedValues) { - var actualFieldMapping = actualNormalizedMapping.get(fieldName); + MappingTransforms.FieldMapping actualFieldMapping = actualNormalizedMapping.get(fieldName); if (actualFieldMapping == null) { if (expectedNormalizedMapping.get(fieldName) != null // Special cases due to fields being defined in default mapping for logsdb index mode @@ -126,7 +126,7 @@ private MatchResult matchWithFieldSpecificMatcher(String fieldName, List throw new IllegalStateException("Field type is missing from leaf field Leaf field [" + fieldName + "] mapping parameters"); } - var expectedFieldMapping = expectedNormalizedMapping.get(fieldName); + MappingTransforms.FieldMapping expectedFieldMapping = expectedNormalizedMapping.get(fieldName); if (expectedFieldMapping == null) { throw new IllegalStateException("Leaf field [" + fieldName + "] is present in actual mapping but absent in expected mapping"); } else { @@ -144,7 +144,7 @@ private MatchResult matchWithFieldSpecificMatcher(String fieldName, List } } - var fieldSpecificMatcher = fieldSpecificMatchers.get(actualFieldType); + FieldSpecificMatcher fieldSpecificMatcher = fieldSpecificMatchers.get(actualFieldType); assert fieldSpecificMatcher != null : "Missing matcher for field type [" + actualFieldType + "]"; return fieldSpecificMatcher.match( From c8a002597d2cf6e95515c7619edbb4ccf6449e0c Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Thu, 10 Apr 2025 17:39:59 -0400 Subject: [PATCH 05/32] better, but not done --- .../matchers/source/FieldSpecificMatcher.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java index f2f9b0d57aecc..8f68d2caa8be4 100644 --- a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java +++ b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java @@ -327,6 +327,8 @@ Object convert(Object value, Object nullValue) { } class KeywordMatcher extends GenericMappingAwareMatcher { + String normalizer; + KeywordMatcher( XContentBuilder actualMappings, Settings.Builder actualSettings, @@ -336,13 +338,18 @@ class KeywordMatcher extends GenericMappingAwareMatcher { super("keyword", actualMappings, actualSettings, expectedMappings, expectedSettings); } + @Override + public MatchResult match(List actual, List expected, Map actualMapping, Map expectedMapping) { + this.normalizer = (String) FieldSpecificMatcher.getMappingParameter("normalizer", actualMapping, expectedMapping); + return super.match(actual, expected, actualMapping, expectedMapping); + } + @Override Object convert(Object value, Object nullValue) { if (value == null) { return nullValue; } - // NOCOMMIT - this is excessively crude - if (value instanceof String s) { + if (value instanceof String s && this.normalizer != null && this.normalizer.equals("lowercase")) { value = s.toLowerCase(Locale.ROOT); } From 6b20735af5ef9394c24aaf46d9f08dfab546fab6 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Thu, 10 Apr 2025 21:49:46 +0000 Subject: [PATCH 06/32] [CI] Auto commit changes from spotless --- .../matchers/source/FieldSpecificMatcher.java | 7 ++++++- .../datageneration/matchers/source/SourceMatcher.java | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java index 8f68d2caa8be4..d3381b32b1394 100644 --- a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java +++ b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java @@ -339,7 +339,12 @@ class KeywordMatcher extends GenericMappingAwareMatcher { } @Override - public MatchResult match(List actual, List expected, Map actualMapping, Map expectedMapping) { + public MatchResult match( + List actual, + List expected, + Map actualMapping, + Map expectedMapping + ) { this.normalizer = (String) FieldSpecificMatcher.getMappingParameter("normalizer", actualMapping, expectedMapping); return super.match(actual, expected, actualMapping, expectedMapping); } diff --git a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/SourceMatcher.java b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/SourceMatcher.java index 65142610145fb..398967f9ea632 100644 --- a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/SourceMatcher.java +++ b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/SourceMatcher.java @@ -98,7 +98,10 @@ private MatchResult compareSource(Map> actual, Map Date: Fri, 11 Apr 2025 10:38:04 -0400 Subject: [PATCH 07/32] account for normalizing null values --- .../mapper/blockloader/KeywordFieldBlockLoaderTests.java | 3 ++- .../datageneration/matchers/source/FieldSpecificMatcher.java | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/KeywordFieldBlockLoaderTests.java b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/KeywordFieldBlockLoaderTests.java index fbb784e3cfe7c..a77f4274d5e7d 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/KeywordFieldBlockLoaderTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/KeywordFieldBlockLoaderTests.java @@ -15,6 +15,7 @@ import org.elasticsearch.logsdb.datageneration.FieldType; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.function.Function; @@ -75,7 +76,7 @@ private BytesRef convert(String value, String nullValue, int ignoreAbove, String } if (Objects.equals(normalizer, "lowercase")) { // hopefully not Turkish... - value = value.toLowerCase(); + value = value.toLowerCase(Locale.ROOT); } else if (normalizer != null) { // we probably can't get here anyway, since MapperServiceTestCase only initializes the lowercase normalizer throw new IllegalArgumentException("normalizer [" + normalizer + "] not supported for block loader tests"); diff --git a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java index 8f68d2caa8be4..0d2c8a9543870 100644 --- a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java +++ b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java @@ -347,9 +347,11 @@ public MatchResult match(List actual, List expected, Map Date: Fri, 11 Apr 2025 10:45:47 -0400 Subject: [PATCH 08/32] remove code I'd only commened out --- .../elasticsearch/index/mapper/KeywordFieldMapper.java | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java index 562ed539ce357..aef65643fce6d 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java @@ -1263,13 +1263,8 @@ private String originalName() { @Override protected SyntheticSourceSupport syntheticSourceSupport() { - /* - if (hasNormalizer()) { - // NOTE: no matter if we have doc values or not we use fallback synthetic source - // to store the original value whose doc values would be altered by the normalizer - return SyntheticSourceSupport.FALLBACK; - } - + /* NOTE: we allow enabling synthetic source on Keyword fields with a Normalizer, even though the returned synthetic value + may not perfectly match the original, pre-normalization, value. */ if (fieldType.stored() || hasDocValues) { From 949f42fe11fa89a1546e25d5806018d77ce688b9 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Fri, 11 Apr 2025 13:17:54 -0400 Subject: [PATCH 09/32] fix yaml tests --- .../rest-api-spec/test/mget/90_synthetic_source.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml index 8485aba0ecc6a..949d7db422aa0 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml @@ -118,7 +118,7 @@ keyword with normalizer: - match: { docs.0._id: "1" } - match: docs.0._source: - keyword: "the Quick Brown Fox jumps over the lazy Dog" + keyword: "the quick brown fox jumps over the lazy dog" keyword_with_ignore_above: "the Quick Brown Fox jumps over the lazy Dog" keyword_without_doc_values: "the Quick Brown Fox jumps over the lazy Dog" @@ -126,7 +126,7 @@ keyword with normalizer: - match: { docs.1._id: "2" } - match: docs.1._source: - keyword: "The five BOXING wizards jump Quickly" + keyword: "the five boxing wizards jump quickly" keyword_with_ignore_above: "The five BOXING wizards jump Quickly" keyword_without_doc_values: "The five BOXING wizards jump Quickly" @@ -134,7 +134,7 @@ keyword with normalizer: - match: { docs.2._id: "3" } - match: docs.2._source: - keyword: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ] + keyword: [ "do or do not, there is no try", "may the force be with you!" ] keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ] keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ] From 0ed6a2b57c0f44bb6c26de9c946116c2a68920fc Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Tue, 15 Apr 2025 11:50:28 -0400 Subject: [PATCH 10/32] add test for opting out via keep all --- .../test/mget/90_synthetic_source.yml | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml index 949d7db422aa0..c57e1b25e1a91 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml @@ -138,6 +138,93 @@ keyword with normalizer: keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ] keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ] +--- +keyword with normalizer, source keep mode all: + - do: + indices.create: + index: test-keyword-with-normalizer + body: + settings: + analysis: + normalizer: + lowercase: + type: custom + filter: + - lowercase + index: + mapping.source.mode: synthetic + + mappings: + properties: + keyword: + type: keyword + normalizer: lowercase + synthetic_source_keep: all + keyword_with_ignore_above: + type: keyword + normalizer: lowercase + ignore_above: 10 + keyword_without_doc_values: + type: keyword + normalizer: lowercase + doc_values: false + + - do: + index: + index: test-keyword-with-normalizer + id: 1 + body: + keyword: "the Quick Brown Fox jumps over the lazy Dog" + keyword_with_ignore_above: "the Quick Brown Fox jumps over the lazy Dog" + keyword_without_doc_values: "the Quick Brown Fox jumps over the lazy Dog" + + - do: + index: + index: test-keyword-with-normalizer + id: 2 + body: + keyword: "The five BOXING wizards jump Quickly" + keyword_with_ignore_above: "The five BOXING wizards jump Quickly" + keyword_without_doc_values: "The five BOXING wizards jump Quickly" + + - do: + index: + index: test-keyword-with-normalizer + id: 3 + body: + keyword: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ] + keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ] + keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ] + + - do: + mget: + index: test-keyword-with-normalizer + body: + ids: [ 1, 2, 3 ] + - match: { docs.0._index: "test-keyword-with-normalizer" } + - match: { docs.0._id: "1" } + - match: + docs.0._source: + keyword: "the Quick Brown Fox jumps over the lazy Dog" + keyword_with_ignore_above: "the Quick Brown Fox jumps over the lazy Dog" + keyword_without_doc_values: "the Quick Brown Fox jumps over the lazy Dog" + + - match: { docs.1._index: "test-keyword-with-normalizer" } + - match: { docs.1._id: "2" } + - match: + docs.1._source: + keyword: "The five BOXING wizards jump Quickly" + keyword_with_ignore_above: "The five BOXING wizards jump Quickly" + keyword_without_doc_values: "The five BOXING wizards jump Quickly" + + - match: { docs.2._index: "test-keyword-with-normalizer" } + - match: { docs.2._id: "3" } + - match: + docs.2._source: + keyword: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ] + keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ] + keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ] + --- stored text: - requires: From 8d2969589401a51c7059e9358a6fc0790a67e9a1 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Tue, 27 May 2025 10:13:15 -0400 Subject: [PATCH 11/32] Update docs/changelog/126623.yaml --- docs/changelog/126623.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/changelog/126623.yaml b/docs/changelog/126623.yaml index b659b46a955d5..6a9abd18f63ec 100644 --- a/docs/changelog/126623.yaml +++ b/docs/changelog/126623.yaml @@ -1,7 +1,14 @@ pr: 126623 summary: Enable synthetic source on normalized keyword mappings area: Mapping -type: enhancement +type: "breaking, enhancement" issues: - 124369 - 121358 +breaking: + title: Enable synthetic source on normalized keyword mappings + area: Mapping + details: Please describe the details of this change for the release notes. You can + use asciidoc. + impact: Please describe the impact of this change to users + notable: false From e1f595829e737f89bda090b43c4ff32068fcd923 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Tue, 27 May 2025 13:52:42 -0400 Subject: [PATCH 12/32] fix merge semantic confilct --- .../datasource/DefaultMappingParametersHandler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/framework/src/main/java/org/elasticsearch/datageneration/datasource/DefaultMappingParametersHandler.java b/test/framework/src/main/java/org/elasticsearch/datageneration/datasource/DefaultMappingParametersHandler.java index 31e035a985457..7a853f8744826 100644 --- a/test/framework/src/main/java/org/elasticsearch/datageneration/datasource/DefaultMappingParametersHandler.java +++ b/test/framework/src/main/java/org/elasticsearch/datageneration/datasource/DefaultMappingParametersHandler.java @@ -103,7 +103,7 @@ private Supplier> keywordMapping(DataSourceRequest.LeafMappi mapping.put("null_value", ESTestCase.randomAlphaOfLengthBetween(0, 10)); } // NOCOMMIT - randomize this - injected.put("normalizer", "lowercase"); + mapping.put("normalizer", "lowercase"); return mapping; }; } From 70dd8e7c897553ddf35bdb6bc81d016b201ff333 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Tue, 27 May 2025 14:11:51 -0400 Subject: [PATCH 13/32] changelog --- docs/changelog/126623.yaml | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/docs/changelog/126623.yaml b/docs/changelog/126623.yaml index 6a9abd18f63ec..3564e286b2f0b 100644 --- a/docs/changelog/126623.yaml +++ b/docs/changelog/126623.yaml @@ -1,14 +1,24 @@ pr: 126623 summary: Enable synthetic source on normalized keyword mappings area: Mapping -type: "breaking, enhancement" +type: "breaking" issues: - 124369 - 121358 breaking: title: Enable synthetic source on normalized keyword mappings area: Mapping - details: Please describe the details of this change for the release notes. You can - use asciidoc. - impact: Please describe the impact of this change to users + details: This changes the default behavior for Synthetic Source on + keyword fields using normalizers. Prior to this change, normalized + keywords were always stored to allow returning the non-normalized + values. Under this change, such field will NOT be stored (i.e + they will be synthesized from the index when returning source, + like all other synthetic source fields). This should result in + considerable space improvement for this use case. + + Users can opt out of this behavior on a per-field basis by setting + `synthetic_source_keep: all` on the field. + + impact: By default, normalized keyword fields in synthetic source + indices will no longer return the non-normalized value in the source. notable: false From 0984db01016090d516bfde7c60c6170700bd04b2 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Wed, 28 May 2025 11:25:16 -0400 Subject: [PATCH 14/32] add test feature to gate the tests --- .../test/mget/90_synthetic_source.yml | 60 ++++++++++--------- .../index/mapper/KeywordFieldMapper.java | 5 ++ .../index/mapper/MapperFeatures.java | 3 +- 3 files changed, 39 insertions(+), 29 deletions(-) diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml index c57e1b25e1a91..4f22893619106 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml @@ -6,7 +6,7 @@ setup: --- keyword: - requires: - cluster_features: ["gte_v8.4.0"] + cluster_features: [ "gte_v8.4.0" ] reason: introduced in 8.4.0 - do: @@ -23,15 +23,15 @@ keyword: - do: index: - index: test - id: 1 + index: test + id: 1 body: kwd: foo - do: index: - index: test - id: 2 + index: test + id: 2 body: kwd: bar @@ -40,21 +40,25 @@ keyword: mget: index: test body: - ids: [1, 2] - - match: {docs.0._index: "test"} - - match: {docs.0._id: "1"} + ids: [ 1, 2 ] + - match: { docs.0._index: "test" } + - match: { docs.0._id: "1" } - match: docs.0._source: kwd: foo - - match: {docs.1._index: "test"} - - match: {docs.1._id: "2"} + - match: { docs.1._index: "test" } + - match: { docs.1._id: "2" } - match: docs.1._source: kwd: bar --- keyword with normalizer: + - requires: + - cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" + - reason: "Behavior changed in #126623" + - do: indices.create: index: test-keyword-with-normalizer @@ -113,7 +117,7 @@ keyword with normalizer: mget: index: test-keyword-with-normalizer body: - ids: [ 1, 2, 3 ] + ids: [ 1, 2, 3 ] - match: { docs.0._index: "test-keyword-with-normalizer" } - match: { docs.0._id: "1" } - match: @@ -200,7 +204,7 @@ keyword with normalizer, source keep mode all: mget: index: test-keyword-with-normalizer body: - ids: [ 1, 2, 3 ] + ids: [ 1, 2, 3 ] - match: { docs.0._index: "test-keyword-with-normalizer" } - match: { docs.0._id: "1" } - match: @@ -228,7 +232,7 @@ keyword with normalizer, source keep mode all: --- stored text: - requires: - cluster_features: ["gte_v8.5.0"] + cluster_features: [ "gte_v8.5.0" ] reason: introduced in 8.5.0 - do: @@ -246,15 +250,15 @@ stored text: - do: index: - index: test - id: 1 + index: test + id: 1 body: text: the quick brown fox - do: index: - index: test - id: 2 + index: test + id: 2 body: text: jumped over the lazy dog @@ -262,15 +266,15 @@ stored text: mget: index: test body: - ids: [1, 2] - - match: {docs.0._index: "test"} - - match: {docs.0._id: "1"} + ids: [ 1, 2 ] + - match: { docs.0._index: "test" } + - match: { docs.0._id: "1" } - match: docs.0._source: text: the quick brown fox - - match: {docs.1._index: "test"} - - match: {docs.1._id: "2"} + - match: { docs.1._index: "test" } + - match: { docs.1._id: "2" } - match: docs.1._source: text: jumped over the lazy dog @@ -278,7 +282,7 @@ stored text: --- force_synthetic_source_ok: - requires: - cluster_features: ["gte_v8.4.0"] + cluster_features: [ "gte_v8.4.0" ] reason: introduced in 8.4.0 - do: @@ -297,15 +301,15 @@ force_synthetic_source_ok: - do: index: - index: test - id: 1 + index: test + id: 1 body: obj.kwd: foo - do: index: - index: test - id: 2 + index: test + id: 2 body: obj: kwd: bar @@ -315,7 +319,7 @@ force_synthetic_source_ok: mget: index: test body: - ids: [1, 2] + ids: [ 1, 2 ] - match: docs.0._source: obj.kwd: foo diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java index d278678848e4a..d63985d917ea5 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java @@ -39,6 +39,7 @@ import org.elasticsearch.common.lucene.search.AutomatonQueries; import org.elasticsearch.common.unit.Fuzziness; import org.elasticsearch.core.Nullable; +import org.elasticsearch.features.NodeFeature; import org.elasticsearch.index.IndexMode; import org.elasticsearch.index.IndexSortConfig; import org.elasticsearch.index.IndexVersion; @@ -92,6 +93,10 @@ */ public final class KeywordFieldMapper extends FieldMapper { + public static final NodeFeature KEYWORD_NORMALIZER_SYNTHETIC_SOURCE = new NodeFeature( + "mapper.keyword.keyword_normalizer_synthetic_source" + ); + private static final Logger logger = LogManager.getLogger(KeywordFieldMapper.class); public static final String CONTENT_TYPE = "keyword"; diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java b/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java index 18ae1fa802df6..5b0e8825cff21 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java @@ -68,7 +68,8 @@ public Set getTestFeatures() { DateFieldMapper.INVALID_DATE_FIX, NPE_ON_DIMS_UPDATE_FIX, RESCORE_ZERO_VECTOR_QUANTIZED_VECTOR_MAPPING, - USE_DEFAULT_OVERSAMPLE_VALUE_FOR_BBQ + USE_DEFAULT_OVERSAMPLE_VALUE_FOR_BBQ, + KeywordFieldMapper.KEYWORD_NORMALIZER_SYNTHETIC_SOURCE ); } } From 63338d720a4c25dc4d198788e1a127515ed9a2f4 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Wed, 28 May 2025 11:45:44 -0400 Subject: [PATCH 15/32] fix changelog, again --- docs/changelog/126623.yaml | 2 +- .../resources/rest-api-spec/test/mget/90_synthetic_source.yml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/changelog/126623.yaml b/docs/changelog/126623.yaml index 3564e286b2f0b..d51b66b86078a 100644 --- a/docs/changelog/126623.yaml +++ b/docs/changelog/126623.yaml @@ -17,7 +17,7 @@ breaking: considerable space improvement for this use case. Users can opt out of this behavior on a per-field basis by setting - `synthetic_source_keep: all` on the field. + `synthetic_source_keep` to `all` on the field. impact: By default, normalized keyword fields in synthetic source indices will no longer return the non-normalized value in the source. diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml index 4f22893619106..127fb197c2818 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml @@ -58,7 +58,6 @@ keyword with normalizer: - requires: - cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" - reason: "Behavior changed in #126623" - - do: indices.create: index: test-keyword-with-normalizer From 8c91da5c5432aeb645f74a47b666132365a85a7f Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Wed, 28 May 2025 12:27:10 -0400 Subject: [PATCH 16/32] fix yaml --- .../resources/rest-api-spec/test/mget/90_synthetic_source.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml index 127fb197c2818..f63d867e1fff5 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml @@ -56,8 +56,8 @@ keyword: --- keyword with normalizer: - requires: - - cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" - - reason: "Behavior changed in #126623" + cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" + reason: "Behavior changed in #126623" - do: indices.create: index: test-keyword-with-normalizer From 9110abea94260085ac03ae200606cac5b9e5cbf5 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Wed, 28 May 2025 14:01:22 -0400 Subject: [PATCH 17/32] remove no commit --- .../datasource/DefaultMappingParametersHandler.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/framework/src/main/java/org/elasticsearch/datageneration/datasource/DefaultMappingParametersHandler.java b/test/framework/src/main/java/org/elasticsearch/datageneration/datasource/DefaultMappingParametersHandler.java index 7a853f8744826..3cb5c9be701f4 100644 --- a/test/framework/src/main/java/org/elasticsearch/datageneration/datasource/DefaultMappingParametersHandler.java +++ b/test/framework/src/main/java/org/elasticsearch/datageneration/datasource/DefaultMappingParametersHandler.java @@ -102,8 +102,9 @@ private Supplier> keywordMapping(DataSourceRequest.LeafMappi if (ESTestCase.randomDouble() <= 0.2) { mapping.put("null_value", ESTestCase.randomAlphaOfLengthBetween(0, 10)); } - // NOCOMMIT - randomize this - mapping.put("normalizer", "lowercase"); + if (ESTestCase.randomBoolean()) { + mapping.put("normalizer", "lowercase"); + } return mapping; }; } From c7472197f544028ae09fff868ee9c780dad3d312 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Wed, 28 May 2025 16:18:41 -0400 Subject: [PATCH 18/32] skip the rest compatbility test for the breaking change --- rest-api-spec/build.gradle | 1 + 1 file changed, 1 insertion(+) diff --git a/rest-api-spec/build.gradle b/rest-api-spec/build.gradle index 8ae4999647fe1..6d5a48a98adbd 100644 --- a/rest-api-spec/build.gradle +++ b/rest-api-spec/build.gradle @@ -89,4 +89,5 @@ tasks.named("yamlRestCompatTestTransform").configure ({ task -> task.skipTest("indices.create/21_synthetic_source_stored/field param - keep root array", "Synthetic source keep arrays now stores leaf arrays natively") task.skipTest("cluster.info/30_info_thread_pool/Cluster HTTP Info", "The search_throttled thread pool has been removed") task.skipTest("synonyms/80_synonyms_from_index/Fail loading synonyms from index if synonyms_set doesn't exist", "Synonyms do no longer fail if the synonyms_set doesn't exist") + task.skipTest("mget/90_synthetic_source/keyword with normalizer", "Normalized keywords now use synthetic source") }) From 35153c1029df84e8f33429671ee60cf6840dab56 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Tue, 3 Jun 2025 11:05:15 -0400 Subject: [PATCH 19/32] unvar --- .../blockloader/TextFieldWithParentBlockLoaderTests.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldWithParentBlockLoaderTests.java b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldWithParentBlockLoaderTests.java index 6343aeea2d9de..b985b4583677b 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldWithParentBlockLoaderTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldWithParentBlockLoaderTests.java @@ -11,6 +11,7 @@ import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; +import org.elasticsearch.datageneration.DataGeneratorSpecification; import org.elasticsearch.datageneration.DocumentGenerator; import org.elasticsearch.datageneration.FieldType; import org.elasticsearch.datageneration.MappingGenerator; @@ -49,7 +50,7 @@ public TextFieldWithParentBlockLoaderTests(BlockLoaderTestCase.Params params) { // of text multi field in a keyword field. public void testBlockLoaderOfParentField() throws IOException { var template = new Template(Map.of("parent", new Template.Leaf("parent", FieldType.KEYWORD.toString()))); - var specification = buildSpecification(List.of(new DataSourceHandler() { + DataGeneratorSpecification specification = buildSpecification(List.of(new DataSourceHandler() { @Override public DataSourceResponse.LeafMappingParametersGenerator handle(DataSourceRequest.LeafMappingParametersGenerator request) { // This is a bit tricky meta-logic. @@ -101,7 +102,7 @@ public DataSourceResponse.LeafMappingParametersGenerator handle(DataSourceReques var fieldValue = document.get("parent"); Object expected = expected(fieldMapping, fieldValue, new BlockLoaderTestCase.TestContext(false, true)); - var mappingXContent = XContentBuilder.builder(XContentType.JSON.xContent()).map(mapping.raw()); + XContentBuilder mappingXContent = XContentBuilder.builder(XContentType.JSON.xContent()).map(mapping.raw()); var mapperService = params.syntheticSource() ? createSytheticSourceMapperService(mappingXContent) : createMapperService(mappingXContent); From aed9477e6540fbc78a3ece3a8a1444b3a3fa27f9 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Wed, 4 Jun 2025 09:05:36 -0400 Subject: [PATCH 20/32] Feedback from MVG on fields with both keyword and text subfields --- .../elasticsearch/index/mapper/TextFieldMapper.java | 11 ++++++----- .../mapper/blockloader/TextFieldBlockLoaderTests.java | 3 +-- .../TextFieldWithParentBlockLoaderTests.java | 9 +++++++-- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java index 08cef586e1438..64cc2b7f77446 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java @@ -1008,17 +1008,21 @@ protected String delegatingTo() { } }; } + if (isStored()) { + return new BlockStoredFieldsReader.BytesFromStringsBlockLoader(name()); + } /* * If this is a sub-text field try and return the parent's loader. Text * fields will always be slow to load and if the parent is exact then we * should use that instead. */ + // TODO: should this be removed? I think SyntheticSourceHelper already does this: String parentField = blContext.parentField(name()); if (parentField != null) { MappedFieldType parent = blContext.lookup().fieldType(parentField); if (parent.typeName().equals(KeywordFieldMapper.CONTENT_TYPE)) { KeywordFieldMapper.KeywordFieldType kwd = (KeywordFieldMapper.KeywordFieldType) parent; - if (kwd.hasNormalizer() == false && (kwd.hasDocValues() || kwd.isStored())) { + if (kwd.hasDocValues() || kwd.isStored()) { return new BlockLoader.Delegating(kwd.blockLoader(blContext)) { @Override protected String delegatingTo() { @@ -1028,9 +1032,6 @@ protected String delegatingTo() { } } } - if (isStored()) { - return new BlockStoredFieldsReader.BytesFromStringsBlockLoader(name()); - } // _ignored_source field will contain entries for this field if it is not stored // and there is no syntheticSourceDelegate. @@ -1579,7 +1580,7 @@ public static KeywordFieldMapper getKeywordFieldMapperForSyntheticSource(Iterabl for (Mapper sub : multiFields) { if (sub.typeName().equals(KeywordFieldMapper.CONTENT_TYPE)) { KeywordFieldMapper kwd = (KeywordFieldMapper) sub; - if (kwd.hasNormalizer() == false && (kwd.fieldType().hasDocValues() || kwd.fieldType().isStored())) { + if (kwd.fieldType().hasDocValues() || kwd.fieldType().isStored()) { return kwd; } } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldBlockLoaderTests.java b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldBlockLoaderTests.java index 77c42740451ee..99b98bbda606a 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldBlockLoaderTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldBlockLoaderTests.java @@ -39,14 +39,13 @@ public static Object expectedValue(Map fieldMapping, Object valu var fields = (Map) fieldMapping.get("fields"); if (fields != null) { var keywordMultiFieldMapping = (Map) fields.get("kwd"); - Object normalizer = fields.get("normalizer"); boolean docValues = hasDocValues(keywordMultiFieldMapping, true); boolean store = keywordMultiFieldMapping.getOrDefault("store", false).equals(true); Object ignoreAbove = keywordMultiFieldMapping.get("ignore_above"); // See TextFieldMapper.SyntheticSourceHelper#getKeywordFieldMapperForSyntheticSource // and TextFieldMapper#canUseSyntheticSourceDelegateForLoading(). - boolean usingSyntheticSourceDelegate = normalizer == null && (docValues || store); + boolean usingSyntheticSourceDelegate = docValues || store; boolean canUseSyntheticSourceDelegateForLoading = usingSyntheticSourceDelegate && ignoreAbove == null; if (canUseSyntheticSourceDelegateForLoading) { return KeywordFieldBlockLoaderTests.expectedValue(keywordMultiFieldMapping, value, params, testContext); diff --git a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldWithParentBlockLoaderTests.java b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldWithParentBlockLoaderTests.java index b985b4583677b..df0dbe6a3b08f 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldWithParentBlockLoaderTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldWithParentBlockLoaderTests.java @@ -118,13 +118,18 @@ private Object expected(Map fieldMapping, Object value, BlockLoa boolean docValues = hasDocValues(fieldMapping, true); boolean store = fieldMapping.getOrDefault("store", false).equals(true); - if (normalizer == null && (docValues || store)) { + // if text sub field is stored, then always use that: + var textFieldMapping = (Map) ((Map) fieldMapping.get("fields")).get("mf"); + if (textFieldMapping.getOrDefault("store", false).equals(true)) { + return TextFieldBlockLoaderTests.expectedValue(textFieldMapping, value, params, testContext); + } + + if (docValues || store) { // we are using block loader of the parent field return KeywordFieldBlockLoaderTests.expectedValue(fieldMapping, value, params, testContext); } // we are using block loader of the text field itself - var textFieldMapping = (Map) ((Map) fieldMapping.get("fields")).get("mf"); return TextFieldBlockLoaderTests.expectedValue(textFieldMapping, value, params, testContext); } } From 270f066d49507cae1be2a31cd44fcd684828154b Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Wed, 4 Jun 2025 14:49:04 -0400 Subject: [PATCH 21/32] fix exact subfield tests --- .../test/esql/81_text_exact_subfields.yml | 101 ++++++++++++------ 1 file changed, 67 insertions(+), 34 deletions(-) diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/esql/81_text_exact_subfields.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/esql/81_text_exact_subfields.yml index 72e5f4728edc8..f56a2ad1c4546 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/esql/81_text_exact_subfields.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/esql/81_text_exact_subfields.yml @@ -1,7 +1,7 @@ --- setup: - requires: - cluster_features: ["gte_v8.13.0"] + cluster_features: [ "gte_v8.13.0" ] reason: "Exact subfields fixed in v 8.13" test_runner_features: allowed_warnings_regex - do: @@ -45,12 +45,15 @@ setup: refresh: true body: - { "index": { } } - - { "emp_no": 10, "text_ignore_above":"this is a long text", "text_normalizer": "CamelCase", "non_indexed": "foo"} + - { "emp_no": 10, "text_ignore_above": "this is a long text", "text_normalizer": "CamelCase", "non_indexed": "foo" } - { "index": { } } - - { "emp_no": 20, "text_ignore_above":"this", "text_normalizer": "abc", "non_indexed": "bar"} + - { "emp_no": 20, "text_ignore_above": "this", "text_normalizer": "abc", "non_indexed": "bar" } --- "extract": + - requires: + cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" + reason: "Behavior changed in #126623" - do: allowed_warnings_regex: - "No limit defined, adding default limit of \\[.*\\]" @@ -72,11 +75,14 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } - - match: { values.1: [ "this", "this", "abc", "abc", "bar", "bar" ]} + - match: { values.0: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } + - match: { values.1: [ "this", "this", "abc", "abc", "bar", "bar" ] } --- "filter ignore above": + - requires: + cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" + reason: "Behavior changed in #126623" - do: allowed_warnings_regex: - "No limit defined, adding default limit of \\[.*\\]" @@ -98,7 +104,7 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 1 } - - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ]} + - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ] } - do: allowed_warnings_regex: @@ -121,7 +127,7 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 1 } - - match: { values.0: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } + - match: { values.0: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } - do: @@ -191,10 +197,13 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 1 } - - match: { values.0: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } + - match: { values.0: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } --- -"filter with normalizer": +"filter with normalizer 1": + - requires: + cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" + reason: "Behavior changed in #126623" - do: allowed_warnings_regex: - "No limit defined, adding default limit of \\[.*\\]" @@ -216,15 +225,19 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 1 } - - match: { values.0: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } - + - match: { values.0: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } +--- +"filter with normalizer 2": + - requires: + cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" + reason: "Behavior changed in #126623" - do: allowed_warnings_regex: - "No limit defined, adding default limit of \\[.*\\]" esql.query: body: - query: 'from test | where text_normalizer == text_normalizer.raw | keep text_ignore_above, text_ignore_above.raw, text_normalizer, text_normalizer.raw, non_indexed, non_indexed.raw' + query: 'from test | where text_normalizer == text_normalizer.raw | SORT emp_no | keep text_ignore_above, text_ignore_above.raw, text_normalizer, text_normalizer.raw, non_indexed, non_indexed.raw' - match: { columns.0.name: "text_ignore_above" } - match: { columns.0.type: "text" } @@ -239,11 +252,15 @@ setup: - match: { columns.5.name: "non_indexed.raw" } - match: { columns.5.type: "keyword" } - - length: { values: 1 } - - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ]} + - length: { values: 2 } + - match: { values.0: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } + - match: { values.1: [ "this", "this", "abc", "abc", "bar", "bar" ] } --- "sort ignore above": + - requires: + cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" + reason: "Behavior changed in #126623" - do: allowed_warnings_regex: - "No limit defined, adding default limit of \\[.*\\]" @@ -265,8 +282,8 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ]} - - match: { values.1: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } + - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ] } + - match: { values.1: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } - do: allowed_warnings_regex: @@ -289,8 +306,8 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } - - match: { values.1: [ "this", "this", "abc", "abc", "bar", "bar" ]} + - match: { values.0: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } + - match: { values.1: [ "this", "this", "abc", "abc", "bar", "bar" ] } - do: allowed_warnings_regex: @@ -313,8 +330,8 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ]} - - match: { values.1: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } + - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ] } + - match: { values.1: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } - do: allowed_warnings_regex: @@ -337,11 +354,14 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ]} - - match: { values.1: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } + - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ] } + - match: { values.1: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } --- -"sort normalizer": +"sort normalizer 1": + - requires: + cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" + reason: "Behavior changed in #126623" - do: allowed_warnings_regex: - "No limit defined, adding default limit of \\[.*\\]" @@ -363,9 +383,14 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } - - match: { values.1: [ "this", "this", "abc", "abc", "bar", "bar" ]} + - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ] } + - match: { values.1: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } +--- +"sort normalizer 2": + - requires: + cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" + reason: "Behavior changed in #126623" - do: allowed_warnings_regex: - "No limit defined, adding default limit of \\[.*\\]" @@ -387,9 +412,14 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ]} - - match: { values.1: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } + - match: { values.0: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } + - match: { values.1: [ "this", "this", "abc", "abc", "bar", "bar" ] } +--- +"sort normalizer 3": + - requires: + cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" + reason: "Behavior changed in #126623" - do: allowed_warnings_regex: - "No limit defined, adding default limit of \\[.*\\]" @@ -411,12 +441,15 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ]} - - match: { values.1: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } + - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ] } + - match: { values.1: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } --- "non indexed": + - requires: + cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" + reason: "Behavior changed in #126623" - do: allowed_warnings_regex: - "No limit defined, adding default limit of \\[.*\\]" @@ -438,8 +471,8 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ]} - - match: { values.1: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } + - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ] } + - match: { values.1: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } - do: allowed_warnings_regex: @@ -462,8 +495,8 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } - - match: { values.1: [ "this", "this", "abc", "abc", "bar", "bar" ]} + - match: { values.0: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } + - match: { values.1: [ "this", "this", "abc", "abc", "bar", "bar" ] } - do: allowed_warnings_regex: @@ -486,4 +519,4 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 1 } - - match: { values.0: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } + - match: { values.0: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } From 5039f3cae17f9689cf39933795e836de57c984fc Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Wed, 4 Jun 2025 15:42:08 -0400 Subject: [PATCH 22/32] skip BWC for tests we expect to break --- rest-api-spec/build.gradle | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/rest-api-spec/build.gradle b/rest-api-spec/build.gradle index 6d5a48a98adbd..b3c0b08e4dc92 100644 --- a/rest-api-spec/build.gradle +++ b/rest-api-spec/build.gradle @@ -90,4 +90,10 @@ tasks.named("yamlRestCompatTestTransform").configure ({ task -> task.skipTest("cluster.info/30_info_thread_pool/Cluster HTTP Info", "The search_throttled thread pool has been removed") task.skipTest("synonyms/80_synonyms_from_index/Fail loading synonyms from index if synonyms_set doesn't exist", "Synonyms do no longer fail if the synonyms_set doesn't exist") task.skipTest("mget/90_synthetic_source/keyword with normalizer", "Normalized keywords now use synthetic source") + task.skipTest("esql/81_text_exact_subfields/filter ignore above", "Normalized keywords now use synthetic source") + task.skipTest("esql/81_text_exact_subfields/filter with normalizer", "Normalized keywords now use synthetic source") + task.skipTest("esql/81_text_exact_subfields/non indexed", "Normalized keywords now use synthetic source") + task.skipTest("esql/81_text_exact_subfields/extract", "Normalized keywords now use synthetic source") + task.skipTest("esql/81_text_exact_subfields/sort ignore above", "Normalized keywords now use synthetic source") + task.skipTest("esql/81_text_exact_subfields/sort normalizer", "Normalized keywords now use synthetic source") }) From a739e87a74f0142ae5192a9e70533673559bc7da Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Thu, 5 Jun 2025 10:33:11 -0400 Subject: [PATCH 23/32] put the Rest Compatibility skip in the right place --- rest-api-spec/build.gradle | 6 ------ x-pack/plugin/build.gradle | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/rest-api-spec/build.gradle b/rest-api-spec/build.gradle index b3c0b08e4dc92..6d5a48a98adbd 100644 --- a/rest-api-spec/build.gradle +++ b/rest-api-spec/build.gradle @@ -90,10 +90,4 @@ tasks.named("yamlRestCompatTestTransform").configure ({ task -> task.skipTest("cluster.info/30_info_thread_pool/Cluster HTTP Info", "The search_throttled thread pool has been removed") task.skipTest("synonyms/80_synonyms_from_index/Fail loading synonyms from index if synonyms_set doesn't exist", "Synonyms do no longer fail if the synonyms_set doesn't exist") task.skipTest("mget/90_synthetic_source/keyword with normalizer", "Normalized keywords now use synthetic source") - task.skipTest("esql/81_text_exact_subfields/filter ignore above", "Normalized keywords now use synthetic source") - task.skipTest("esql/81_text_exact_subfields/filter with normalizer", "Normalized keywords now use synthetic source") - task.skipTest("esql/81_text_exact_subfields/non indexed", "Normalized keywords now use synthetic source") - task.skipTest("esql/81_text_exact_subfields/extract", "Normalized keywords now use synthetic source") - task.skipTest("esql/81_text_exact_subfields/sort ignore above", "Normalized keywords now use synthetic source") - task.skipTest("esql/81_text_exact_subfields/sort normalizer", "Normalized keywords now use synthetic source") }) diff --git a/x-pack/plugin/build.gradle b/x-pack/plugin/build.gradle index 51a99c3d47c4f..10f378ec24b2a 100644 --- a/x-pack/plugin/build.gradle +++ b/x-pack/plugin/build.gradle @@ -134,6 +134,12 @@ tasks.named("yamlRestCompatTestTransform").configure({ task -> task.skipTest("esql/90_non_indexed/fetch", "Temporary until backported") task.skipTest("esql/63_enrich_int_range/Invalid age as double", "TODO: require disable allow_partial_results") task.skipTest("esql/191_lookup_join_on_datastreams/data streams not supported in LOOKUP JOIN", "Added support for aliases in JOINs") + task.skipTest("esql/81_text_exact_subfields/filter ignore above", "Normalized keywords now use synthetic source") + task.skipTest("esql/81_text_exact_subfields/filter with normalizer", "Normalized keywords now use synthetic source") + task.skipTest("esql/81_text_exact_subfields/non indexed", "Normalized keywords now use synthetic source") + task.skipTest("esql/81_text_exact_subfields/extract", "Normalized keywords now use synthetic source") + task.skipTest("esql/81_text_exact_subfields/sort ignore above", "Normalized keywords now use synthetic source") + task.skipTest("esql/81_text_exact_subfields/sort normalizer", "Normalized keywords now use synthetic source") }) tasks.named('yamlRestCompatTest').configure { From 10cf2d1fe8c633d04503e1b865b35a1ca45c19b4 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Thu, 5 Jun 2025 11:42:41 -0400 Subject: [PATCH 24/32] Update docs/changelog/126623.yaml --- docs/changelog/126623.yaml | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/docs/changelog/126623.yaml b/docs/changelog/126623.yaml index d51b66b86078a..9acdb04b7da3d 100644 --- a/docs/changelog/126623.yaml +++ b/docs/changelog/126623.yaml @@ -1,24 +1,16 @@ pr: 126623 summary: Enable synthetic source on normalized keyword mappings area: Mapping -type: "breaking" +type: "breaking, enhancement" issues: - 124369 - 121358 breaking: title: Enable synthetic source on normalized keyword mappings area: Mapping - details: This changes the default behavior for Synthetic Source on - keyword fields using normalizers. Prior to this change, normalized - keywords were always stored to allow returning the non-normalized - values. Under this change, such field will NOT be stored (i.e - they will be synthesized from the index when returning source, - like all other synthetic source fields). This should result in - considerable space improvement for this use case. - - Users can opt out of this behavior on a per-field basis by setting - `synthetic_source_keep` to `all` on the field. - - impact: By default, normalized keyword fields in synthetic source - indices will no longer return the non-normalized value in the source. + details: |- + This changes the default behavior for Synthetic Source on keyword fields using normalizers. Prior to this change, normalized keywords were always stored to allow returning the non-normalized values. Under this change, such field will NOT be stored (i.e they will be synthesized from the index when returning source, like all other synthetic source fields). This should result in considerable space improvement for this use case. + Users can opt out of this behavior on a per-field basis by setting `synthetic_source_keep` to `all` on the field. + impact: "By default, normalized keyword fields in synthetic source indices will\ + \ no longer return the non-normalized value in the source." notable: false From 4ed5e3e8a184b3e31907f2f112ca63b74ad1a18a Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Thu, 5 Jun 2025 16:09:07 -0400 Subject: [PATCH 25/32] Update changelog --- docs/changelog/126623.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/changelog/126623.yaml b/docs/changelog/126623.yaml index 9acdb04b7da3d..8d22c180de71d 100644 --- a/docs/changelog/126623.yaml +++ b/docs/changelog/126623.yaml @@ -1,7 +1,7 @@ pr: 126623 summary: Enable synthetic source on normalized keyword mappings area: Mapping -type: "breaking, enhancement" +type: "breaking" issues: - 124369 - 121358 From fbe4adb6bf3b278b3c49b8ff1a40393eda8f6397 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Fri, 6 Jun 2025 09:32:40 -0400 Subject: [PATCH 26/32] Update docs/changelog/126623.yaml --- docs/changelog/126623.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/changelog/126623.yaml b/docs/changelog/126623.yaml index 8d22c180de71d..9acdb04b7da3d 100644 --- a/docs/changelog/126623.yaml +++ b/docs/changelog/126623.yaml @@ -1,7 +1,7 @@ pr: 126623 summary: Enable synthetic source on normalized keyword mappings area: Mapping -type: "breaking" +type: "breaking, enhancement" issues: - 124369 - 121358 From 0ca54e9ab03ee42cd86993ecaeda6946a17ba009 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Fri, 6 Jun 2025 09:44:00 -0400 Subject: [PATCH 27/32] fix changelog, again --- docs/changelog/126623.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/changelog/126623.yaml b/docs/changelog/126623.yaml index 9acdb04b7da3d..2138fc5a2e538 100644 --- a/docs/changelog/126623.yaml +++ b/docs/changelog/126623.yaml @@ -1,7 +1,7 @@ pr: 126623 summary: Enable synthetic source on normalized keyword mappings area: Mapping -type: "breaking, enhancement" +type: breaking issues: - 124369 - 121358 From 86e5b22d40f096c33754bdc29428e028d1c863cc Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Fri, 6 Jun 2025 16:44:44 -0400 Subject: [PATCH 28/32] response to PR feedback --- .../index/mapper/KeywordFieldMapper.java | 3 ++- .../matchers/source/FieldSpecificMatcher.java | 20 ++++++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java index 4c48b281a2da8..491783b40a004 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java @@ -1282,7 +1282,8 @@ private String originalName() { @Override protected SyntheticSourceSupport syntheticSourceSupport() { /* NOTE: we allow enabling synthetic source on Keyword fields with a Normalizer, even though the returned synthetic value - may not perfectly match the original, pre-normalization, value. + may not perfectly match the original, pre-normalization, value. This reduces the storage space required for normalized keyword + fields, which otherwise perform quite poorly in terms of storage. */ if (fieldType.stored() || hasDocValues) { diff --git a/test/framework/src/main/java/org/elasticsearch/datageneration/matchers/source/FieldSpecificMatcher.java b/test/framework/src/main/java/org/elasticsearch/datageneration/matchers/source/FieldSpecificMatcher.java index d3f205f02f79d..689bee2d166d0 100644 --- a/test/framework/src/main/java/org/elasticsearch/datageneration/matchers/source/FieldSpecificMatcher.java +++ b/test/framework/src/main/java/org/elasticsearch/datageneration/matchers/source/FieldSpecificMatcher.java @@ -331,7 +331,6 @@ Object convert(Object value, Object nullValue) { } class KeywordMatcher extends GenericMappingAwareMatcher { - String normalizer; KeywordMatcher( XContentBuilder actualMappings, @@ -349,21 +348,24 @@ public MatchResult match( Map actualMapping, Map expectedMapping ) { - this.normalizer = (String) FieldSpecificMatcher.getMappingParameter("normalizer", actualMapping, expectedMapping); + String normalizer = (String) FieldSpecificMatcher.getMappingParameter("normalizer", actualMapping, expectedMapping); + // Account for normalization + if (normalizer != null) { + if (normalizer.equals("lowercase") == false) { + throw new UnsupportedOperationException("Test framework currently only supports lowercase normalizer"); + } + expected = expected.stream() + .map(s -> s instanceof String ? ((String) s).toLowerCase(Locale.ROOT) : s) + .collect(Collectors.toList()); + } return super.match(actual, expected, actualMapping, expectedMapping); } @Override Object convert(Object value, Object nullValue) { if (value == null) { - // Normalization could also be applied to the null value - value = nullValue; - } - if (value instanceof String s && this.normalizer != null && this.normalizer.equals("lowercase")) { - // Currently, tests only support lowercase for normalization. - value = s.toLowerCase(Locale.ROOT); + return nullValue; } - return value; } } From 63bf7d16b989411c9dea490fba4c4e73d57ed42d Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Tue, 10 Jun 2025 17:02:26 -0400 Subject: [PATCH 29/32] Revert "Feedback from MVG on fields with both keyword and text subfields" This reverts commit aed9477e6540fbc78a3ece3a8a1444b3a3fa27f9. --- .../elasticsearch/index/mapper/TextFieldMapper.java | 11 +++++------ .../mapper/blockloader/TextFieldBlockLoaderTests.java | 3 ++- .../TextFieldWithParentBlockLoaderTests.java | 9 ++------- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java index 64cc2b7f77446..08cef586e1438 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java @@ -1008,21 +1008,17 @@ protected String delegatingTo() { } }; } - if (isStored()) { - return new BlockStoredFieldsReader.BytesFromStringsBlockLoader(name()); - } /* * If this is a sub-text field try and return the parent's loader. Text * fields will always be slow to load and if the parent is exact then we * should use that instead. */ - // TODO: should this be removed? I think SyntheticSourceHelper already does this: String parentField = blContext.parentField(name()); if (parentField != null) { MappedFieldType parent = blContext.lookup().fieldType(parentField); if (parent.typeName().equals(KeywordFieldMapper.CONTENT_TYPE)) { KeywordFieldMapper.KeywordFieldType kwd = (KeywordFieldMapper.KeywordFieldType) parent; - if (kwd.hasDocValues() || kwd.isStored()) { + if (kwd.hasNormalizer() == false && (kwd.hasDocValues() || kwd.isStored())) { return new BlockLoader.Delegating(kwd.blockLoader(blContext)) { @Override protected String delegatingTo() { @@ -1032,6 +1028,9 @@ protected String delegatingTo() { } } } + if (isStored()) { + return new BlockStoredFieldsReader.BytesFromStringsBlockLoader(name()); + } // _ignored_source field will contain entries for this field if it is not stored // and there is no syntheticSourceDelegate. @@ -1580,7 +1579,7 @@ public static KeywordFieldMapper getKeywordFieldMapperForSyntheticSource(Iterabl for (Mapper sub : multiFields) { if (sub.typeName().equals(KeywordFieldMapper.CONTENT_TYPE)) { KeywordFieldMapper kwd = (KeywordFieldMapper) sub; - if (kwd.fieldType().hasDocValues() || kwd.fieldType().isStored()) { + if (kwd.hasNormalizer() == false && (kwd.fieldType().hasDocValues() || kwd.fieldType().isStored())) { return kwd; } } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldBlockLoaderTests.java b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldBlockLoaderTests.java index 99b98bbda606a..77c42740451ee 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldBlockLoaderTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldBlockLoaderTests.java @@ -39,13 +39,14 @@ public static Object expectedValue(Map fieldMapping, Object valu var fields = (Map) fieldMapping.get("fields"); if (fields != null) { var keywordMultiFieldMapping = (Map) fields.get("kwd"); + Object normalizer = fields.get("normalizer"); boolean docValues = hasDocValues(keywordMultiFieldMapping, true); boolean store = keywordMultiFieldMapping.getOrDefault("store", false).equals(true); Object ignoreAbove = keywordMultiFieldMapping.get("ignore_above"); // See TextFieldMapper.SyntheticSourceHelper#getKeywordFieldMapperForSyntheticSource // and TextFieldMapper#canUseSyntheticSourceDelegateForLoading(). - boolean usingSyntheticSourceDelegate = docValues || store; + boolean usingSyntheticSourceDelegate = normalizer == null && (docValues || store); boolean canUseSyntheticSourceDelegateForLoading = usingSyntheticSourceDelegate && ignoreAbove == null; if (canUseSyntheticSourceDelegateForLoading) { return KeywordFieldBlockLoaderTests.expectedValue(keywordMultiFieldMapping, value, params, testContext); diff --git a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldWithParentBlockLoaderTests.java b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldWithParentBlockLoaderTests.java index df0dbe6a3b08f..b985b4583677b 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldWithParentBlockLoaderTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldWithParentBlockLoaderTests.java @@ -118,18 +118,13 @@ private Object expected(Map fieldMapping, Object value, BlockLoa boolean docValues = hasDocValues(fieldMapping, true); boolean store = fieldMapping.getOrDefault("store", false).equals(true); - // if text sub field is stored, then always use that: - var textFieldMapping = (Map) ((Map) fieldMapping.get("fields")).get("mf"); - if (textFieldMapping.getOrDefault("store", false).equals(true)) { - return TextFieldBlockLoaderTests.expectedValue(textFieldMapping, value, params, testContext); - } - - if (docValues || store) { + if (normalizer == null && (docValues || store)) { // we are using block loader of the parent field return KeywordFieldBlockLoaderTests.expectedValue(fieldMapping, value, params, testContext); } // we are using block loader of the text field itself + var textFieldMapping = (Map) ((Map) fieldMapping.get("fields")).get("mf"); return TextFieldBlockLoaderTests.expectedValue(textFieldMapping, value, params, testContext); } } From 43b107584d4bd305ab98b8e262bc0da39e7dacfd Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Tue, 10 Jun 2025 17:02:45 -0400 Subject: [PATCH 30/32] Revert "fix exact subfield tests" This reverts commit 270f066d49507cae1be2a31cd44fcd684828154b. --- .../test/esql/81_text_exact_subfields.yml | 101 ++++++------------ 1 file changed, 34 insertions(+), 67 deletions(-) diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/esql/81_text_exact_subfields.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/esql/81_text_exact_subfields.yml index f56a2ad1c4546..72e5f4728edc8 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/esql/81_text_exact_subfields.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/esql/81_text_exact_subfields.yml @@ -1,7 +1,7 @@ --- setup: - requires: - cluster_features: [ "gte_v8.13.0" ] + cluster_features: ["gte_v8.13.0"] reason: "Exact subfields fixed in v 8.13" test_runner_features: allowed_warnings_regex - do: @@ -45,15 +45,12 @@ setup: refresh: true body: - { "index": { } } - - { "emp_no": 10, "text_ignore_above": "this is a long text", "text_normalizer": "CamelCase", "non_indexed": "foo" } + - { "emp_no": 10, "text_ignore_above":"this is a long text", "text_normalizer": "CamelCase", "non_indexed": "foo"} - { "index": { } } - - { "emp_no": 20, "text_ignore_above": "this", "text_normalizer": "abc", "non_indexed": "bar" } + - { "emp_no": 20, "text_ignore_above":"this", "text_normalizer": "abc", "non_indexed": "bar"} --- "extract": - - requires: - cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" - reason: "Behavior changed in #126623" - do: allowed_warnings_regex: - "No limit defined, adding default limit of \\[.*\\]" @@ -75,14 +72,11 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } - - match: { values.1: [ "this", "this", "abc", "abc", "bar", "bar" ] } + - match: { values.0: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } + - match: { values.1: [ "this", "this", "abc", "abc", "bar", "bar" ]} --- "filter ignore above": - - requires: - cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" - reason: "Behavior changed in #126623" - do: allowed_warnings_regex: - "No limit defined, adding default limit of \\[.*\\]" @@ -104,7 +98,7 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 1 } - - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ] } + - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ]} - do: allowed_warnings_regex: @@ -127,7 +121,7 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 1 } - - match: { values.0: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } + - match: { values.0: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } - do: @@ -197,13 +191,10 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 1 } - - match: { values.0: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } + - match: { values.0: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } --- -"filter with normalizer 1": - - requires: - cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" - reason: "Behavior changed in #126623" +"filter with normalizer": - do: allowed_warnings_regex: - "No limit defined, adding default limit of \\[.*\\]" @@ -225,19 +216,15 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 1 } - - match: { values.0: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } + - match: { values.0: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } + ---- -"filter with normalizer 2": - - requires: - cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" - reason: "Behavior changed in #126623" - do: allowed_warnings_regex: - "No limit defined, adding default limit of \\[.*\\]" esql.query: body: - query: 'from test | where text_normalizer == text_normalizer.raw | SORT emp_no | keep text_ignore_above, text_ignore_above.raw, text_normalizer, text_normalizer.raw, non_indexed, non_indexed.raw' + query: 'from test | where text_normalizer == text_normalizer.raw | keep text_ignore_above, text_ignore_above.raw, text_normalizer, text_normalizer.raw, non_indexed, non_indexed.raw' - match: { columns.0.name: "text_ignore_above" } - match: { columns.0.type: "text" } @@ -252,15 +239,11 @@ setup: - match: { columns.5.name: "non_indexed.raw" } - match: { columns.5.type: "keyword" } - - length: { values: 2 } - - match: { values.0: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } - - match: { values.1: [ "this", "this", "abc", "abc", "bar", "bar" ] } + - length: { values: 1 } + - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ]} --- "sort ignore above": - - requires: - cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" - reason: "Behavior changed in #126623" - do: allowed_warnings_regex: - "No limit defined, adding default limit of \\[.*\\]" @@ -282,8 +265,8 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ] } - - match: { values.1: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } + - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ]} + - match: { values.1: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } - do: allowed_warnings_regex: @@ -306,8 +289,8 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } - - match: { values.1: [ "this", "this", "abc", "abc", "bar", "bar" ] } + - match: { values.0: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } + - match: { values.1: [ "this", "this", "abc", "abc", "bar", "bar" ]} - do: allowed_warnings_regex: @@ -330,8 +313,8 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ] } - - match: { values.1: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } + - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ]} + - match: { values.1: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } - do: allowed_warnings_regex: @@ -354,14 +337,11 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ] } - - match: { values.1: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } + - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ]} + - match: { values.1: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } --- -"sort normalizer 1": - - requires: - cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" - reason: "Behavior changed in #126623" +"sort normalizer": - do: allowed_warnings_regex: - "No limit defined, adding default limit of \\[.*\\]" @@ -383,14 +363,9 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ] } - - match: { values.1: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } + - match: { values.0: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } + - match: { values.1: [ "this", "this", "abc", "abc", "bar", "bar" ]} ---- -"sort normalizer 2": - - requires: - cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" - reason: "Behavior changed in #126623" - do: allowed_warnings_regex: - "No limit defined, adding default limit of \\[.*\\]" @@ -412,14 +387,9 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } - - match: { values.1: [ "this", "this", "abc", "abc", "bar", "bar" ] } + - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ]} + - match: { values.1: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } ---- -"sort normalizer 3": - - requires: - cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" - reason: "Behavior changed in #126623" - do: allowed_warnings_regex: - "No limit defined, adding default limit of \\[.*\\]" @@ -441,15 +411,12 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ] } - - match: { values.1: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } + - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ]} + - match: { values.1: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } --- "non indexed": - - requires: - cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source" - reason: "Behavior changed in #126623" - do: allowed_warnings_regex: - "No limit defined, adding default limit of \\[.*\\]" @@ -471,8 +438,8 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ] } - - match: { values.1: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } + - match: { values.0: [ "this", "this", "abc", "abc", "bar", "bar" ]} + - match: { values.1: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } - do: allowed_warnings_regex: @@ -495,8 +462,8 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 2 } - - match: { values.0: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } - - match: { values.1: [ "this", "this", "abc", "abc", "bar", "bar" ] } + - match: { values.0: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } + - match: { values.1: [ "this", "this", "abc", "abc", "bar", "bar" ]} - do: allowed_warnings_regex: @@ -519,4 +486,4 @@ setup: - match: { columns.5.type: "keyword" } - length: { values: 1 } - - match: { values.0: [ "this is a long text", null, "camelcase", "camelcase", "foo", "foo" ] } + - match: { values.0: [ "this is a long text", null, "CamelCase", "camelcase", "foo", "foo"] } From 2b9d384039bce9fced20db1df3ad1c9327980331 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Wed, 11 Jun 2025 12:47:41 -0400 Subject: [PATCH 31/32] leaving a note for future investigation --- .../org/elasticsearch/index/mapper/TextFieldMapper.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java index 08cef586e1438..9935bc0f19072 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java @@ -300,6 +300,12 @@ public Builder(String name, IndexVersion indexCreatedVersion, IndexAnalyzers ind // storing the field without requiring users to explicitly set 'store'. // // If 'store' parameter was explicitly provided we'll reject the request. + + /* NOTE: I am fairly sure the above is strictly not true. Testing seems + show that we do not reject the mapping, even when there is no compatible keyword + field and the mapping sets store to false. --MT 2025-06-11 + */ + this.store = Parameter.storeParam( m -> ((TextFieldMapper) m).store, () -> isSyntheticSourceEnabled && multiFieldsBuilder.hasSyntheticSourceCompatibleKeywordField() == false From 3b33b1545ddea14c8464828e11a9c3c63fe5251a Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Wed, 11 Jun 2025 13:07:26 -0400 Subject: [PATCH 32/32] Update docs/changelog/126623.yaml --- docs/changelog/126623.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/changelog/126623.yaml b/docs/changelog/126623.yaml index 2138fc5a2e538..9acdb04b7da3d 100644 --- a/docs/changelog/126623.yaml +++ b/docs/changelog/126623.yaml @@ -1,7 +1,7 @@ pr: 126623 summary: Enable synthetic source on normalized keyword mappings area: Mapping -type: breaking +type: "breaking, enhancement" issues: - 124369 - 121358