elastic · not-napoleon · Apr 7, 2025 · Apr 10, 2025 · Apr 10, 2025 · Apr 10, 2025
diff --git a/docs/changelog/126623.yaml b/docs/changelog/126623.yaml
@@ -0,0 +1,16 @@
+pr: 126623
+summary: Enable synthetic source on normalized keyword mappings
+area: Mapping
+type: breaking
+issues:
+ - 124369
+ - 121358
+breaking:
+  title: Enable synthetic source on normalized keyword mappings
+  area: Mapping
+  details: |-
+    This changes the default behavior for synthetic source on keyword fields that use normalizers. Prior to this change, normalized keywords were always stored to allow returning the non-normalized values. Under this change, such fields will NOT be stored (i.e., they will be synthesized from the index when returning source, like all other synthetic source fields). This should result in considerable space savings for this use case.
+    Users can opt out of this behavior on a per-field basis by setting `synthetic_source_keep` to `all` on the field.
+  impact: "By default, normalized keyword fields in synthetic source indices will\
+    \ no longer return the non-normalized value in the source."
+  notable: false
diff --git a/rest-api-spec/build.gradle b/rest-api-spec/build.gradle
@@ -89,4 +89,5 @@ tasks.named("yamlRestCompatTestTransform").configure ({ task ->
   task.skipTest("indices.create/21_synthetic_source_stored/field param - keep root array", "Synthetic source keep arrays now stores leaf arrays natively")
   task.skipTest("cluster.info/30_info_thread_pool/Cluster HTTP Info", "The search_throttled thread pool has been removed")
   task.skipTest("synonyms/80_synonyms_from_index/Fail loading synonyms from index if synonyms_set doesn't exist", "Synonyms do no longer fail if the synonyms_set doesn't exist")
+  task.skipTest("mget/90_synthetic_source/keyword with normalizer", "Normalized keywords now use synthetic source")
 })
diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml
@@ -6,7 +6,7 @@ setup:
 ---
 keyword:
   - requires:
-      cluster_features: ["gte_v8.4.0"]
+      cluster_features: [ "gte_v8.4.0" ]
       reason: introduced in 8.4.0
 
   - do:
@@ -23,15 +23,15 @@ keyword:
 
   - do:
       index:
-        index:   test
-        id:      1
+        index: test
+        id: 1
         body:
           kwd: foo
 
   - do:
       index:
-        index:   test
-        id:      2
+        index: test
+        id: 2
         body:
           kwd: bar
 
@@ -40,21 +40,24 @@ keyword:
       mget:
         index: test
         body:
-          ids:    [1, 2]
-  - match: {docs.0._index: "test"}
-  - match: {docs.0._id: "1"}
+          ids: [ 1, 2 ]
+  - match: { docs.0._index: "test" }
+  - match: { docs.0._id: "1" }
   - match:
       docs.0._source:
         kwd: foo
 
-  - match: {docs.1._index: "test"}
-  - match: {docs.1._id: "2"}
+  - match: { docs.1._index: "test" }
+  - match: { docs.1._id: "2" }
   - match:
       docs.1._source:
         kwd: bar
 
 ---
 keyword with normalizer:
+  - requires:
+      cluster_features: "mapper.keyword.keyword_normalizer_synthetic_source"
+      reason: "Behavior changed in #126623"
   - do:
       indices.create:
         index: test-keyword-with-normalizer
@@ -113,7 +116,94 @@ keyword with normalizer:
       mget:
         index: test-keyword-with-normalizer
         body:
-          ids:    [ 1, 2, 3 ]
+          ids: [ 1, 2, 3 ]
+  - match: { docs.0._index: "test-keyword-with-normalizer" }
+  - match: { docs.0._id: "1" }
+  - match:
+      docs.0._source:
+        keyword: "the quick brown fox jumps over the lazy dog"
+        keyword_with_ignore_above: "the Quick Brown Fox jumps over the lazy Dog"
+        keyword_without_doc_values: "the Quick Brown Fox jumps over the lazy Dog"
+
+  - match: { docs.1._index: "test-keyword-with-normalizer" }
+  - match: { docs.1._id: "2" }
+  - match:
+      docs.1._source:
+        keyword: "the five boxing wizards jump quickly"
+        keyword_with_ignore_above: "The five BOXING wizards jump Quickly"
+        keyword_without_doc_values: "The five BOXING wizards jump Quickly"
+
+  - match: { docs.2._index: "test-keyword-with-normalizer" }
+  - match: { docs.2._id: "3" }
+  - match:
+      docs.2._source:
+        keyword: [ "do or do not, there is no try", "may the force be with you!" ]
+        keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
+        keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
+
+---
+keyword with normalizer, source keep mode all:
+  - do:
+      indices.create:
+        index: test-keyword-with-normalizer
+        body:
+          settings:
+            analysis:
+              normalizer:
+                lowercase:
+                  type: custom
+                  filter:
+                    - lowercase
+            index:
+              mapping.source.mode: synthetic
+
+          mappings:
+            properties:
+              keyword:
+                type: keyword
+                normalizer: lowercase
+                synthetic_source_keep: all
+              keyword_with_ignore_above:
+                type: keyword
+                normalizer: lowercase
+                ignore_above: 10
+              keyword_without_doc_values:
+                type: keyword
+                normalizer: lowercase
+                doc_values: false
+
+  - do:
+      index:
+        index: test-keyword-with-normalizer
+        id: 1
+        body:
+          keyword: "the Quick Brown Fox jumps over the lazy Dog"
+          keyword_with_ignore_above: "the Quick Brown Fox jumps over the lazy Dog"
+          keyword_without_doc_values: "the Quick Brown Fox jumps over the lazy Dog"
+
+  - do:
+      index:
+        index: test-keyword-with-normalizer
+        id: 2
+        body:
+          keyword: "The five BOXING wizards jump Quickly"
+          keyword_with_ignore_above: "The five BOXING wizards jump Quickly"
+          keyword_without_doc_values: "The five BOXING wizards jump Quickly"
+
+  - do:
+      index:
+        index: test-keyword-with-normalizer
+        id: 3
+        body:
+          keyword: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
+          keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
+          keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
+
+  - do:
+      mget:
+        index: test-keyword-with-normalizer
+        body:
+          ids: [ 1, 2, 3 ]
   - match: { docs.0._index: "test-keyword-with-normalizer" }
   - match: { docs.0._id: "1" }
   - match:
@@ -141,7 +231,7 @@ keyword with normalizer:
 ---
 stored text:
   - requires:
-      cluster_features: ["gte_v8.5.0"]
+      cluster_features: [ "gte_v8.5.0" ]
       reason: introduced in 8.5.0
 
   - do:
@@ -159,39 +249,39 @@ stored text:
 
   - do:
       index:
-        index:   test
-        id:      1
+        index: test
+        id: 1
         body:
           text: the quick brown fox
 
   - do:
       index:
-        index:   test
-        id:      2
+        index: test
+        id: 2
         body:
           text: jumped over the lazy dog
 
   - do:
       mget:
         index: test
         body:
-          ids:    [1, 2]
-  - match: {docs.0._index: "test"}
-  - match: {docs.0._id: "1"}
+          ids: [ 1, 2 ]
+  - match: { docs.0._index: "test" }
+  - match: { docs.0._id: "1" }
   - match:
       docs.0._source:
         text: the quick brown fox
 
-  - match: {docs.1._index: "test"}
-  - match: {docs.1._id: "2"}
+  - match: { docs.1._index: "test" }
+  - match: { docs.1._id: "2" }
   - match:
       docs.1._source:
         text: jumped over the lazy dog
 
 ---
 force_synthetic_source_ok:
   - requires:
-      cluster_features: ["gte_v8.4.0"]
+      cluster_features: [ "gte_v8.4.0" ]
       reason: introduced in 8.4.0
 
   - do:
@@ -210,15 +300,15 @@ force_synthetic_source_ok:
 
   - do:
       index:
-        index:   test
-        id:      1
+        index: test
+        id: 1
         body:
           obj.kwd: foo
 
   - do:
       index:
-        index:   test
-        id:      2
+        index: test
+        id: 2
         body:
           obj:
             kwd: bar
@@ -228,7 +318,7 @@ force_synthetic_source_ok:
       mget:
         index: test
         body:
-          ids: [1, 2]
+          ids: [ 1, 2 ]
   - match:
       docs.0._source:
         obj.kwd: foo

diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java
@@ -39,6 +39,7 @@
 import org.elasticsearch.common.lucene.search.AutomatonQueries;
 import org.elasticsearch.common.unit.Fuzziness;
 import org.elasticsearch.core.Nullable;
+import org.elasticsearch.features.NodeFeature;
 import org.elasticsearch.index.IndexMode;
 import org.elasticsearch.index.IndexSortConfig;
 import org.elasticsearch.index.IndexVersion;
@@ -94,6 +95,10 @@
  */
 public final class KeywordFieldMapper extends FieldMapper {
 
+    public static final NodeFeature KEYWORD_NORMALIZER_SYNTHETIC_SOURCE = new NodeFeature(
+        "mapper.keyword.keyword_normalizer_synthetic_source"
+    );
+
     private static final Logger logger = LogManager.getLogger(KeywordFieldMapper.class);
 
     public static final String CONTENT_TYPE = "keyword";
@@ -1276,11 +1281,9 @@ private String originalName() {
 
     @Override
     protected SyntheticSourceSupport syntheticSourceSupport() {
-        if (hasNormalizer()) {
-            // NOTE: no matter if we have doc values or not we use fallback synthetic source
-            // to store the original value whose doc values would be altered by the normalizer
-            return SyntheticSourceSupport.FALLBACK;
-        }
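+        // NOTE: keyword fields with a normalizer are no longer forced onto fallback synthetic source. The synthesized
+        // value is therefore the normalized form and may not match the original, pre-normalization value. Users who
+        // need the original value can set `synthetic_source_keep` to `all` on the field.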
 
         if (fieldType.stored() || hasDocValues) {
             return new SyntheticSourceSupport.Native(() -> syntheticFieldLoader(fullPath(), leafName()));

diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java b/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java
@@ -70,7 +70,8 @@ public Set<NodeFeature> getTestFeatures() {
             NPE_ON_DIMS_UPDATE_FIX,
             RESCORE_ZERO_VECTOR_QUANTIZED_VECTOR_MAPPING,
             USE_DEFAULT_OVERSAMPLE_VALUE_FOR_BBQ,
-            IVF_FORMAT_CLUSTER_FEATURE
+            IVF_FORMAT_CLUSTER_FEATURE,
+            KeywordFieldMapper.KEYWORD_NORMALIZER_SYNTHETIC_SOURCE
         );
     }
 }
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java
@@ -1008,17 +1008,21 @@ protected String delegatingTo() {
                     }
                 };
             }
+            if (isStored()) {
+                return new BlockStoredFieldsReader.BytesFromStringsBlockLoader(name());
+            }
             /*
              * If this is a sub-text field try and return the parent's loader. Text
              * fields will always be slow to load and if the parent is exact then we
              * should use that instead.
              */
+            // TODO: should this be removed? I think SyntheticSourceHelper already does this:
             String parentField = blContext.parentField(name());
             if (parentField != null) {
                 MappedFieldType parent = blContext.lookup().fieldType(parentField);
                 if (parent.typeName().equals(KeywordFieldMapper.CONTENT_TYPE)) {
                     KeywordFieldMapper.KeywordFieldType kwd = (KeywordFieldMapper.KeywordFieldType) parent;
-                    if (kwd.hasNormalizer() == false && (kwd.hasDocValues() || kwd.isStored())) {
+                    if (kwd.hasDocValues() || kwd.isStored()) {
                         return new BlockLoader.Delegating(kwd.blockLoader(blContext)) {
                             @Override
                             protected String delegatingTo() {
@@ -1028,9 +1032,6 @@ protected String delegatingTo() {
                     }
                 }
             }
-            if (isStored()) {
-                return new BlockStoredFieldsReader.BytesFromStringsBlockLoader(name());
-            }
 
             // _ignored_source field will contain entries for this field if it is not stored
             // and there is no syntheticSourceDelegate.
@@ -1579,7 +1580,7 @@ public static KeywordFieldMapper getKeywordFieldMapperForSyntheticSource(Iterabl
             for (Mapper sub : multiFields) {
                 if (sub.typeName().equals(KeywordFieldMapper.CONTENT_TYPE)) {
                     KeywordFieldMapper kwd = (KeywordFieldMapper) sub;
-                    if (kwd.hasNormalizer() == false && (kwd.fieldType().hasDocValues() || kwd.fieldType().isStored())) {
+                    if (kwd.fieldType().hasDocValues() || kwd.fieldType().isStored()) {
                         return kwd;
                     }
                 }