@@ -28,6 +28,7 @@
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
+ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOFunction;
import org.elasticsearch.common.CheckedIntFunction;
import org.elasticsearch.common.lucene.Lucene;
@@ -70,6 +71,8 @@
import java.util.Objects;
import java.util.Set;

+ import static org.elasticsearch.index.mapper.KeywordFieldMapper.parseTextOrNull;

/**
* A {@link FieldMapper} for full-text fields that only indexes
* {@link IndexOptions#DOCS} and runs positional queries by looking at the
@@ -438,18 +441,19 @@ public FieldMapper.Builder getMergeBuilder() {

@Override
protected void parseCreateField(DocumentParserContext context) throws IOException {
- final String value = context.parser().textOrNull();
+ final CharsRef value = parseTextOrNull(context.parser());

if (value == null) {
return;
}

- Field field = new Field(fieldType().name(), value, fieldType);
+ BytesRef copy = new BytesRef(value);
+ Field field = new Field(fieldType().name(), copy, fieldType);
context.doc().add(field);
context.addToFieldNames(fieldType().name());

if (storeSource) {
- context.doc().add(new StoredField(fieldType().storedFieldNameForSyntheticSource(), value));
+ context.doc().add(new StoredField(fieldType().storedFieldNameForSyntheticSource(), copy));
}
}
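The hunk above swaps the intermediate String for a CharsRef view over the parser's character buffer and copies it exactly once, into the BytesRef that is indexed and stored. A minimal standalone sketch of the two paths (illustrative only, not part of this PR; the class and method names are invented):

```java
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;

class CopyCountSketch {
    // Old path: parser chars -> String (copy #1) -> BytesRef (copy #2, UTF-16 to UTF-8).
    static BytesRef viaString(char[] buf, int off, int len) {
        String value = new String(buf, off, len);
        return new BytesRef(value);
    }

    // New path: parser chars -> CharsRef view (no copy) -> BytesRef (one copy, UTF-16 to UTF-8).
    static BytesRef viaCharsRef(char[] buf, int off, int len) {
        return new BytesRef(new CharsRef(buf, off, len));
    }

    public static void main(String[] args) {
        char[] buf = "foo bar".toCharArray();
        System.out.println(viaCharsRef(buf, 0, buf.length).utf8ToString()); // foo bar
    }
}
```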

@@ -29,6 +29,7 @@
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
+ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
@@ -69,6 +70,7 @@
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser;

+ import java.io.CharArrayReader;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
@@ -891,7 +893,7 @@ private String applyIgnoreAboveAndNormalizer(String value) {
return null;
}

- return normalizeValue(normalizer(), name(), value);
+ return normalizeValue(normalizer(), name(), new CharsRef(value)).toString();
}

@Override
@@ -1104,9 +1106,10 @@ public String getOffsetFieldName() {
}

protected void parseCreateField(DocumentParserContext context) throws IOException {
- String value = context.parser().textOrNull();
+ CharsRef value = parseTextOrNull(context.parser());
if (value == null) {
- value = fieldType().nullValue;
+ // TODO: fix conversion
+ value = new CharsRef(fieldType().nullValue);
}

boolean indexed = indexValue(context, value);
@@ -1119,17 +1122,40 @@ protected void parseCreateField(DocumentParserContext context) throws IOException {
}
}

+ /**
+  * Parses the current text value without making a copy, unlike {@link XContentParser#textOrNull()}.
+  *
+  * Typically two copies are made: the first when {@link XContentParser#textOrNull()} materializes a String,
+  * and the second when that String is converted to a {@link BytesRef}. Returning a {@link CharsRef} view over
+  * the parser's buffer avoids the first copy.
+  */
+ public static CharsRef parseTextOrNull(XContentParser parser) throws IOException {
+ var currentToken = parser.currentToken();
+ if (currentToken == XContentParser.Token.VALUE_NULL) {
+ return null;
+ } else if (currentToken.isValue()) {
+ return new CharsRef(parser.textCharacters(), parser.textOffset(), parser.textLength());
+ } else {
+ assert false : "unexpected token [" + currentToken + "]";
+ return null;
+ }
+ }
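One caveat worth spelling out: the CharsRef returned above is a view over the parser's internal buffer, not a copy, so callers must materialize their own copy (the BytesRef they hand to Lucene) before the parser advances and reuses that buffer. A small standalone sketch of the aliasing behaviour (illustrative only, not part of this PR; the local array stands in for the parser's internal buffer):

```java
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;

class AliasingSketch {
    public static void main(String[] args) {
        char[] parserBuffer = "foo".toCharArray();        // stand-in for the parser's reused buffer
        CharsRef view = new CharsRef(parserBuffer, 0, 3); // no copy, just a view (like parseTextOrNull)
        BytesRef copy = new BytesRef(view);               // the single copy: UTF-16 chars -> UTF-8 bytes

        parserBuffer[0] = 'b';                            // the parser overwrites its buffer later
        System.out.println(view);                         // "boo" -> the view tracks the buffer
        System.out.println(copy.utf8ToString());          // "foo" -> the copy is stable
    }
}
```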

@Override
protected void indexScriptValues(
SearchLookup searchLookup,
LeafReaderContext readerContext,
int doc,
DocumentParserContext documentParserContext
) {
- this.fieldType().scriptValues.valuesForDoc(searchLookup, readerContext, doc, value -> indexValue(documentParserContext, value));
+ this.fieldType().scriptValues.valuesForDoc(
+ searchLookup,
+ readerContext,
+ doc,
+ value -> indexValue(documentParserContext, new CharsRef(value))
+ );
}

- private boolean indexValue(DocumentParserContext context, String value) {
+ private boolean indexValue(DocumentParserContext context, CharsRef value) {
if (value == null) {
return false;
}
@@ -1186,11 +1212,11 @@ private boolean indexValue(DocumentParserContext context, String value) {
return true;
}

- private static String normalizeValue(NamedAnalyzer normalizer, String field, String value) {
+ private static CharsRef normalizeValue(NamedAnalyzer normalizer, String field, CharsRef value) {
if (normalizer == Lucene.KEYWORD_ANALYZER) {
return value;
}
- try (TokenStream ts = normalizer.tokenStream(field, value)) {
+ try (TokenStream ts = normalizer.tokenStream(field, new CharArrayReader(value.chars, value.offset, value.length))) {
final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
if (ts.incrementToken() == false) {
@@ -1199,7 +1225,7 @@ private static String normalizeValue(NamedAnalyzer normalizer, String field, String value) {
but got 0 for analyzer %s and input "%s"
""", normalizer, value));
}
- final String newValue = termAtt.toString();
+ final CharsRef newValue = new CharsRef(termAtt.buffer(), 0, termAtt.length());
if (ts.incrementToken()) {
throw new IllegalStateException(String.format(Locale.ROOT, """
The normalization token stream is expected to produce exactly 1 token, \
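In the normalizeValue change above (in KeywordFieldMapper, going by the static imports of parseTextOrNull), the raw characters are fed to the normalizer through a CharArrayReader instead of via an intermediate String, and the resulting term is wrapped in a CharsRef over the term attribute's buffer. A rough standalone sketch of that flow, using Lucene's KeywordAnalyzer as a stand-in for the field's NamedAnalyzer and omitting the PR's exactly-one-token checks (illustrative only, not part of this PR):

```java
import java.io.CharArrayReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.CharsRef;

class NormalizeSketch {
    // Normalize a CharsRef without ever building a String from the input characters.
    static CharsRef normalize(Analyzer normalizer, String field, CharsRef value) throws Exception {
        try (TokenStream ts = normalizer.tokenStream(field, new CharArrayReader(value.chars, value.offset, value.length))) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            if (ts.incrementToken() == false) {
                throw new IllegalStateException("normalizer produced no tokens");
            }
            // Wrap the term attribute's buffer without copying, mirroring the change above.
            CharsRef normalized = new CharsRef(termAtt.buffer(), 0, termAtt.length());
            ts.end();
            return normalized;
        }
    }

    public static void main(String[] args) throws Exception {
        CharsRef raw = new CharsRef("Hello".toCharArray(), 0, 5);
        System.out.println(normalize(new KeywordAnalyzer(), "field", raw)); // Hello
    }
}
```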
@@ -85,6 +85,7 @@
import java.util.Objects;
import java.util.function.IntPredicate;

+ import static org.elasticsearch.index.mapper.KeywordFieldMapper.parseTextOrNull;
import static org.elasticsearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES;

/** A {@link FieldMapper} for full-text fields. */
@@ -1296,23 +1297,24 @@ public FieldMapper.Builder getMergeBuilder() {

@Override
protected void parseCreateField(DocumentParserContext context) throws IOException {
- final String value = context.parser().textOrNull();
+ final var value = parseTextOrNull(context.parser());

if (value == null) {
return;
}

if (fieldType.indexOptions() != IndexOptions.NONE || fieldType.stored()) {
- Field field = new Field(fieldType().name(), value, fieldType);
+ BytesRef copy = new BytesRef(value);
+ Field field = new Field(fieldType().name(), copy, fieldType);
context.doc().add(field);
if (fieldType.omitNorms()) {
context.addToFieldNames(fieldType().name());
}
if (prefixFieldInfo != null) {
- context.doc().add(new Field(prefixFieldInfo.field, value, prefixFieldInfo.fieldType));
+ context.doc().add(new Field(prefixFieldInfo.field, copy, prefixFieldInfo.fieldType));
}
if (phraseFieldInfo != null) {
- context.doc().add(new Field(phraseFieldInfo.field, value, phraseFieldInfo.fieldType));
+ context.doc().add(new Field(phraseFieldInfo.field, copy, phraseFieldInfo.fieldType));
}
}
}
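Across the three mappers touched here the observable behaviour is meant to stay the same: the UTF-8 bytes handed to Lucene are identical whether they are built from the old intermediate String or straight from the parser's characters, because the BytesRef(CharSequence) constructor performs the same UTF-16 to UTF-8 conversion either way; only the extra String copy goes away. A quick standalone sanity-check sketch (illustrative only, not part of this PR):

```java
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;

class SameBytesSketch {
    public static void main(String[] args) {
        char[] chars = "héllo wörld".toCharArray();
        BytesRef viaString = new BytesRef(new String(chars));                      // old path: String first
        BytesRef viaCharsRef = new BytesRef(new CharsRef(chars, 0, chars.length)); // new path: chars directly
        System.out.println(viaString.equals(viaCharsRef)); // true: identical UTF-8 bytes
    }
}
```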