experiment with XContentParser#XContentParser()

martijnvg · martijnvg · commit 9c04c4f27700 · 2025-02-28T18:47:38.000+01:00
diff --git a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java
@@ -28,6 +28,7 @@
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IOFunction;
 import org.elasticsearch.common.CheckedIntFunction;
 import org.elasticsearch.common.lucene.Lucene;
@@ -70,6 +71,8 @@
 import java.util.Objects;
 import java.util.Set;
 
+import static org.elasticsearch.index.mapper.KeywordFieldMapper.parseTextOrNull;
+
 /**
  * A {@link FieldMapper} for full-text fields that only indexes
  * {@link IndexOptions#DOCS} and runs positional queries by looking at the
@@ -438,18 +441,19 @@ public FieldMapper.Builder getMergeBuilder() {
 
     @Override
     protected void parseCreateField(DocumentParserContext context) throws IOException {
-        final String value = context.parser().textOrNull();
+        final CharsRef value = parseTextOrNull(context.parser());
 
         if (value == null) {
             return;
         }
 
-        Field field = new Field(fieldType().name(), value, fieldType);
+        BytesRef copy = new BytesRef(value);
+        Field field = new Field(fieldType().name(), copy, fieldType);
         context.doc().add(field);
         context.addToFieldNames(fieldType().name());
 
         if (storeSource) {
-            context.doc().add(new StoredField(fieldType().storedFieldNameForSyntheticSource(), value));
+            context.doc().add(new StoredField(fieldType().storedFieldNameForSyntheticSource(), copy));
         }
     }
 
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java
@@ -29,6 +29,7 @@
 import org.apache.lucene.search.MultiTermQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.CompiledAutomaton;
@@ -69,6 +70,7 @@
 import org.elasticsearch.xcontent.XContentBuilder;
 import org.elasticsearch.xcontent.XContentParser;
 
+import java.io.CharArrayReader;
 import java.io.IOException;
 import java.io.UncheckedIOException;
 import java.util.ArrayList;
@@ -891,7 +893,7 @@ private String applyIgnoreAboveAndNormalizer(String value) {
                 return null;
             }
 
-            return normalizeValue(normalizer(), name(), value);
+            return normalizeValue(normalizer(), name(), new CharsRef(value)).toString();
         }
 
         @Override
@@ -1104,9 +1106,10 @@ public String getOffsetFieldName() {
     }
 
     protected void parseCreateField(DocumentParserContext context) throws IOException {
-        String value = context.parser().textOrNull();
+        CharsRef value = parseTextOrNull(context.parser());
         if (value == null) {
-            value = fieldType().nullValue;
+            // TODO: fix conversion; nullValue may be null (indexValue null-checks it) and new CharsRef((String) null) would throw NPE
+            value = new CharsRef(fieldType().nullValue);
         }
 
         boolean indexed = indexValue(context, value);
@@ -1119,17 +1122,40 @@ protected void parseCreateField(DocumentParserContext context) throws IOExceptio
         }
     }
 
+    /**
+     * Wraps the current value's characters in a {@link CharsRef} without copying them, unlike {@link XContentParser#textOrNull()}.
+     * Typically two copies are made: one when {@link XContentParser#textOrNull()} is invoked, another when converting to {@link BytesRef}.
+     * Returns {@code null} for a null token; any other non-value token trips an assertion and, with assertions disabled, yields null.
+     * NOTE(review): the result shares the array returned by the parser; presumably it must be copied before the parser advances.
+     */
+    public static CharsRef parseTextOrNull(XContentParser parser) throws IOException {
+        var currentToken = parser.currentToken();
+        if (currentToken == XContentParser.Token.VALUE_NULL) {
+            return null;
+        } else if (currentToken.isValue()) {
+            return new CharsRef(parser.textCharacters(), parser.textOffset(), parser.textLength());
+        } else {
+            assert false : "unexpected token [" + currentToken + "]";
+            return null;
+        }
+    }
+
     @Override
     protected void indexScriptValues(
         SearchLookup searchLookup,
         LeafReaderContext readerContext,
         int doc,
         DocumentParserContext documentParserContext
     ) {
-        this.fieldType().scriptValues.valuesForDoc(searchLookup, readerContext, doc, value -> indexValue(documentParserContext, value));
+        this.fieldType().scriptValues.valuesForDoc(
+            searchLookup,
+            readerContext,
+            doc,
+            value -> indexValue(documentParserContext, new CharsRef(value))
+        );
     }
 
-    private boolean indexValue(DocumentParserContext context, String value) {
+    private boolean indexValue(DocumentParserContext context, CharsRef value) {
         if (value == null) {
             return false;
         }
@@ -1186,11 +1212,11 @@ private boolean indexValue(DocumentParserContext context, String value) {
         return true;
     }
 
-    private static String normalizeValue(NamedAnalyzer normalizer, String field, String value) {
+    private static CharsRef normalizeValue(NamedAnalyzer normalizer, String field, CharsRef value) {
         if (normalizer == Lucene.KEYWORD_ANALYZER) {
             return value;
         }
-        try (TokenStream ts = normalizer.tokenStream(field, value)) {
+        try (TokenStream ts = normalizer.tokenStream(field, new CharArrayReader(value.chars, value.offset, value.length))) {
             final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
             ts.reset();
             if (ts.incrementToken() == false) {
@@ -1199,7 +1225,7 @@ private static String normalizeValue(NamedAnalyzer normalizer, String field, Str
                     but got 0 for analyzer %s and input "%s"
                     """, normalizer, value));
             }
-            final String newValue = termAtt.toString();
+            final CharsRef newValue = new CharsRef(termAtt.buffer(), 0, termAtt.length());
             if (ts.incrementToken()) {
                 throw new IllegalStateException(String.format(Locale.ROOT, """
                     The normalization token stream is expected to produce exactly 1 token, \
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java
@@ -85,6 +85,7 @@
 import java.util.Objects;
 import java.util.function.IntPredicate;
 
+import static org.elasticsearch.index.mapper.KeywordFieldMapper.parseTextOrNull;
 import static org.elasticsearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES;
 
 /** A {@link FieldMapper} for full-text fields. */
@@ -1296,23 +1297,24 @@ public FieldMapper.Builder getMergeBuilder() {
 
     @Override
     protected void parseCreateField(DocumentParserContext context) throws IOException {
-        final String value = context.parser().textOrNull();
+        final var value = parseTextOrNull(context.parser());
 
         if (value == null) {
             return;
         }
 
         if (fieldType.indexOptions() != IndexOptions.NONE || fieldType.stored()) {
-            Field field = new Field(fieldType().name(), value, fieldType);
+            BytesRef copy = new BytesRef(value);
+            Field field = new Field(fieldType().name(), copy, fieldType);
             context.doc().add(field);
             if (fieldType.omitNorms()) {
                 context.addToFieldNames(fieldType().name());
             }
             if (prefixFieldInfo != null) {
-                context.doc().add(new Field(prefixFieldInfo.field, value, prefixFieldInfo.fieldType));
+                context.doc().add(new Field(prefixFieldInfo.field, copy, prefixFieldInfo.fieldType));
             }
             if (phraseFieldInfo != null) {
-                context.doc().add(new Field(phraseFieldInfo.field, value, phraseFieldInfo.fieldType));
+                context.doc().add(new Field(phraseFieldInfo.field, copy, phraseFieldInfo.fieldType));
             }
         }
     }

Original file line number	Diff line number	Diff line change
`@@ -85,6 +85,7 @@`
`85`	`85`	`import java.util.Objects;`
`86`	`86`	`import java.util.function.IntPredicate;`
`87`	`87`
	`88`	`+import static org.elasticsearch.index.mapper.KeywordFieldMapper.parseTextOrNull;`
`88`	`89`	`import static org.elasticsearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES;`
`89`	`90`
`90`	`91`	`/** A {@link FieldMapper} for full-text fields. */`
`@@ -1296,23 +1297,24 @@ public FieldMapper.Builder getMergeBuilder() {`
`1296`	`1297`
`1297`	`1298`	`@Override`
`1298`	`1299`	`protected void parseCreateField(DocumentParserContext context) throws IOException {`
`1299`		`- final String value = context.parser().textOrNull();`
	`1300`	`+ final var value = parseTextOrNull(context.parser());`
`1300`	`1301`
`1301`	`1302`	`if (value == null) {`
`1302`	`1303`	`return;`
`1303`	`1304`	`}`
`1304`	`1305`
`1305`	`1306`	`if (fieldType.indexOptions() != IndexOptions.NONE \|\| fieldType.stored()) {`
`1306`		`- Field field = new Field(fieldType().name(), value, fieldType);`
	`1307`	`+ BytesRef copy = new BytesRef(value);`
	`1308`	`+ Field field = new Field(fieldType().name(), copy, fieldType);`
`1307`	`1309`	`context.doc().add(field);`
`1308`	`1310`	`if (fieldType.omitNorms()) {`
`1309`	`1311`	`context.addToFieldNames(fieldType().name());`
`1310`	`1312`	`}`
`1311`	`1313`	`if (prefixFieldInfo != null) {`
`1312`		`- context.doc().add(new Field(prefixFieldInfo.field, value, prefixFieldInfo.fieldType));`
	`1314`	`+ context.doc().add(new Field(prefixFieldInfo.field, copy, prefixFieldInfo.fieldType));`
`1313`	`1315`	`}`
`1314`	`1316`	`if (phraseFieldInfo != null) {`
`1315`		`- context.doc().add(new Field(phraseFieldInfo.field, value, phraseFieldInfo.fieldType));`
	`1317`	`+ context.doc().add(new Field(phraseFieldInfo.field, copy, phraseFieldInfo.fieldType));`
`1316`	`1318`	`}`
`1317`	`1319`	`}`
`1318`	`1320`	`}`