From 9c04c4f277006493a8bec49c0ff7e60fc18eaf32 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Fri, 28 Feb 2025 18:47:38 +0100 Subject: [PATCH] experiment with XContentParser#XContentParser() --- .../extras/MatchOnlyTextFieldMapper.java | 10 +++-- .../index/mapper/KeywordFieldMapper.java | 42 +++++++++++++++---- .../index/mapper/TextFieldMapper.java | 10 +++-- 3 files changed, 47 insertions(+), 15 deletions(-) diff --git a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java index 055f6091ac484..a1fd51852fb6d 100644 --- a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java +++ b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java @@ -28,6 +28,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.IOFunction; import org.elasticsearch.common.CheckedIntFunction; import org.elasticsearch.common.lucene.Lucene; @@ -70,6 +71,8 @@ import java.util.Objects; import java.util.Set; +import static org.elasticsearch.index.mapper.KeywordFieldMapper.parseTextOrNull; + /** * A {@link FieldMapper} for full-text fields that only indexes * {@link IndexOptions#DOCS} and runs positional queries by looking at the @@ -438,18 +441,19 @@ public FieldMapper.Builder getMergeBuilder() { @Override protected void parseCreateField(DocumentParserContext context) throws IOException { - final String value = context.parser().textOrNull(); + final CharsRef value = parseTextOrNull(context.parser()); if (value == null) { return; } - Field field = new Field(fieldType().name(), value, fieldType); + String copy = value.toString(); // analyzed text needs a String value; Lucene cannot tokenize a BytesRef + Field field = new Field(fieldType().name(), copy, fieldType); 
context.doc().add(field); context.addToFieldNames(fieldType().name()); if (storeSource) { - context.doc().add(new StoredField(fieldType().storedFieldNameForSyntheticSource(), value)); + context.doc().add(new StoredField(fieldType().storedFieldNameForSyntheticSource(), copy)); } } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java index 127024f17a222..f0e2c98c9169b 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java @@ -29,6 +29,7 @@ import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.Query; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.CompiledAutomaton; @@ -69,6 +70,7 @@ import org.elasticsearch.xcontent.XContentBuilder; import org.elasticsearch.xcontent.XContentParser; +import java.io.CharArrayReader; import java.io.IOException; import java.io.UncheckedIOException; import java.util.ArrayList; @@ -891,7 +893,7 @@ private String applyIgnoreAboveAndNormalizer(String value) { return null; } - return normalizeValue(normalizer(), name(), value); + return normalizeValue(normalizer(), name(), new CharsRef(value)).toString(); } @Override @@ -1104,9 +1106,10 @@ public String getOffsetFieldName() { } protected void parseCreateField(DocumentParserContext context) throws IOException { - String value = context.parser().textOrNull(); + CharsRef value = parseTextOrNull(context.parser()); if (value == null) { - value = fieldType().nullValue; + // TODO: fix conversion; null_value may be unset, so stay null rather than NPE in new CharsRef(String) + value = fieldType().nullValue == null ? null : new CharsRef(fieldType().nullValue); } boolean indexed = indexValue(context, value); @@ -1119,6 +1122,24 @@ protected void parseCreateField(DocumentParserContext context) 
throws IOExceptio } } + /** + * Parses the current value without copying it, unlike {@link XContentParser#textOrNull()}, which returns a new string. + * + * Typically, two copies are made: the first when {@link XContentParser#textOrNull()} is invoked, + * and the second when the result is converted to {@link BytesRef}. + */ + public static CharsRef parseTextOrNull(XContentParser parser) throws IOException { + var currentToken = parser.currentToken(); + if (currentToken == XContentParser.Token.VALUE_NULL) { + return null; + } else if (currentToken.isValue()) { + return new CharsRef(parser.textCharacters(), parser.textOffset(), parser.textLength()); + } else { + assert false : "unexpected token [" + currentToken + "]"; + return null; + } + } + @Override protected void indexScriptValues( SearchLookup searchLookup, @@ -1126,10 +1147,15 @@ protected void indexScriptValues( int doc, DocumentParserContext documentParserContext ) { - this.fieldType().scriptValues.valuesForDoc(searchLookup, readerContext, doc, value -> indexValue(documentParserContext, value)); + this.fieldType().scriptValues.valuesForDoc( + searchLookup, + readerContext, + doc, + value -> indexValue(documentParserContext, new CharsRef(value)) + ); } - private boolean indexValue(DocumentParserContext context, String value) { + private boolean indexValue(DocumentParserContext context, CharsRef value) { if (value == null) { return false; } @@ -1186,11 +1212,11 @@ private boolean indexValue(DocumentParserContext context, String value) { return true; } - private static String normalizeValue(NamedAnalyzer normalizer, String field, String value) { + private static CharsRef normalizeValue(NamedAnalyzer normalizer, String field, CharsRef value) { if (normalizer == Lucene.KEYWORD_ANALYZER) { return value; } - try (TokenStream ts = normalizer.tokenStream(field, value)) { + try (TokenStream ts = normalizer.tokenStream(field, new CharArrayReader(value.chars, value.offset, value.length))) { final CharTermAttribute termAtt = 
ts.addAttribute(CharTermAttribute.class); ts.reset(); if (ts.incrementToken() == false) { @@ -1199,7 +1225,7 @@ private static String normalizeValue(NamedAnalyzer normalizer, String field, Str but got 0 for analyzer %s and input "%s" """, normalizer, value)); } - final String newValue = termAtt.toString(); + final CharsRef newValue = CharsRef.deepCopyOf(new CharsRef(termAtt.buffer(), 0, termAtt.length())); // buffer is reused if (ts.incrementToken()) { throw new IllegalStateException(String.format(Locale.ROOT, """ The normalization token stream is expected to produce exactly 1 token, \ diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java index 01b275e0a382e..f6fbf567635c3 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java @@ -85,6 +85,7 @@ import java.util.Objects; import java.util.function.IntPredicate; +import static org.elasticsearch.index.mapper.KeywordFieldMapper.parseTextOrNull; import static org.elasticsearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES; /** A {@link FieldMapper} for full-text fields. 
*/ @@ -1296,23 +1297,24 @@ public FieldMapper.Builder getMergeBuilder() { @Override protected void parseCreateField(DocumentParserContext context) throws IOException { - final String value = context.parser().textOrNull(); + final var value = parseTextOrNull(context.parser()); if (value == null) { return; } if (fieldType.indexOptions() != IndexOptions.NONE || fieldType.stored()) { - Field field = new Field(fieldType().name(), value, fieldType); + String copy = value.toString(); // analyzed text needs a String value; Lucene cannot tokenize a BytesRef + Field field = new Field(fieldType().name(), copy, fieldType); context.doc().add(field); if (fieldType.omitNorms()) { context.addToFieldNames(fieldType().name()); } if (prefixFieldInfo != null) { - context.doc().add(new Field(prefixFieldInfo.field, value, prefixFieldInfo.fieldType)); + context.doc().add(new Field(prefixFieldInfo.field, copy, prefixFieldInfo.fieldType)); } if (phraseFieldInfo != null) { - context.doc().add(new Field(phraseFieldInfo.field, value, phraseFieldInfo.fieldType)); + context.doc().add(new Field(phraseFieldInfo.field, copy, phraseFieldInfo.fieldType)); } } }