benchmarks/build.gradle (1 addition, 0 deletions)
@@ -41,6 +41,7 @@ dependencies {
   }
   api(project(':libs:h3'))
   api(project(':modules:aggregations'))
+  implementation project(':modules:mapper-extras');
   api(project(':x-pack:plugin:esql-core'))
   api(project(':x-pack:plugin:core'))
   api(project(':x-pack:plugin:esql'))
MapperServiceFactory.java

@@ -29,6 +29,7 @@
 import org.elasticsearch.index.mapper.ProvidedIdFieldMapper;
 import org.elasticsearch.index.similarity.SimilarityService;
 import org.elasticsearch.indices.IndicesModule;
+import org.elasticsearch.plugins.MapperPlugin;
 import org.elasticsearch.script.Script;
 import org.elasticsearch.script.ScriptCompiler;
 import org.elasticsearch.script.ScriptContext;
@@ -38,11 +39,16 @@
 import java.io.IOException;
 import java.io.UncheckedIOException;
 import java.util.Collections;
+import java.util.List;
 import java.util.Map;
 
 public class MapperServiceFactory {
 
     public static MapperService create(String mappings) {
+        return create(mappings, Collections.emptyList());
+    }
+
+    public static MapperService create(String mappings, List<MapperPlugin> mapperPlugins) {
         Settings settings = Settings.builder()
             .put("index.number_of_replicas", 0)
             .put("index.number_of_shards", 1)
@@ -51,7 +57,7 @@ public static MapperService create(String mappings) {
             .build();
         IndexMetadata meta = IndexMetadata.builder("index").settings(settings).build();
         IndexSettings indexSettings = new IndexSettings(meta, settings);
-        MapperRegistry mapperRegistry = new IndicesModule(Collections.emptyList()).getMapperRegistry();
+        MapperRegistry mapperRegistry = new IndicesModule(mapperPlugins).getMapperRegistry();
 
         SimilarityService similarityService = new SimilarityService(indexSettings, null, Map.of());
         BitsetFilterCache bitsetFilterCache = new BitsetFilterCache(indexSettings, BitsetFilterCache.Listener.NOOP);
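The net effect of the MapperServiceFactory change: benchmark code can now register mapper plugins when building a MapperService. A minimal usage sketch (the mapping body is illustrative; the real call site is in the OptimizedTextBenchmark change below):

// Hypothetical caller: build a MapperService that understands
// "match_only_text" by registering the mapper-extras plugin.
MapperService mapperService = MapperServiceFactory.create("""
    {
        "doc": {
            "properties": {
                "message": { "type": "match_only_text" }
            }
        }
    }
    """, List.of(new MapperExtrasPlugin()));

// Existing callers keep working through the zero-plugin overload,
// which delegates with Collections.emptyList().
MapperService defaultService = MapperServiceFactory.create("""
    { "doc": { "properties": { "message": { "type": "keyword" } } } }
    """);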
OptimizedTextBenchmark.java

@@ -15,6 +15,7 @@
 import org.elasticsearch.common.logging.LogConfigurator;
 import org.elasticsearch.index.mapper.MapperService;
 import org.elasticsearch.index.mapper.SourceToParse;
+import org.elasticsearch.index.mapper.extras.MapperExtrasPlugin;
 import org.elasticsearch.xcontent.XContentBuilder;
 import org.elasticsearch.xcontent.XContentFactory;
 import org.elasticsearch.xcontent.XContentType;
@@ -34,6 +35,7 @@
 import org.openjdk.jmh.infra.Blackhole;
 
 import java.io.IOException;
+import java.util.List;
 import java.util.Random;
 import java.util.concurrent.TimeUnit;
 
@@ -66,7 +68,7 @@ public class OptimizedTextBenchmark {
     private SourceToParse[] sources;
 
     private String randomValue(int length) {
-        final String CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+        final String CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
         Random random = new Random();
         StringBuilder builder = new StringBuilder(length);
         for (int i = 0; i < length; i++) {
@@ -83,17 +85,17 @@ public void setup() throws IOException {
                 "dynamic": false,
                 "properties": {
                     "field": {
-                        "type": "keyword"
+                        "type": "match_only_text"
                     }
                 }
             }
         }
-        """);
+        """, List.of(new MapperExtrasPlugin()));
 
         sources = new SourceToParse[nDocs];
         for (int i = 0; i < nDocs; i++) {
             XContentBuilder b = XContentFactory.jsonBuilder();
-            b.startObject().field("field", randomValue(8)).endObject();
+            b.startObject().field("field", randomValue(512)).endObject();
             sources[i] = new SourceToParse(UUIDs.randomBase64UUID(), BytesReference.bytes(b), XContentType.JSON);
         }
     }
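Why the benchmark inputs changed (my reading, not stated in the diff): keyword indexes each value as a single token, while match_only_text runs the value through an analyzer, so the alphabet gains a space character and values grow from 8 to 512 chars to give the analyzer realistic multi-token input. A standalone sketch of the difference, using Lucene's StandardAnalyzer for illustration (match_only_text actually uses the index's configured default analyzer):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenizeDemo {
    public static void main(String[] args) throws Exception {
        // A value containing spaces produces several terms; without spaces it is one term.
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream ts = analyzer.tokenStream("field", "qWe3 rTy7 uIo9")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term); // qwe3, rty7, uio9 (StandardAnalyzer lowercases)
            }
            ts.end();
        }
    }
}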
MatchOnlyTextFieldMapper.java

@@ -31,6 +31,7 @@
 import org.apache.lucene.util.IOFunction;
 import org.elasticsearch.common.CheckedIntFunction;
 import org.elasticsearch.common.lucene.Lucene;
+import org.elasticsearch.common.text.UTF8DecodingReader;
 import org.elasticsearch.common.unit.Fuzziness;
 import org.elasticsearch.index.IndexVersion;
 import org.elasticsearch.index.IndexVersions;
@@ -384,7 +385,7 @@ public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext
         ) {
             @Override
             protected BytesRef storedToBytesRef(Object stored) {
-                return new BytesRef((String) stored);
+                return (BytesRef) stored;
             }
         };
     }
@@ -442,18 +443,20 @@ public FieldMapper.Builder getMergeBuilder() {
 
     @Override
     protected void parseCreateField(DocumentParserContext context) throws IOException {
-        final String value = context.parser().textOrNull();
+        final var value = context.parser().optimizedTextOrNull();
 
         if (value == null) {
             return;
         }
 
-        Field field = new Field(fieldType().name(), value, fieldType);
+        final var utfBytes = value.bytes();
+        Field field = new Field(fieldType().name(), new UTF8DecodingReader(utfBytes), fieldType);
         context.doc().add(field);
         context.addToFieldNames(fieldType().name());
 
         if (storeSource) {
-            context.doc().add(new StoredField(fieldType().storedFieldNameForSyntheticSource(), value));
+            final var bytesRef = new BytesRef(utfBytes.bytes(), utfBytes.offset(), utfBytes.length());
+            context.doc().add(new StoredField(fieldType().storedFieldNameForSyntheticSource(), bytesRef));
         }
     }
 
@@ -473,7 +476,7 @@ protected SyntheticSourceSupport syntheticSourceSupport() {
             () -> new StringStoredFieldFieldLoader(fieldType().storedFieldNameForSyntheticSource(), fieldType().name(), leafName()) {
                 @Override
                 protected void write(XContentBuilder b, Object value) throws IOException {
-                    b.value((String) value);
+                    b.value(((BytesRef) value).utf8ToString());
                 }
             }
         );
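The heart of the mapper change is that Lucene's Field accepts a Reader in place of a String, so the analyzer consumes chars decoded on the fly from the UTF-8 bytes instead of from a materialized java.lang.String. A caveat that explains the second half of the diff: Reader-valued fields cannot be stored, so the synthetic-source copy is written separately as a BytesRef StoredField. A self-contained sketch of the pattern (StringReader stands in for UTF8DecodingReader; names are illustrative):

import java.io.StringReader;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;

public class ReaderFieldDemo {
    public static void main(String[] args) {
        // An index-only field type, roughly what match_only_text uses.
        FieldType type = new FieldType();
        type.setIndexOptions(IndexOptions.DOCS);
        type.setTokenized(true);
        type.freeze();

        // The analyzer pulls chars straight from the Reader at index time;
        // no intermediate String is created for the field value.
        Field field = new Field("message", new StringReader("the quick brown fox"), type);
        System.out.println(field.readerValue() != null); // true
    }
}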
SourceConfirmedTextQuery.java

@@ -41,6 +41,7 @@
 import org.apache.lucene.search.Weight;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.search.similarities.Similarity.SimScorer;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOFunction;
 import org.elasticsearch.common.CheckedIntFunction;
 import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
@@ -438,7 +439,13 @@ private MemoryIndex getOrCreateMemoryIndex() throws IOException {
                 if (value == null) {
                     continue;
                 }
-                cacheEntry.memoryIndex.addField(field, value.toString(), indexAnalyzer);
+                String valueStr;
+                if (value instanceof BytesRef valueRef) {
+                    valueStr = valueRef.utf8ToString();
+                } else {
+                    valueStr = value.toString();
+                }
+                cacheEntry.memoryIndex.addField(field, valueStr, indexAnalyzer);
             }
         }
         return cacheEntry.memoryIndex;
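Why the memory-index path now needs a type check (my inference from the mapper change above): the synthetic-source stored field is written as a BytesRef, and a stored value is read back with the type it was written with, so values arriving here may be either String or BytesRef. A small sketch of that distinction (field name is illustrative):

import org.apache.lucene.document.StoredField;
import org.apache.lucene.util.BytesRef;

public class StoredValueDemo {
    public static void main(String[] args) {
        // Written as a String: readable via stringValue(), binaryValue() is null.
        StoredField asString = new StoredField("f", "abc");
        System.out.println(asString.stringValue());   // abc
        System.out.println(asString.binaryValue());   // null

        // Written as binary: readable via binaryValue(), stringValue() is null.
        StoredField asBytes = new StoredField("f", new BytesRef("abc"));
        System.out.println(asBytes.binaryValue().utf8ToString()); // abc
        System.out.println(asBytes.stringValue());                // null
    }
}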
MatchOnlyTextFieldMapperTests.java

@@ -123,7 +123,12 @@ public void testDefaults() throws IOException {
         ParsedDocument doc = mapper.parse(source(b -> b.field("field", "1234")));
         List<IndexableField> fields = doc.rootDoc().getFields("field");
         assertEquals(1, fields.size());
-        assertEquals("1234", fields.get(0).stringValue());
+
+        var reader = fields.get(0).readerValue();
+        char[] buff = new char[20];
+        assertEquals(4, reader.read(buff));
+        assertEquals("1234", new String(buff, 0, 4));
+
         IndexableFieldType fieldType = fields.get(0).fieldType();
         assertThat(fieldType.omitNorms(), equalTo(true));
         assertTrue(fieldType.tokenized());
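The assertion changes because the field is now Reader-valued: Field.stringValue() returns null when the value is a Reader, so the test drains readerValue() instead. A hedged illustration of that contract (TextField.TYPE_NOT_STORED stands in for the mapper's actual field type):

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;

public class ReaderValueDemo {
    public static void main(String[] args) throws Exception {
        Field field = new Field("field", new StringReader("1234"), TextField.TYPE_NOT_STORED);
        System.out.println(field.stringValue()); // null: there is no String to return

        Reader reader = field.readerValue();
        char[] buff = new char[20];
        int n = reader.read(buff);                  // 4
        System.out.println(new String(buff, 0, n)); // 1234
    }
}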
UTF8DecodingReader.java (new file)

@@ -0,0 +1,53 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.common.text;

import org.elasticsearch.xcontent.XContentString;

import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;

/**
 * Reader that decodes UTF-8 encoded bytes into chars.
 */
public class UTF8DecodingReader extends Reader {

[Review thread on the line above]

jordan-powers (Contributor, Author) commented on Jun 12, 2025:

    This is basically equivalent to:

        final var reader = new InputStreamReader(
            new ByteArrayInputStream(utfBytes.bytes(), utfBytes.offset(), utfBytes.length()),
            StandardCharsets.UTF_8
        );

    But according to the microbenchmarks, the InputStreamReader/ByteArrayInputStream
    approach is really slow, and was actually slower than the original String-based
    implementation.

jordan-powers (Contributor, Author) commented:

    Here are the results from the microbenchmark:

        Benchmark                                                   (nDocs)  Mode  Cnt    Score   Error  Units
        OptimizedTextBenchmark.indexDocuments (baseline)            1048576  avgt    5  581.242 ± 3.050  ms/op
        OptimizedTextBenchmark.indexDocuments (UTF8DecodingReader)  1048576  avgt    5  544.477 ± 4.961  ms/op
        OptimizedTextBenchmark.indexDocuments (InputStreamReader)   1048576  avgt    5  852.380 ± 6.238  ms/op

Member commented:

    Good catch. Yes, this implementation makes a lot of sense then. Did you also run
    this PR against the elastic/logs (enterprise) benchmark?

jordan-powers (Contributor, Author) commented:

    I haven't yet, but I plan to today.

Member commented:

    Maybe make this a final class?

[End of review thread]

    private final CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
    private final ByteBuffer bytes;

    public UTF8DecodingReader(ByteBuffer bytes) {
        this.bytes = bytes;
    }

    public UTF8DecodingReader(XContentString.UTF8Bytes utf8bytes) {
        this.bytes = ByteBuffer.wrap(utf8bytes.bytes(), utf8bytes.offset(), utf8bytes.length());
    }

    @Override
    public int read(char[] cbuf, int off, int len) {
        return read(CharBuffer.wrap(cbuf, off, len));
    }

    @Override
    public int read(CharBuffer cbuf) {
        if (bytes.hasRemaining() == false) {
            return -1;
        }

        int startPos = cbuf.position();
        decoder.decode(bytes, cbuf, true);
        return cbuf.position() - startPos;
    }

    @Override
    public void close() {}
}
}