Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.benchmark.xcontent;

import org.elasticsearch.benchmark.index.mapper.MapperServiceFactory;
import org.elasticsearch.common.UUIDs;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.logging.LogConfigurator;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.SourceToParse;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentFactory;
import org.elasticsearch.xcontent.XContentType;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Threads;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;

import java.io.IOException;
import java.util.Random;
import java.util.concurrent.TimeUnit;

/**
 * Benchmark to measure indexing performance of keyword fields. Used to measure performance impact of skipping
 * UTF-8 to UTF-16 conversion during document parsing.
 * <p>
 * Each benchmark invocation parses all {@code nDocs} pre-built documents, so the reported average time is per
 * batch of {@code nDocs} documents rather than per document.
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
@Fork(1)
@Threads(1)
@Warmup(iterations = 1)
@Measurement(iterations = 5)
public class OptimizedTextBenchmark {
    static {
        // For Elasticsearch900Lucene101Codec:
        LogConfigurator.loadLog4jPlugins();
        LogConfigurator.configureESLogging();
        LogConfigurator.setNodeName("test");
    }

    /**
     * Alphabet the random field values are drawn from. ASCII letters and digits only.
     */
    private static final String CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";

    /**
     * Fixed seed so that every run of the benchmark parses the same set of documents, which keeps results
     * comparable between runs.
     */
    private static final long SEED = 42L;

    /**
     * Total number of documents to index.
     */
    @Param("1048576")
    private int nDocs;

    private MapperService mapperService;
    private SourceToParse[] sources;

    /**
     * Single generator shared by all {@link #randomValue(int)} calls instead of allocating one per value.
     */
    private final Random random = new Random(SEED);

    /**
     * Builds a random string of the requested length using characters from {@link #CHARS}.
     *
     * @param length number of characters to generate
     * @return the generated string
     */
    private String randomValue(int length) {
        StringBuilder builder = new StringBuilder(length);
        for (int i = 0; i < length; i++) {
            builder.append(CHARS.charAt(random.nextInt(CHARS.length())));
        }
        return builder.toString();
    }

    /**
     * Creates the mapper service with a single {@code keyword} field and pre-builds {@code nDocs} JSON
     * documents, each holding one random 8-character value for that field.
     *
     * @throws IOException if building a document's JSON source fails
     */
    @Setup(Level.Trial)
    public void setup() throws IOException {
        mapperService = MapperServiceFactory.create("""
            {
            "_doc": {
            "dynamic": false,
            "properties": {
            "field": {
            "type": "keyword"
            }
            }
            }
            }
            """);

        // Restart the sequence so that every trial sees the same field values.
        random.setSeed(SEED);
        sources = new SourceToParse[nDocs];
        for (int i = 0; i < nDocs; i++) {
            XContentBuilder b = XContentFactory.jsonBuilder();
            b.startObject().field("field", randomValue(8)).endObject();
            sources[i] = new SourceToParse(UUIDs.randomBase64UUID(), BytesReference.bytes(b), XContentType.JSON);
        }
    }

    /**
     * Parses every pre-built document with the document mapper and hands the result to the blackhole so the
     * work cannot be eliminated by the JIT.
     *
     * @param bh JMH blackhole consuming each parsed document
     */
    @Benchmark
    public void indexDocuments(final Blackhole bh) {
        final var mapper = mapperService.documentMapper();
        for (int i = 0; i < nDocs; i++) {
            bh.consume(mapper.parse(sources[i]));
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.xcontent.provider.json;

import com.fasterxml.jackson.core.JsonEncoding;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonFactoryBuilder;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.io.IOContext;
import com.fasterxml.jackson.core.json.ByteSourceJsonBootstrapper;
import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer;

import java.io.IOException;

/**
 * {@link JsonFactory} that hands out {@link ESUTF8StreamJsonParser} instances for non-empty byte-array input
 * detected as UTF-8, provided charset detection and field-name canonicalization are both enabled. Every other
 * input is parsed by whatever {@link ByteSourceJsonBootstrapper#constructParser} returns.
 */
public class ESJsonFactory extends JsonFactory {
    ESJsonFactory(JsonFactoryBuilder b) {
        super(b);
    }

    /**
     * Creates a parser over {@code data[offset, offset + len)}.
     *
     * @param data   buffer holding the JSON content
     * @param offset index of the first content byte in {@code data}
     * @param len    number of content bytes
     * @param ctxt   I/O context passed through to the created parser
     * @return an {@link ESUTF8StreamJsonParser} when the fast path applies, otherwise the bootstrapper's parser
     * @throws IOException if encoding detection or parser construction fails
     */
    @Override
    protected JsonParser _createParser(byte[] data, int offset, int len, IOContext ctxt) throws IOException {
        final boolean fastPathEnabled = len > 0
            && Feature.CHARSET_DETECTION.enabledIn(_factoryFeatures)
            && Feature.CANONICALIZE_FIELD_NAMES.enabledIn(_factoryFeatures);

        if (fastPathEnabled && new ByteSourceJsonBootstrapper(ctxt, data, offset, len).detectEncoding() == JsonEncoding.UTF8) {
            int contentStart = offset;
            boolean bomAcceptable = true;
            // A leading 0xEF must be followed by 0xBB 0xBF to form a complete BOM, which is then skipped.
            // Anything else starting with 0xEF is left to the default parser below.
            if ((data[offset] & 0xFF) == 0xEF) {
                bomAcceptable = len >= 3 && (data[offset + 1] & 0xFF) == 0xBB && (data[offset + 2] & 0xFF) == 0xBF;
                if (bomAcceptable) {
                    contentStart = offset + 3;
                }
            }
            if (bomAcceptable) {
                final ByteQuadsCanonicalizer symbols = _byteSymbolCanonicalizer.makeChild(_factoryFeatures);
                final int contentEnd = offset + len;
                final int bytesSkipped = contentStart - offset;
                return new ESUTF8StreamJsonParser(
                    ctxt,
                    _parserFeatures,
                    null,
                    _objectCodec,
                    symbols,
                    data,
                    contentStart,
                    contentEnd,
                    bytesSkipped,
                    false
                );
            }
        }

        // Fast path not applicable: use a fresh bootstrapper to build the parser.
        final ByteSourceJsonBootstrapper fallback = new ByteSourceJsonBootstrapper(ctxt, data, offset, len);
        return fallback.constructParser(_parserFeatures, _objectCodec, _byteSymbolCanonicalizer, _rootCharSymbols, _factoryFeatures);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.xcontent.provider.json;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonFactoryBuilder;

/**
 * {@link JsonFactoryBuilder} whose {@link #build()} produces an {@link ESJsonFactory} instead of a plain
 * {@link JsonFactory}.
 */
public class ESJsonFactoryBuilder extends JsonFactoryBuilder {
    /**
     * @return a new {@link ESJsonFactory} constructed from this builder
     */
    @Override
    public JsonFactory build() {
        final JsonFactory factory = new ESJsonFactory(this);
        return factory;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.xcontent.provider.json;

import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.core.ObjectCodec;
import com.fasterxml.jackson.core.SerializableString;
import com.fasterxml.jackson.core.io.IOContext;
import com.fasterxml.jackson.core.json.UTF8StreamJsonParser;
import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer;

import org.elasticsearch.xcontent.Text;
import org.elasticsearch.xcontent.XContentString;

import java.io.IOException;
import java.io.InputStream;

/**
 * {@link UTF8StreamJsonParser} that can expose the raw bytes of the current string value directly from the input
 * buffer, see {@link #getValueAsText()}.
 */
public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser {
    /**
     * Buffer index one past the closing quote of the current string value once {@link #_finishAndReturnText()}
     * has located it, or {@code -1} when no such position is recorded.
     */
    protected int stringEnd = -1;

    public ESUTF8StreamJsonParser(
        IOContext ctxt,
        int features,
        InputStream in,
        ObjectCodec codec,
        ByteQuadsCanonicalizer sym,
        byte[] inputBuffer,
        int start,
        int end,
        int bytesPreProcessed,
        boolean bufferRecyclable
    ) {
        super(ctxt, features, in, codec, sym, inputBuffer, start, end, bytesPreProcessed, bufferRecyclable);
    }

    /**
     * Method that will try to get underlying UTF-8 encoded bytes of the current string token.
     * This is only a best-effort attempt; if there is some reason the bytes cannot be retrieved, this method will return null.
     * Currently, this is only implemented for ascii-only strings that do not contain escaped characters.
     *
     * @return the string value backed by the input buffer, or {@code null} if it cannot be provided
     */
    public Text getValueAsText() throws IOException {
        final boolean unreadString = _currToken == JsonToken.VALUE_STRING && _tokenIncomplete;
        if (unreadString == false) {
            return null;
        }
        if (stringEnd <= 0) {
            // End of the string not located yet: scan for it.
            return _finishAndReturnText();
        }
        // The end was already found by an earlier call; stringEnd points one past the closing quote.
        // For now, the byte count doubles as the string length because we only support ascii-encoded
        // unescaped strings, which means each character uses exactly 1 byte.
        final int byteCount = stringEnd - 1 - _inputPtr;
        return new Text(new XContentString.UTF8Bytes(_inputBuffer, _inputPtr, byteCount), byteCount);
    }

    /**
     * Scans the currently buffered input from the read position for the closing quote of the string value. On
     * success records the position after the quote in {@link #stringEnd} and returns the bytes in between.
     *
     * @return the string value, or {@code null} if a byte with a non-zero UTF-8 input code other than the quote
     *         is met first, or if the buffered input ends before the quote
     */
    protected Text _finishAndReturnText() throws IOException {
        if (_inputPtr >= _inputEnd) {
            _loadMoreGuaranteed();
        }

        final int begin = _inputPtr;
        final int limit = _inputEnd;
        final byte[] buffer = _inputBuffer;
        final int[] inputCodes = INPUT_CODES_UTF8;
        for (int pos = begin; pos < limit; pos++) {
            final int current = buffer[pos] & 0xFF;
            if (inputCodes[current] == 0) {
                continue;
            }
            if (current != INT_QUOTE) {
                // Special byte inside the string: give up on the fast path.
                return null;
            }
            stringEnd = pos + 1;
            // For now, the byte count doubles as the string length because we only support ascii-encoded
            // unescaped strings, which means each character uses exactly 1 byte.
            final int byteCount = pos - begin;
            return new Text(new XContentString.UTF8Bytes(buffer, begin, byteCount), byteCount);
        }
        return null;
    }

    @Override
    public JsonToken nextToken() throws IOException {
        consumeLocatedString();
        return super.nextToken();
    }

    @Override
    public boolean nextFieldName(SerializableString str) throws IOException {
        consumeLocatedString();
        return super.nextFieldName(str);
    }

    @Override
    public String nextFieldName() throws IOException {
        consumeLocatedString();
        return super.nextFieldName();
    }

    /**
     * If the current string value is still marked incomplete but its end is recorded in {@link #stringEnd},
     * moves the read position past it and marks the token complete. Always clears {@link #stringEnd}.
     */
    private void consumeLocatedString() {
        if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete && stringEnd > 0) {
            _inputPtr = stringEnd;
            _tokenIncomplete = false;
        }
        stringEnd = -1;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

import com.fasterxml.jackson.core.JsonEncoding;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonFactoryBuilder;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParser;

Expand Down Expand Up @@ -47,7 +46,7 @@ public static final XContent jsonXContent() {
}

static {
jsonFactory = XContentImplUtils.configure(new JsonFactoryBuilder());
jsonFactory = XContentImplUtils.configure(new ESJsonFactoryBuilder());
jsonFactory.configure(JsonGenerator.Feature.QUOTE_FIELD_NAMES, true);
jsonFactory.configure(JsonParser.Feature.ALLOW_COMMENTS, true);
jsonFactory.configure(JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW, false); // this trips on many mappings now...
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@
import com.fasterxml.jackson.core.io.JsonEOFException;

import org.elasticsearch.core.IOUtils;
import org.elasticsearch.xcontent.Text;
import org.elasticsearch.xcontent.XContentEOFException;
import org.elasticsearch.xcontent.XContentLocation;
import org.elasticsearch.xcontent.XContentParseException;
import org.elasticsearch.xcontent.XContentParserConfiguration;
import org.elasticsearch.xcontent.XContentString;
import org.elasticsearch.xcontent.XContentType;
import org.elasticsearch.xcontent.provider.XContentParserConfigurationImpl;
import org.elasticsearch.xcontent.support.AbstractXContentParser;
Expand Down Expand Up @@ -115,6 +117,20 @@ public String text() throws IOException {
}
}

/**
 * Returns the current value as an {@link XContentString}. When the underlying parser is an
 * {@link ESUTF8StreamJsonParser} and it can supply the value via {@code getValueAsText()}, that result is
 * returned; otherwise the value is read through {@link #text()} and wrapped in a {@link Text}.
 *
 * @throws IllegalArgumentException if the current token is not a value token
 */
@Override
public XContentString optimizedText() throws IOException {
    if (currentToken().isValue() == false) {
        throwOnNoText();
    }
    final Text direct = parser instanceof ESUTF8StreamJsonParser utf8Parser ? utf8Parser.getValueAsText() : null;
    return direct != null ? direct : new Text(text());
}

/**
 * Always throws, reporting the current token location and the token that was found instead of text.
 *
 * @throws IllegalArgumentException unconditionally
 */
private void throwOnNoText() {
    final String message = "Expected text at " + getTokenLocation() + " but found " + currentToken();
    throw new IllegalArgumentException(message);
}
Expand Down
Loading