diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/bytes/RecyclerBytesStreamOutputBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/bytes/RecyclerBytesStreamOutputBenchmark.java index 691fd3a2dfb0f..99ff455a360c4 100644 --- a/benchmarks/src/main/java/org/elasticsearch/benchmark/bytes/RecyclerBytesStreamOutputBenchmark.java +++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/bytes/RecyclerBytesStreamOutputBenchmark.java @@ -10,6 +10,7 @@ package org.elasticsearch.benchmark.bytes; import org.apache.lucene.util.BytesRef; +import org.elasticsearch.benchmark.common.util.UTF8StringBytesBenchmark; import org.elasticsearch.common.io.stream.RecyclerBytesStreamOutput; import org.elasticsearch.common.recycler.Recycler; import org.openjdk.jmh.annotations.Benchmark; @@ -65,10 +66,10 @@ public void initResults() throws IOException { // We use weights to generate certain sized UTF-8 characters and vInts. However, there is still some non-determinism which could // impact direct comparisons run-to-run - shortString = generateAsciiString(20); - longString = generateAsciiString(100); - nonAsciiString = generateUtf8String(200); - veryLongString = generateAsciiString(800); + shortString = UTF8StringBytesBenchmark.generateAsciiString(20); + longString = UTF8StringBytesBenchmark.generateAsciiString(100); + nonAsciiString = UTF8StringBytesBenchmark.generateUTF8String(200); + veryLongString = UTF8StringBytesBenchmark.generateAsciiString(800); // vint values for benchmarking vints = new int[1000]; for (int i = 0; i < vints.length; i++) { @@ -143,49 +144,6 @@ public void writeVInt() throws IOException { } } - public static String generateAsciiString(int n) { - ThreadLocalRandom random = ThreadLocalRandom.current(); - StringBuilder sb = new StringBuilder(n); - - for (int i = 0; i < n; i++) { - int ascii = random.nextInt(128); - sb.append((char) ascii); - } - - return sb.toString(); - } - - public static String generateUtf8String(int n) { - ThreadLocalRandom random = ThreadLocalRandom.current(); - StringBuilder sb = new StringBuilder(n); - - for (int i = 0; i < n; i++) { - int codePoint; - int probability = random.nextInt(100); - - if (probability < 85) { - // 1-byte UTF-8 (ASCII range) - // 0x0000 to 0x007F - codePoint = random.nextInt(0x0080); - } else if (probability < 95) { - // 2-byte UTF-8 - // 0x0080 to 0x07FF - codePoint = random.nextInt(0x0080, 0x0800); - } else { - // 3-byte UTF-8 - // 0x0800 to 0xFFFF - do { - codePoint = random.nextInt(0x0800, 0x10000); - // Skip surrogate pairs (0xD800-0xDFFF) - } while (codePoint >= 0xD800 && codePoint <= 0xDFFF); - } - - sb.appendCodePoint(codePoint); - } - - return sb.toString(); - } - private record BenchmarkRecycler(AtomicReference bytesRef) implements Recycler { @Override diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/common/util/UTF8StringBytesBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/common/util/UTF8StringBytesBenchmark.java new file mode 100644 index 0000000000000..bdd38e372ef01 --- /dev/null +++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/common/util/UTF8StringBytesBenchmark.java @@ -0,0 +1,148 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.benchmark.common.util; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.UnicodeUtil; +import org.elasticsearch.common.UUIDs; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 3) +@Measurement(iterations = 3) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Fork(value = 1) +public class UTF8StringBytesBenchmark { + + @State(Scope.Thread) + public static class StringState { + @Param({ "uuid", "short", "long", "nonAscii", "veryLong" }) + String stringType; + + String string; + BytesRef bytes; + + @Setup + public void setup() { + string = switch (stringType) { + case "uuid" -> UUIDs.base64UUID(); + case "short" -> generateAsciiString(20); + case "long" -> generateAsciiString(100); + case "nonAscii" -> generateUTF8String(200); + case "veryLong" -> generateAsciiString(1000); + default -> throw new IllegalArgumentException("Unknown stringType: " + stringType); + }; + bytes = getBytes(string); + } + } + + @Benchmark + public BytesRef getBytesJDK(StringState state) { + byte[] bytes = state.string.getBytes(StandardCharsets.UTF_8); + return new BytesRef(bytes, 0, bytes.length); + } + + @Benchmark + public BytesRef getBytesUnicodeUtils(StringState state) { + String string = state.string; + int length = string.length(); + int size = UnicodeUtil.calcUTF16toUTF8Length(string, 0, length); + byte[] out = new byte[size]; + UnicodeUtil.UTF16toUTF8(string, 0, length, out, 0); + return new BytesRef(out, 0, out.length); + } + + @Benchmark + public BytesRef getBytesByteBufferEncoder(StringState state) { + var byteBuff = StandardCharsets.UTF_8.encode(state.string); + assert byteBuff.hasArray(); + return new BytesRef(byteBuff.array(), byteBuff.arrayOffset() + byteBuff.position(), byteBuff.remaining()); + } + + @Benchmark + public String getStringJDK(StringState state) { + BytesRef bytes = state.bytes; + return new String(bytes.bytes, bytes.offset, bytes.length, StandardCharsets.UTF_8); + } + + @Benchmark + public String getStringByteBufferDecoder(StringState state) { + BytesRef bytes = state.bytes; + var byteBuff = ByteBuffer.wrap(bytes.bytes, bytes.offset, bytes.length); + return StandardCharsets.UTF_8.decode(byteBuff).toString(); + } + + private static BytesRef getBytes(String string) { + int before = ThreadLocalRandom.current().nextInt(0, 50); + int after = ThreadLocalRandom.current().nextInt(0, 50); + byte[] stringBytes = string.getBytes(StandardCharsets.UTF_8); + byte[] finalBytes = new byte[before + after + stringBytes.length]; + System.arraycopy(stringBytes, 0, finalBytes, before, stringBytes.length); + return new BytesRef(finalBytes, before, stringBytes.length); + } + + public static String generateAsciiString(int n) { + ThreadLocalRandom random = ThreadLocalRandom.current(); + StringBuilder sb = new StringBuilder(n); + + for (int i = 0; i < n; i++) { + int ascii = random.nextInt(128); + sb.append((char) ascii); + } + + return sb.toString(); + } + + public static String generateUTF8String(int n) { + ThreadLocalRandom random = ThreadLocalRandom.current(); + StringBuilder sb = new StringBuilder(n); + + for (int i = 0; i < n; i++) { + int codePoint; + int probability = random.nextInt(100); + + if (probability < 85) { + // 1-byte UTF-8 (ASCII range) + // 0x0000 to 0x007F + codePoint = random.nextInt(0x0080); + } else if (probability < 95) { + // 2-byte UTF-8 + // 0x0080 to 0x07FF + codePoint = random.nextInt(0x0080, 0x0800); + } else { + // 3-byte UTF-8 + // 0x0800 to 0xFFFF + do { + codePoint = random.nextInt(0x0800, 0x10000); + // Skip surrogate pairs (0xD800-0xDFFF) + } while (codePoint >= 0xD800 && codePoint <= 0xDFFF); + } + + sb.appendCodePoint(codePoint); + } + + return sb.toString(); + } +} diff --git a/libs/x-content/src/main/java/org/elasticsearch/xcontent/Text.java b/libs/x-content/src/main/java/org/elasticsearch/xcontent/Text.java index bd0168bbc6684..48c8ddba14bc7 100644 --- a/libs/x-content/src/main/java/org/elasticsearch/xcontent/Text.java +++ b/libs/x-content/src/main/java/org/elasticsearch/xcontent/Text.java @@ -9,7 +9,6 @@ package org.elasticsearch.xcontent; import java.io.IOException; -import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; /** @@ -68,9 +67,8 @@ public boolean hasBytes() { @Override public UTF8Bytes bytes() { if (bytes == null) { - var byteBuff = StandardCharsets.UTF_8.encode(string); - assert byteBuff.hasArray(); - bytes = new UTF8Bytes(byteBuff.array(), byteBuff.arrayOffset() + byteBuff.position(), byteBuff.remaining()); + byte[] byteArray = string.getBytes(StandardCharsets.UTF_8); + bytes = new UTF8Bytes(byteArray, 0, byteArray.length); } return bytes; } @@ -85,8 +83,7 @@ public boolean hasString() { @Override public String string() { if (string == null) { - var byteBuff = ByteBuffer.wrap(bytes.bytes(), bytes.offset(), bytes.length()); - string = StandardCharsets.UTF_8.decode(byteBuff).toString(); + string = new String(bytes.bytes(), bytes.offset(), bytes.length(), StandardCharsets.UTF_8); assert (stringLength < 0) || (string.length() == stringLength); } return string;