Skip to content

Commit c3ae278

Browse files
authored
Use direct byte[] utf-8 conversions (#136053)
Currently Elasticsearch uses the Charset#decode and Charset#encode methods (via StandardCharsets.UTF_8) when working with optimized text. These ByteBuffer-based variants are not as performant as the direct implementations in String when working with byte[]. If we are going to do a one-shot conversion without validation, then the String variants should be preferred.
1 parent 5ea2d96 commit c3ae278

File tree

3 files changed

+156
-53
lines changed

3 files changed

+156
-53
lines changed

benchmarks/src/main/java/org/elasticsearch/benchmark/bytes/RecyclerBytesStreamOutputBenchmark.java

Lines changed: 5 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
package org.elasticsearch.benchmark.bytes;
1111

1212
import org.apache.lucene.util.BytesRef;
13+
import org.elasticsearch.benchmark.common.util.UTF8StringBytesBenchmark;
1314
import org.elasticsearch.common.io.stream.RecyclerBytesStreamOutput;
1415
import org.elasticsearch.common.recycler.Recycler;
1516
import org.openjdk.jmh.annotations.Benchmark;
@@ -65,10 +66,10 @@ public void initResults() throws IOException {
6566
// We use weights to generate certain sized UTF-8 characters and vInts. However, there is still some non-determinism which could
6667
// impact direct comparisons run-to-run
6768

68-
shortString = generateAsciiString(20);
69-
longString = generateAsciiString(100);
70-
nonAsciiString = generateUtf8String(200);
71-
veryLongString = generateAsciiString(800);
69+
shortString = UTF8StringBytesBenchmark.generateAsciiString(20);
70+
longString = UTF8StringBytesBenchmark.generateAsciiString(100);
71+
nonAsciiString = UTF8StringBytesBenchmark.generateUTF8String(200);
72+
veryLongString = UTF8StringBytesBenchmark.generateAsciiString(800);
7273
// vint values for benchmarking
7374
vints = new int[1000];
7475
for (int i = 0; i < vints.length; i++) {
@@ -143,49 +144,6 @@ public void writeVInt() throws IOException {
143144
}
144145
}
145146

146-
public static String generateAsciiString(int n) {
147-
ThreadLocalRandom random = ThreadLocalRandom.current();
148-
StringBuilder sb = new StringBuilder(n);
149-
150-
for (int i = 0; i < n; i++) {
151-
int ascii = random.nextInt(128);
152-
sb.append((char) ascii);
153-
}
154-
155-
return sb.toString();
156-
}
157-
158-
public static String generateUtf8String(int n) {
159-
ThreadLocalRandom random = ThreadLocalRandom.current();
160-
StringBuilder sb = new StringBuilder(n);
161-
162-
for (int i = 0; i < n; i++) {
163-
int codePoint;
164-
int probability = random.nextInt(100);
165-
166-
if (probability < 85) {
167-
// 1-byte UTF-8 (ASCII range)
168-
// 0x0000 to 0x007F
169-
codePoint = random.nextInt(0x0080);
170-
} else if (probability < 95) {
171-
// 2-byte UTF-8
172-
// 0x0080 to 0x07FF
173-
codePoint = random.nextInt(0x0080, 0x0800);
174-
} else {
175-
// 3-byte UTF-8
176-
// 0x0800 to 0xFFFF
177-
do {
178-
codePoint = random.nextInt(0x0800, 0x10000);
179-
// Skip surrogate pairs (0xD800-0xDFFF)
180-
} while (codePoint >= 0xD800 && codePoint <= 0xDFFF);
181-
}
182-
183-
sb.appendCodePoint(codePoint);
184-
}
185-
186-
return sb.toString();
187-
}
188-
189147
private record BenchmarkRecycler(AtomicReference<BytesRef> bytesRef) implements Recycler<BytesRef> {
190148

191149
@Override
benchmarks/src/main/java/org/elasticsearch/benchmark/common/util/UTF8StringBytesBenchmark.java

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.benchmark.common.util;
11+
12+
import org.apache.lucene.util.BytesRef;
13+
import org.apache.lucene.util.UnicodeUtil;
14+
import org.elasticsearch.common.UUIDs;
15+
import org.openjdk.jmh.annotations.Benchmark;
16+
import org.openjdk.jmh.annotations.BenchmarkMode;
17+
import org.openjdk.jmh.annotations.Fork;
18+
import org.openjdk.jmh.annotations.Measurement;
19+
import org.openjdk.jmh.annotations.Mode;
20+
import org.openjdk.jmh.annotations.OutputTimeUnit;
21+
import org.openjdk.jmh.annotations.Param;
22+
import org.openjdk.jmh.annotations.Scope;
23+
import org.openjdk.jmh.annotations.Setup;
24+
import org.openjdk.jmh.annotations.State;
25+
import org.openjdk.jmh.annotations.Warmup;
26+
27+
import java.nio.ByteBuffer;
28+
import java.nio.charset.StandardCharsets;
29+
import java.util.concurrent.ThreadLocalRandom;
30+
import java.util.concurrent.TimeUnit;
31+
32+
@Warmup(iterations = 3)
33+
@Measurement(iterations = 3)
34+
@BenchmarkMode(Mode.AverageTime)
35+
@OutputTimeUnit(TimeUnit.NANOSECONDS)
36+
@Fork(value = 1)
37+
public class UTF8StringBytesBenchmark {
38+
39+
@State(Scope.Thread)
40+
public static class StringState {
41+
@Param({ "uuid", "short", "long", "nonAscii", "veryLong" })
42+
String stringType;
43+
44+
String string;
45+
BytesRef bytes;
46+
47+
@Setup
48+
public void setup() {
49+
string = switch (stringType) {
50+
case "uuid" -> UUIDs.base64UUID();
51+
case "short" -> generateAsciiString(20);
52+
case "long" -> generateAsciiString(100);
53+
case "nonAscii" -> generateUTF8String(200);
54+
case "veryLong" -> generateAsciiString(1000);
55+
default -> throw new IllegalArgumentException("Unknown stringType: " + stringType);
56+
};
57+
bytes = getBytes(string);
58+
}
59+
}
60+
61+
@Benchmark
62+
public BytesRef getBytesJDK(StringState state) {
63+
byte[] bytes = state.string.getBytes(StandardCharsets.UTF_8);
64+
return new BytesRef(bytes, 0, bytes.length);
65+
}
66+
67+
@Benchmark
68+
public BytesRef getBytesUnicodeUtils(StringState state) {
69+
String string = state.string;
70+
int length = string.length();
71+
int size = UnicodeUtil.calcUTF16toUTF8Length(string, 0, length);
72+
byte[] out = new byte[size];
73+
UnicodeUtil.UTF16toUTF8(string, 0, length, out, 0);
74+
return new BytesRef(out, 0, out.length);
75+
}
76+
77+
@Benchmark
78+
public BytesRef getBytesByteBufferEncoder(StringState state) {
79+
var byteBuff = StandardCharsets.UTF_8.encode(state.string);
80+
assert byteBuff.hasArray();
81+
return new BytesRef(byteBuff.array(), byteBuff.arrayOffset() + byteBuff.position(), byteBuff.remaining());
82+
}
83+
84+
@Benchmark
85+
public String getStringJDK(StringState state) {
86+
BytesRef bytes = state.bytes;
87+
return new String(bytes.bytes, bytes.offset, bytes.length, StandardCharsets.UTF_8);
88+
}
89+
90+
@Benchmark
91+
public String getStringByteBufferDecoder(StringState state) {
92+
BytesRef bytes = state.bytes;
93+
var byteBuff = ByteBuffer.wrap(bytes.bytes, bytes.offset, bytes.length);
94+
return StandardCharsets.UTF_8.decode(byteBuff).toString();
95+
}
96+
97+
private static BytesRef getBytes(String string) {
98+
int before = ThreadLocalRandom.current().nextInt(0, 50);
99+
int after = ThreadLocalRandom.current().nextInt(0, 50);
100+
byte[] stringBytes = string.getBytes(StandardCharsets.UTF_8);
101+
byte[] finalBytes = new byte[before + after + stringBytes.length];
102+
System.arraycopy(stringBytes, 0, finalBytes, before, stringBytes.length);
103+
return new BytesRef(finalBytes, before, stringBytes.length);
104+
}
105+
106+
public static String generateAsciiString(int n) {
107+
ThreadLocalRandom random = ThreadLocalRandom.current();
108+
StringBuilder sb = new StringBuilder(n);
109+
110+
for (int i = 0; i < n; i++) {
111+
int ascii = random.nextInt(128);
112+
sb.append((char) ascii);
113+
}
114+
115+
return sb.toString();
116+
}
117+
118+
public static String generateUTF8String(int n) {
119+
ThreadLocalRandom random = ThreadLocalRandom.current();
120+
StringBuilder sb = new StringBuilder(n);
121+
122+
for (int i = 0; i < n; i++) {
123+
int codePoint;
124+
int probability = random.nextInt(100);
125+
126+
if (probability < 85) {
127+
// 1-byte UTF-8 (ASCII range)
128+
// 0x0000 to 0x007F
129+
codePoint = random.nextInt(0x0080);
130+
} else if (probability < 95) {
131+
// 2-byte UTF-8
132+
// 0x0080 to 0x07FF
133+
codePoint = random.nextInt(0x0080, 0x0800);
134+
} else {
135+
// 3-byte UTF-8
136+
// 0x0800 to 0xFFFF
137+
do {
138+
codePoint = random.nextInt(0x0800, 0x10000);
139+
// Skip surrogate pairs (0xD800-0xDFFF)
140+
} while (codePoint >= 0xD800 && codePoint <= 0xDFFF);
141+
}
142+
143+
sb.appendCodePoint(codePoint);
144+
}
145+
146+
return sb.toString();
147+
}
148+
}

libs/x-content/src/main/java/org/elasticsearch/xcontent/Text.java

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
package org.elasticsearch.xcontent;
1010

1111
import java.io.IOException;
12-
import java.nio.ByteBuffer;
1312
import java.nio.charset.StandardCharsets;
1413

1514
/**
@@ -68,9 +67,8 @@ public boolean hasBytes() {
6867
@Override
6968
public UTF8Bytes bytes() {
7069
if (bytes == null) {
71-
var byteBuff = StandardCharsets.UTF_8.encode(string);
72-
assert byteBuff.hasArray();
73-
bytes = new UTF8Bytes(byteBuff.array(), byteBuff.arrayOffset() + byteBuff.position(), byteBuff.remaining());
70+
byte[] byteArray = string.getBytes(StandardCharsets.UTF_8);
71+
bytes = new UTF8Bytes(byteArray, 0, byteArray.length);
7472
}
7573
return bytes;
7674
}
@@ -85,8 +83,7 @@ public boolean hasString() {
8583
@Override
8684
public String string() {
8785
if (string == null) {
88-
var byteBuff = ByteBuffer.wrap(bytes.bytes(), bytes.offset(), bytes.length());
89-
string = StandardCharsets.UTF_8.decode(byteBuff).toString();
86+
string = new String(bytes.bytes(), bytes.offset(), bytes.length(), StandardCharsets.UTF_8);
9087
assert (stringLength < 0) || (string.length() == stringLength);
9188
}
9289
return string;

0 commit comments

Comments
 (0)