Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
package org.elasticsearch.benchmark.bytes;

import org.apache.lucene.util.BytesRef;
import org.elasticsearch.benchmark.common.util.UTF8StringBytesBenchmark;
import org.elasticsearch.common.io.stream.RecyclerBytesStreamOutput;
import org.elasticsearch.common.recycler.Recycler;
import org.openjdk.jmh.annotations.Benchmark;
Expand Down Expand Up @@ -65,10 +66,10 @@ public void initResults() throws IOException {
// We use weights to generate certain sized UTF-8 characters and vInts. However, there is still some non-determinism which could
// impact direct comparisons run-to-run

shortString = generateAsciiString(20);
longString = generateAsciiString(100);
nonAsciiString = generateUtf8String(200);
veryLongString = generateAsciiString(800);
shortString = UTF8StringBytesBenchmark.generateAsciiString(20);
longString = UTF8StringBytesBenchmark.generateAsciiString(100);
nonAsciiString = UTF8StringBytesBenchmark.generateUTF8String(200);
veryLongString = UTF8StringBytesBenchmark.generateAsciiString(800);
// vint values for benchmarking
vints = new int[1000];
for (int i = 0; i < vints.length; i++) {
Expand Down Expand Up @@ -143,49 +144,6 @@ public void writeVInt() throws IOException {
}
}

/**
 * Builds a random string of exactly {@code n} characters, each drawn uniformly
 * from the 7-bit ASCII range (code points 0 to 127 inclusive).
 *
 * @param n number of characters to generate
 * @return a random ASCII-only string of length {@code n}
 */
public static String generateAsciiString(int n) {
    ThreadLocalRandom rng = ThreadLocalRandom.current();
    char[] chars = new char[n];

    // One draw per character, filled front to back.
    for (int pos = 0; pos < chars.length; pos++) {
        chars[pos] = (char) rng.nextInt(128);
    }

    return new String(chars);
}

/**
 * Builds a random string of {@code n} code points with a weighted mix of UTF-8
 * encoded widths: 85% one-byte (ASCII), 10% two-byte and 5% three-byte code points.
 * Surrogate code points (0xD800-0xDFFF) are never produced.
 *
 * @param n number of code points to generate
 * @return the generated string
 */
public static String generateUtf8String(int n) {
    ThreadLocalRandom rng = ThreadLocalRandom.current();
    StringBuilder out = new StringBuilder(n);

    for (int remaining = n; remaining > 0; remaining--) {
        int roll = rng.nextInt(100);

        if (roll < 85) {
            // 1-byte UTF-8: 0x0000 to 0x007F
            out.appendCodePoint(rng.nextInt(0x0080));
            continue;
        }
        if (roll < 95) {
            // 2-byte UTF-8: 0x0080 to 0x07FF
            out.appendCodePoint(rng.nextInt(0x0080, 0x0800));
            continue;
        }

        // 3-byte UTF-8: 0x0800 to 0xFFFF, redrawing while the value is a surrogate
        int candidate = rng.nextInt(0x0800, 0x10000);
        while (Character.isSurrogate((char) candidate)) {
            candidate = rng.nextInt(0x0800, 0x10000);
        }
        out.appendCodePoint(candidate);
    }

    return out.toString();
}

private record BenchmarkRecycler(AtomicReference<BytesRef> bytesRef) implements Recycler<BytesRef> {

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.benchmark.common.util;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.elasticsearch.common.UUIDs;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;

/**
 * JMH benchmark comparing several ways of encoding a {@link String} to UTF-8 bytes
 * and decoding UTF-8 bytes back to a {@link String}, across a handful of input shapes.
 */
@Warmup(iterations = 3)
@Measurement(iterations = 3)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Fork(value = 1)
public class UTF8StringBytesBenchmark {

    /**
     * Per-thread benchmark input: a generated string and its UTF-8 encoding.
     */
    @State(Scope.Thread)
    public static class StringState {
        @Param({ "uuid", "short", "long", "nonAscii", "veryLong" })
        String stringType;

        String string;
        BytesRef bytes;

        /**
         * Generates the input string selected by {@code stringType} and its UTF-8 bytes.
         */
        @Setup
        public void setup() {
            switch (stringType) {
                case "uuid" -> string = UUIDs.base64UUID();
                case "short" -> string = generateAsciiString(20);
                case "long" -> string = generateAsciiString(100);
                case "nonAscii" -> string = generateUTF8String(200);
                case "veryLong" -> string = generateAsciiString(1000);
                default -> throw new IllegalArgumentException("Unknown stringType: " + stringType);
            }
            bytes = getBytes(string);
        }
    }

    /** Encodes via {@link String#getBytes(java.nio.charset.Charset)}. */
    @Benchmark
    public BytesRef getBytesJDK(StringState state) {
        byte[] encoded = state.string.getBytes(StandardCharsets.UTF_8);
        return new BytesRef(encoded, 0, encoded.length);
    }

    /** Encodes via Lucene's {@link UnicodeUtil}, sizing the output array exactly up front. */
    @Benchmark
    public BytesRef getBytesUnicodeUtils(StringState state) {
        String input = state.string;
        int charCount = input.length();
        byte[] encoded = new byte[UnicodeUtil.calcUTF16toUTF8Length(input, 0, charCount)];
        UnicodeUtil.UTF16toUTF8(input, 0, charCount, encoded, 0);
        return new BytesRef(encoded, 0, encoded.length);
    }

    /** Encodes via {@code Charset.encode}, wrapping the resulting buffer's backing array. */
    @Benchmark
    public BytesRef getBytesByteBufferEncoder(StringState state) {
        ByteBuffer encoded = StandardCharsets.UTF_8.encode(state.string);
        assert encoded.hasArray();
        int start = encoded.arrayOffset() + encoded.position();
        return new BytesRef(encoded.array(), start, encoded.remaining());
    }

    /** Decodes via the {@code String(byte[], int, int, Charset)} constructor. */
    @Benchmark
    public String getStringJDK(StringState state) {
        BytesRef ref = state.bytes;
        return new String(ref.bytes, ref.offset, ref.length, StandardCharsets.UTF_8);
    }

    /** Decodes via {@code Charset.decode} over a wrapped {@link ByteBuffer}. */
    @Benchmark
    public String getStringByteBufferDecoder(StringState state) {
        BytesRef ref = state.bytes;
        return StandardCharsets.UTF_8.decode(ByteBuffer.wrap(ref.bytes, ref.offset, ref.length)).toString();
    }

    /**
     * Encodes {@code string} as UTF-8 and places it inside a larger array with a random
     * amount (0 to 49 bytes) of zero padding before and after, so the returned
     * {@link BytesRef} has a non-trivial offset.
     */
    private static BytesRef getBytes(String string) {
        int leadingPad = ThreadLocalRandom.current().nextInt(0, 50);
        int trailingPad = ThreadLocalRandom.current().nextInt(0, 50);
        byte[] encoded = string.getBytes(StandardCharsets.UTF_8);
        byte[] padded = new byte[leadingPad + trailingPad + encoded.length];
        System.arraycopy(encoded, 0, padded, leadingPad, encoded.length);
        return new BytesRef(padded, leadingPad, encoded.length);
    }

    /**
     * Builds a random string of exactly {@code n} characters, each drawn uniformly
     * from the 7-bit ASCII range (code points 0 to 127 inclusive).
     *
     * @param n number of characters to generate
     * @return a random ASCII-only string of length {@code n}
     */
    public static String generateAsciiString(int n) {
        ThreadLocalRandom rng = ThreadLocalRandom.current();
        char[] chars = new char[n];

        for (int pos = 0; pos < chars.length; pos++) {
            chars[pos] = (char) rng.nextInt(128);
        }

        return new String(chars);
    }

    /**
     * Builds a random string of {@code n} code points with a weighted mix of UTF-8
     * encoded widths: 85% one-byte (ASCII), 10% two-byte and 5% three-byte code points.
     * Surrogate code points (0xD800-0xDFFF) are never produced.
     *
     * @param n number of code points to generate
     * @return the generated string
     */
    public static String generateUTF8String(int n) {
        ThreadLocalRandom rng = ThreadLocalRandom.current();
        StringBuilder out = new StringBuilder(n);

        for (int remaining = n; remaining > 0; remaining--) {
            int roll = rng.nextInt(100);

            if (roll < 85) {
                // 1-byte UTF-8: 0x0000 to 0x007F
                out.appendCodePoint(rng.nextInt(0x0080));
                continue;
            }
            if (roll < 95) {
                // 2-byte UTF-8: 0x0080 to 0x07FF
                out.appendCodePoint(rng.nextInt(0x0080, 0x0800));
                continue;
            }

            // 3-byte UTF-8: 0x0800 to 0xFFFF, redrawing while the value is a surrogate
            int candidate = rng.nextInt(0x0800, 0x10000);
            while (Character.isSurrogate((char) candidate)) {
                candidate = rng.nextInt(0x0800, 0x10000);
            }
            out.appendCodePoint(candidate);
        }

        return out.toString();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
package org.elasticsearch.xcontent;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

/**
Expand Down Expand Up @@ -68,9 +67,8 @@ public boolean hasBytes() {
/**
 * Returns the UTF-8 encoded form of this text. If no bytes are cached yet, the
 * string is encoded once and the result is stored for later calls.
 *
 * @return the UTF-8 bytes of this text
 */
@Override
public UTF8Bytes bytes() {
    if (bytes == null) {
        // Encode exactly once with String.getBytes; the returned array is exactly
        // sized, so the offset is always 0 and the length is the full array.
        byte[] byteArray = string.getBytes(StandardCharsets.UTF_8);
        bytes = new UTF8Bytes(byteArray, 0, byteArray.length);
    }
    return bytes;
}
Expand All @@ -85,8 +83,7 @@ public boolean hasString() {
@Override
public String string() {
if (string == null) {
var byteBuff = ByteBuffer.wrap(bytes.bytes(), bytes.offset(), bytes.length());
string = StandardCharsets.UTF_8.decode(byteBuff).toString();
string = new String(bytes.bytes(), bytes.offset(), bytes.length(), StandardCharsets.UTF_8);
assert (stringLength < 0) || (string.length() == stringLength);
}
return string;
Expand Down