Use direct byte[] utf-8 conversions (#136053)

Tim-Brooks · web-flow · commit c3ae278b5c2d · 2025-10-08T09:20:43.000-06:00
Currently Elasticsearch uses the Charset#decode and Charset#encode
methods (invoked on StandardCharsets.UTF_8) when working with optimized
text. These variants are not as performant as the direct implementations
in String (String#getBytes(Charset) and the String(byte[], int, int,
Charset) constructor) when working with byte[]. If we are going to
convert in one shot without validation, then the String variants should
be preferred.
diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/bytes/RecyclerBytesStreamOutputBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/bytes/RecyclerBytesStreamOutputBenchmark.java
@@ -10,6 +10,7 @@
 package org.elasticsearch.benchmark.bytes;
 
 import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.benchmark.common.util.UTF8StringBytesBenchmark;
 import org.elasticsearch.common.io.stream.RecyclerBytesStreamOutput;
 import org.elasticsearch.common.recycler.Recycler;
 import org.openjdk.jmh.annotations.Benchmark;
@@ -65,10 +66,10 @@ public void initResults() throws IOException {
         // We use weights to generate certain sized UTF-8 characters and vInts. However, there is still some non-determinism which could
         // impact direct comparisons run-to-run
 
-        shortString = generateAsciiString(20);
-        longString = generateAsciiString(100);
-        nonAsciiString = generateUtf8String(200);
-        veryLongString = generateAsciiString(800);
+        shortString = UTF8StringBytesBenchmark.generateAsciiString(20);
+        longString = UTF8StringBytesBenchmark.generateAsciiString(100);
+        nonAsciiString = UTF8StringBytesBenchmark.generateUTF8String(200);
+        veryLongString = UTF8StringBytesBenchmark.generateAsciiString(800);
         // vint values for benchmarking
         vints = new int[1000];
         for (int i = 0; i < vints.length; i++) {
@@ -143,49 +144,6 @@ public void writeVInt() throws IOException {
         }
     }
 
-    public static String generateAsciiString(int n) {
-        ThreadLocalRandom random = ThreadLocalRandom.current();
-        StringBuilder sb = new StringBuilder(n);
-
-        for (int i = 0; i < n; i++) {
-            int ascii = random.nextInt(128);
-            sb.append((char) ascii);
-        }
-
-        return sb.toString();
-    }
-
-    public static String generateUtf8String(int n) {
-        ThreadLocalRandom random = ThreadLocalRandom.current();
-        StringBuilder sb = new StringBuilder(n);
-
-        for (int i = 0; i < n; i++) {
-            int codePoint;
-            int probability = random.nextInt(100);
-
-            if (probability < 85) {
-                // 1-byte UTF-8 (ASCII range)
-                // 0x0000 to 0x007F
-                codePoint = random.nextInt(0x0080);
-            } else if (probability < 95) {
-                // 2-byte UTF-8
-                // 0x0080 to 0x07FF
-                codePoint = random.nextInt(0x0080, 0x0800);
-            } else {
-                // 3-byte UTF-8
-                // 0x0800 to 0xFFFF
-                do {
-                    codePoint = random.nextInt(0x0800, 0x10000);
-                    // Skip surrogate pairs (0xD800-0xDFFF)
-                } while (codePoint >= 0xD800 && codePoint <= 0xDFFF);
-            }
-
-            sb.appendCodePoint(codePoint);
-        }
-
-        return sb.toString();
-    }
-
     private record BenchmarkRecycler(AtomicReference<BytesRef> bytesRef) implements Recycler<BytesRef> {
 
         @Override
diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/common/util/UTF8StringBytesBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/common/util/UTF8StringBytesBenchmark.java
@@ -0,0 +1,148 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.benchmark.common.util;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.elasticsearch.common.UUIDs;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+
+@Warmup(iterations = 3)
+@Measurement(iterations = 3)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Fork(value = 1)
+public class UTF8StringBytesBenchmark {
+
+    @State(Scope.Thread)
+    public static class StringState {
+        @Param({ "uuid", "short", "long", "nonAscii", "veryLong" })
+        String stringType;
+
+        String string;
+        BytesRef bytes;
+
+        @Setup
+        public void setup() {
+            string = switch (stringType) {
+                case "uuid" -> UUIDs.base64UUID();
+                case "short" -> generateAsciiString(20);
+                case "long" -> generateAsciiString(100);
+                case "nonAscii" -> generateUTF8String(200);
+                case "veryLong" -> generateAsciiString(1000);
+                default -> throw new IllegalArgumentException("Unknown stringType: " + stringType);
+            };
+            bytes = getBytes(string);
+        }
+    }
+
+    @Benchmark
+    public BytesRef getBytesJDK(StringState state) {
+        byte[] bytes = state.string.getBytes(StandardCharsets.UTF_8);
+        return new BytesRef(bytes, 0, bytes.length);
+    }
+
+    @Benchmark
+    public BytesRef getBytesUnicodeUtils(StringState state) {
+        String string = state.string;
+        int length = string.length();
+        int size = UnicodeUtil.calcUTF16toUTF8Length(string, 0, length);
+        byte[] out = new byte[size];
+        UnicodeUtil.UTF16toUTF8(string, 0, length, out, 0);
+        return new BytesRef(out, 0, out.length);
+    }
+
+    @Benchmark
+    public BytesRef getBytesByteBufferEncoder(StringState state) {
+        var byteBuff = StandardCharsets.UTF_8.encode(state.string);
+        assert byteBuff.hasArray();
+        return new BytesRef(byteBuff.array(), byteBuff.arrayOffset() + byteBuff.position(), byteBuff.remaining());
+    }
+
+    @Benchmark
+    public String getStringJDK(StringState state) {
+        BytesRef bytes = state.bytes;
+        return new String(bytes.bytes, bytes.offset, bytes.length, StandardCharsets.UTF_8);
+    }
+
+    @Benchmark
+    public String getStringByteBufferDecoder(StringState state) {
+        BytesRef bytes = state.bytes;
+        var byteBuff = ByteBuffer.wrap(bytes.bytes, bytes.offset, bytes.length);
+        return StandardCharsets.UTF_8.decode(byteBuff).toString();
+    }
+
+    private static BytesRef getBytes(String string) {
+        int before = ThreadLocalRandom.current().nextInt(0, 50);
+        int after = ThreadLocalRandom.current().nextInt(0, 50);
+        byte[] stringBytes = string.getBytes(StandardCharsets.UTF_8);
+        byte[] finalBytes = new byte[before + after + stringBytes.length];
+        System.arraycopy(stringBytes, 0, finalBytes, before, stringBytes.length);
+        return new BytesRef(finalBytes, before, stringBytes.length);
+    }
+
+    public static String generateAsciiString(int n) {
+        ThreadLocalRandom random = ThreadLocalRandom.current();
+        StringBuilder sb = new StringBuilder(n);
+
+        for (int i = 0; i < n; i++) {
+            int ascii = random.nextInt(128);
+            sb.append((char) ascii);
+        }
+
+        return sb.toString();
+    }
+
+    public static String generateUTF8String(int n) {
+        ThreadLocalRandom random = ThreadLocalRandom.current();
+        StringBuilder sb = new StringBuilder(n);
+
+        for (int i = 0; i < n; i++) {
+            int codePoint;
+            int probability = random.nextInt(100);
+
+            if (probability < 85) {
+                // 1-byte UTF-8 (ASCII range)
+                // 0x0000 to 0x007F
+                codePoint = random.nextInt(0x0080);
+            } else if (probability < 95) {
+                // 2-byte UTF-8
+                // 0x0080 to 0x07FF
+                codePoint = random.nextInt(0x0080, 0x0800);
+            } else {
+                // 3-byte UTF-8
+                // 0x0800 to 0xFFFF
+                do {
+                    codePoint = random.nextInt(0x0800, 0x10000);
+                    // Skip surrogate pairs (0xD800-0xDFFF)
+                } while (codePoint >= 0xD800 && codePoint <= 0xDFFF);
+            }
+
+            sb.appendCodePoint(codePoint);
+        }
+
+        return sb.toString();
+    }
+}
diff --git a/libs/x-content/src/main/java/org/elasticsearch/xcontent/Text.java b/libs/x-content/src/main/java/org/elasticsearch/xcontent/Text.java
@@ -9,7 +9,6 @@
 package org.elasticsearch.xcontent;
 
 import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
 
 /**
@@ -68,9 +67,8 @@ public boolean hasBytes() {
     @Override
     public UTF8Bytes bytes() {
         if (bytes == null) {
-            var byteBuff = StandardCharsets.UTF_8.encode(string);
-            assert byteBuff.hasArray();
-            bytes = new UTF8Bytes(byteBuff.array(), byteBuff.arrayOffset() + byteBuff.position(), byteBuff.remaining());
+            byte[] byteArray = string.getBytes(StandardCharsets.UTF_8);
+            bytes = new UTF8Bytes(byteArray, 0, byteArray.length);
         }
         return bytes;
     }
@@ -85,8 +83,7 @@ public boolean hasString() {
     @Override
     public String string() {
         if (string == null) {
-            var byteBuff = ByteBuffer.wrap(bytes.bytes(), bytes.offset(), bytes.length());
-            string = StandardCharsets.UTF_8.decode(byteBuff).toString();
+            string = new String(bytes.bytes(), bytes.offset(), bytes.length(), StandardCharsets.UTF_8);
             assert (stringLength < 0) || (string.length() == stringLength);
         }
         return string;