Add UTF-8 code-point byte-length helper for Trino-style pad loops

dain · dain · commit 605b373fb749 · 2026-03-09T09:53:57.000-07:00
Adds codePointByteLengths so callers can decode UTF-8 once and directly
materialize per-code-point byte widths (1..4) for padding/loop planning.

Benchmark (SliceUtf8Benchmark, length=128 code points):

- ascii=true: helper(byte[]) 0.696 ns/codepoint vs Trino byte[] baseline 1.020 ns/codepoint

- ascii=false: helper(byte[]) 2.129 ns/codepoint vs Trino byte[] baseline 3.596 ns/codepoint
diff --git a/src/main/java/io/airlift/slice/SliceUtf8.java b/src/main/java/io/airlift/slice/SliceUtf8.java
@@ -1666,6 +1666,64 @@ else if (codePoint < 0x1_0000) {
         return Arrays.copyOf(codePoints, codePointCount);
     }
 
+    /**
+     * Returns the UTF-8 encoded byte length ({@code 1..4}) of each code point in {@code utf8}, in order.
+     * <p>
+     * Note: This method does not explicitly check for valid UTF-8, and may
+     * return incorrect results or throw an exception for invalid UTF-8.
+     */
+    public static byte[] codePointByteLengths(Slice utf8)
+    {
+        return codePointByteLengths(utf8.byteArray(), utf8.byteArrayOffset(), utf8.length()); // delegates to the byte[] range overload
+    }
+
+    /**
+     * Returns the UTF-8 encoded byte length ({@code 1..4}) of each code point in the given byte array range.
+     * <p>
+     * Note: This method does not explicitly check for valid UTF-8, and may
+     * return incorrect results or throw an exception for invalid UTF-8.
+     */
+    public static byte[] codePointByteLengths(byte[] utf8, int offset, int length)
+    {
+        checkFromIndexSize(offset, length, utf8.length); // validate the requested range against the array before scanning
+        return codePointByteLengthsRaw(utf8, offset, length);
+    }
+
+    private static byte[] codePointByteLengthsRaw(byte[] utf8, int utf8Offset, int utf8Length)
+    { // Does no range validation itself; the public byte[] overload checks the range before calling.
+        if (utf8Length == 0) {
+            return new byte[0];
+        }
+
+        if (isAsciiRaw(utf8, utf8Offset, utf8Length)) { // fast path: one length entry of 1 per input byte
+            byte[] lengths = new byte[utf8Length];
+            Arrays.fill(lengths, (byte) 1);
+            return lengths;
+        }
+
+        byte[] lengths = new byte[Math.max(8, utf8Length >>> 1)]; // initial capacity: half the byte count, at least 8; doubled on demand below
+        int codePointCount = 0;
+        int position = 0; // byte position relative to utf8Offset
+        while (position < utf8Length) {
+            int codePointLength = lengthOfCodePointFromStartByteSafe(utf8[utf8Offset + position]); // only the first byte of each sequence is read
+            if (codePointLength < 0 || position + codePointLength > utf8Length) { // negative length, or the sequence would run past the end of the range
+                throw new InvalidUtf8Exception("Invalid UTF-8 sequence at position " + position);
+            }
+
+            if (codePointCount == lengths.length) {
+                lengths = Arrays.copyOf(lengths, lengths.length * 2);
+            }
+            lengths[codePointCount] = (byte) codePointLength;
+            codePointCount++;
+            position += codePointLength;
+        }
+
+        if (codePointCount == lengths.length) { // buffer is already exactly sized, so no trimming copy is needed
+            return lengths;
+        }
+        return Arrays.copyOf(lengths, codePointCount);
+    }
+
     /**
      * Encodes Unicode code points into UTF-8.
      *
diff --git a/src/test/java/io/airlift/slice/SliceUtf8Benchmark.java b/src/test/java/io/airlift/slice/SliceUtf8Benchmark.java
@@ -33,6 +33,7 @@
 import java.util.concurrent.ThreadLocalRandom;
 import java.util.stream.IntStream;
 
+import static io.airlift.slice.SliceUtf8.codePointByteLengths;
 import static io.airlift.slice.SliceUtf8.codePointToUtf8;
 import static io.airlift.slice.SliceUtf8.compareUtf16BE;
 import static io.airlift.slice.SliceUtf8.countCodePoints;
@@ -339,6 +340,125 @@ else if ((currentChar == '%') || (currentChar == '_')) {
         return position;
     }
 
+    @Benchmark
+    public int benchmarkTrinoPadStringCodePointLengths(TrinoPadData data)
+    { // Baseline: resolves every code point's byte offset by index, then reads its length at that offset.
+        Slice padString = data.getPadString();
+        int padStringLength = countCodePoints(padString);
+        int[] padStringCounts = new int[padStringLength];
+        for (int index = 0; index < padStringLength; index++) {
+            padStringCounts[index] = lengthOfCodePointSafe(padString, offsetOfCodePoint(padString, index));
+        }
+        return checksum(padStringCounts);
+    }
+
+    @Benchmark
+    public int benchmarkTrinoPadStringCodePointLengthsSinglePass(TrinoPadData data)
+    { // Variant: counts code points up front, then walks the slice by byte position, decoding each code point to get its length.
+        Slice padString = data.getPadString();
+        int[] padStringCounts = new int[countCodePoints(padString)];
+        int position = 0;
+        int index = 0;
+        while (position < padString.length()) {
+            int codePoint = getCodePointAt(padString, position);
+            int codePointLength = lengthOfCodePoint(codePoint);
+            padStringCounts[index] = codePointLength;
+            index++;
+            position += codePointLength;
+        }
+        if (index != padStringCounts.length) { // sanity check: the walk must visit exactly as many code points as were counted
+            throw new AssertionError();
+        }
+        return checksum(padStringCounts);
+    }
+
+    @Benchmark
+    public int benchmarkTrinoPadStringCodePointLengthsByteArray(TrinoPadData data)
+    { // Variant: same walk as the single-pass benchmark, but through the byte[]/offset/length overloads instead of a Slice.
+        byte[] utf8 = data.getUtf8();
+        int baseOffset = data.getOffset();
+        int byteLength = data.getByteLength();
+        int[] padStringCounts = new int[countCodePoints(utf8, baseOffset, byteLength)];
+        int position = 0; // passed to getCodePointAt together with baseOffset, not pre-added to it
+        int index = 0;
+        while (position < byteLength) {
+            int codePoint = getCodePointAt(utf8, baseOffset, byteLength, position);
+            int codePointLength = lengthOfCodePoint(codePoint);
+            padStringCounts[index] = codePointLength;
+            index++;
+            position += codePointLength;
+        }
+        if (index != padStringCounts.length) { // sanity check: the walk must visit exactly as many code points as were counted
+            throw new AssertionError();
+        }
+        return checksum(padStringCounts);
+    }
+
+    @Benchmark
+    public int benchmarkTrinoPadStringCodePointLengthsSliceUtf8Helper(TrinoPadData data)
+    { // Measures the new helper through its Slice overload.
+        return checksum(codePointByteLengths(data.getPadString()));
+    }
+
+    @Benchmark
+    public int benchmarkTrinoPadStringCodePointLengthsSliceUtf8HelperByteArray(TrinoPadData data)
+    { // Measures the new helper through its byte[]/offset/length overload.
+        return checksum(codePointByteLengths(data.getUtf8(), data.getOffset(), data.getByteLength()));
+    }
+
+    @Benchmark
+    public Slice benchmarkTrinoDomainTranslatorPrefixRange(TrinoPrefixRangeData data)
+    { // Baseline: finds the last code point below 127 and returns a copy of the prefix through it with that code point incremented by one.
+        Slice constantPrefix = data.getConstantPrefix();
+
+        int lastIncrementable = -1; // byte position of the last code point below 127, or -1 if none was seen
+        for (int position = 0; position < constantPrefix.length(); position += lengthOfCodePoint(constantPrefix, position)) {
+            if (getCodePointAt(constantPrefix, position) < 127) { // each position is read twice: here and by lengthOfCodePoint in the loop header
+                lastIncrementable = position;
+            }
+        }
+
+        if (lastIncrementable == -1) {
+            return Slices.EMPTY_SLICE;
+        }
+
+        Slice upperBound = constantPrefix.slice(0, lastIncrementable + lengthOfCodePoint(constantPrefix, lastIncrementable)).copy(); // copy so the shared benchmark input is not modified
+        setCodePointAt(getCodePointAt(constantPrefix, lastIncrementable) + 1, upperBound, lastIncrementable);
+        return upperBound;
+    }
+
+    @Benchmark
+    public Slice benchmarkTrinoDomainTranslatorPrefixRangeSingleDecode(TrinoPrefixRangeData data)
+    { // Variant: decodes each code point once via the byte[] overload and remembers the last one below 127 for reuse after the loop.
+        byte[] utf8 = data.getUtf8();
+        int baseOffset = data.getOffset();
+        int byteLength = data.getByteLength();
+        Slice constantPrefix = data.getConstantPrefix(); // wraps the same utf8/offset/byteLength range (see TrinoPrefixRangeData.setup)
+
+        int lastIncrementableOffset = -1; // -1 means no code point below 127 was seen
+        int lastIncrementableCodePoint = -1;
+        int lastIncrementableLength = 0;
+        int position = 0;
+        while (position < byteLength) {
+            int codePoint = getCodePointAt(utf8, baseOffset, byteLength, position);
+            int codePointLength = lengthOfCodePoint(codePoint);
+            if (codePoint < 127) {
+                lastIncrementableOffset = position;
+                lastIncrementableCodePoint = codePoint;
+                lastIncrementableLength = codePointLength;
+            }
+            position += codePointLength;
+        }
+
+        if (lastIncrementableOffset == -1) {
+            return Slices.EMPTY_SLICE;
+        }
+
+        Slice upperBound = constantPrefix.slice(0, lastIncrementableOffset + lastIncrementableLength).copy(); // copy so the shared benchmark input is not modified
+        setCodePointAt(lastIncrementableCodePoint + 1, upperBound, lastIncrementableOffset);
+        return upperBound;
+    }
+
     @Benchmark
     public int benchmarkCompareUtf16BE(CompareData data)
     {
@@ -452,6 +572,24 @@ public int benchmarkCodePointToUtf8(CodePointWriteData data)
         return totalBytes;
     }
 
+    private static int checksum(int[] values)
+    { // Folds every element into a single int; the pad-string benchmarks return this as their result.
+        int checksum = 1;
+        for (int value : values) {
+            checksum = (31 * checksum) ^ value;
+        }
+        return checksum;
+    }
+
+    private static int checksum(byte[] values)
+    { // byte[] counterpart of checksum(int[]): same fold, used for the codePointByteLengths results.
+        int checksum = 1;
+        for (byte value : values) {
+            checksum = (31 * checksum) ^ value; // value is widened to int before the XOR
+        }
+        return checksum;
+    }
+
     @State(Thread)
     public static class BenchmarkData
     {
@@ -814,6 +952,120 @@ public int getEscapeChar()
         }
     }
 
+    @State(Thread)
+    public static class TrinoPadData
+    { // Benchmark input: a random UTF-8 string exposed both as a Slice and as a byte[]/offset/length triple over the same bytes.
+        @Param("128")
+        private int length; // number of code points to generate
+
+        @Param({"true", "false"})
+        private boolean ascii; // selects ASCII_CODE_POINTS (true) or ALL_CODE_POINTS (false) as the source set
+
+        private byte[] utf8;
+        private int offset;
+        private int byteLength;
+        private Slice padString;
+
+        @Setup
+        public void setup()
+        {
+            int[] codePointSet = ascii ? BenchmarkData.ASCII_CODE_POINTS : BenchmarkData.ALL_CODE_POINTS;
+            ThreadLocalRandom random = ThreadLocalRandom.current();
+            DynamicSliceOutput out = new DynamicSliceOutput(length * 4); // sized for 4 bytes per code point
+            for (int index = 0; index < length; index++) {
+                int codePoint = codePointSet[random.nextInt(codePointSet.length)];
+                out.appendBytes(new String(Character.toChars(codePoint)).getBytes(StandardCharsets.UTF_8));
+            }
+
+            byte[] encoded = out.slice().getBytes();
+            offset = 9; // non-zero start so the offset-taking overloads are exercised away from index 0
+            utf8 = new byte[offset + encoded.length + 3]; // 3 unused bytes follow the encoded data
+            System.arraycopy(encoded, 0, utf8, offset, encoded.length);
+            byteLength = encoded.length;
+            padString = Slices.wrappedBuffer(utf8, offset, byteLength); // Slice view over the same range of the same array
+        }
+
+        public byte[] getUtf8()
+        {
+            return utf8;
+        }
+
+        public int getOffset()
+        {
+            return offset;
+        }
+
+        public int getByteLength()
+        {
+            return byteLength;
+        }
+
+        public Slice getPadString()
+        {
+            return padString;
+        }
+    }
+
+    @State(Thread)
+    public static class TrinoPrefixRangeData
+    { // Benchmark input for the prefix-range benchmarks: a random UTF-8 prefix exposed as a Slice and as a byte[]/offset/length triple.
+        @Param("256")
+        private int length; // number of code points to generate
+
+        @Param({"true", "false"})
+        private boolean ascii; // selects ASCII_CODE_POINTS (true) or ALL_CODE_POINTS (false) as the source set
+
+        private byte[] utf8;
+        private int offset;
+        private int byteLength;
+        private Slice constantPrefix;
+
+        @Setup
+        public void setup()
+        {
+            int[] codePointSet = ascii ? BenchmarkData.ASCII_CODE_POINTS : BenchmarkData.ALL_CODE_POINTS;
+            ThreadLocalRandom random = ThreadLocalRandom.current();
+
+            int[] codePoints = new int[length];
+            codePoints[0] = 'a'; // guarantees at least one code point below 127, so the benchmarks never take the EMPTY_SLICE branch
+            for (int index = 1; index < codePoints.length; index++) {
+                codePoints[index] = codePointSet[random.nextInt(codePointSet.length)];
+            }
+
+            DynamicSliceOutput out = new DynamicSliceOutput(length * 4); // sized for 4 bytes per code point
+            for (int codePoint : codePoints) {
+                out.appendBytes(new String(Character.toChars(codePoint)).getBytes(StandardCharsets.UTF_8));
+            }
+
+            byte[] encoded = out.slice().getBytes();
+            offset = 13; // non-zero start so the offset-taking overloads are exercised away from index 0
+            utf8 = new byte[offset + encoded.length + 5]; // 5 unused bytes follow the encoded data
+            System.arraycopy(encoded, 0, utf8, offset, encoded.length);
+            byteLength = encoded.length;
+            constantPrefix = Slices.wrappedBuffer(utf8, offset, byteLength); // Slice view over the same range of the same array
+        }
+
+        public byte[] getUtf8()
+        {
+            return utf8;
+        }
+
+        public int getOffset()
+        {
+            return offset;
+        }
+
+        public int getByteLength()
+        {
+            return byteLength;
+        }
+
+        public Slice getConstantPrefix()
+        {
+            return constantPrefix;
+        }
+    }
+
     @State(Thread)
     public static class CodePointWriteData
     {
diff --git a/src/test/java/io/airlift/slice/TestSliceUtf8.java b/src/test/java/io/airlift/slice/TestSliceUtf8.java
@@ -25,6 +25,7 @@
 import java.util.stream.IntStream;
 
 import static com.google.common.primitives.Bytes.concat;
+import static io.airlift.slice.SliceUtf8.codePointByteLengths;
 import static io.airlift.slice.SliceUtf8.codePointToUtf8;
 import static io.airlift.slice.SliceUtf8.compareUtf16BE;
 import static io.airlift.slice.SliceUtf8.countCodePoints;
@@ -266,6 +267,7 @@ public void testByteArrayOverloadsMatchSlice()
         assertThat(wrappedBuffer(byteArrayTarget, 0, arrayWritten)).isEqualTo(sliceTarget.slice(0, sliceWritten));
 
         assertThat(toCodePoints(padded, offset, length)).isEqualTo(toCodePoints(view));
+        assertThat(codePointByteLengths(padded, offset, length)).isEqualTo(codePointByteLengths(view));
         assertThat(fromCodePoints(toCodePoints(view))).isEqualTo(view);
     }
 
@@ -337,6 +339,30 @@ public void testToCodePointsInvalidUtf8()
                 .hasMessageContaining("Invalid UTF-8 sequence at position");
     }
 
+    @Test
+    public void testCodePointByteLengths()
+    { // Checks the helper against each shared fixture string; presumably these span empty, ASCII-only and multi-byte inputs (constants are defined elsewhere).
+        assertCodePointByteLengths(STRING_EMPTY);
+        assertCodePointByteLengths(STRING_HELLO);
+        assertCodePointByteLengths(STRING_OESTERREICH);
+        assertCodePointByteLengths(STRING_DULIOE_DULIOE);
+        assertCodePointByteLengths(STRING_FAITH_HOPE_LOVE);
+        assertCodePointByteLengths(STRING_OO);
+        assertCodePointByteLengths(STRING_ASCII_CODE_POINTS);
+        assertCodePointByteLengths(STRING_ALL_CODE_POINTS_RANDOM);
+    }
+
+    private static void assertCodePointByteLengths(String value)
+    { // Builds the expected lengths from the String's own code points and compares them with the helper's output for the UTF-8 slice.
+        Slice utf8 = utf8Slice(value);
+        int[] codePoints = value.codePoints().toArray();
+        byte[] expectedLengths = new byte[codePoints.length];
+        for (int index = 0; index < codePoints.length; index++) {
+            expectedLengths[index] = (byte) lengthOfCodePoint(codePoints[index]); // expected value comes from the code point, not from the encoded bytes
+        }
+        assertThat(codePointByteLengths(utf8)).isEqualTo(expectedLengths);
+    }
+
     @Test
     public void testFromCodePointsInvalid()
     {