openjdk · cushon · Nov 9, 2025
diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java
@@ -1438,6 +1438,26 @@ private static <E extends Exception> byte[] encodeUTF8(byte coder, byte[] val, C
         return Arrays.copyOf(dst, dp);
     }
 
+    /**
+     * {@return the number of bytes needed to encode {@code val} as UTF-8}
+     *
+     * @throws OutOfMemoryError if the length does not fit in an {@code int}
+     */
+    private static int encodedLengthUTF8(byte coder, byte[] val) {
+        if (coder == UTF16) {
+            return encodedLengthUTF8_UTF16(val);
+        }
+        // Bytes counted by countPositives take one byte each; scan the rest.
+        long dp = StringCoding.countPositives(val, 0, val.length);
+        for (int i = (int) dp; i < val.length; i++) {
+            dp += (val[i] < 0) ? 2 : 1;  // a negative byte takes two bytes
+        }
+        if (dp > Integer.MAX_VALUE) {  // an int counter would have wrapped
+            throw new OutOfMemoryError("Required length exceeds implementation limit");
+        }
+        return (int) dp;
+    }
+
     /**
      * {@return the byte array obtained by first decoding {@code val} with
      * UTF-16, and then encoding the result with UTF-8}
@@ -1509,6 +1529,46 @@ private static <E extends Exception> byte[] encodeUTF8_UTF16(byte[] val, Class<E
         return Arrays.copyOf(dst, dp);
     }
 
+    /**
+     * {@return the number of bytes needed to encode the UTF-16 chars stored
+     * in {@code val} as UTF-8} An unpaired surrogate counts as one byte.
+     *
+     * @param val the UTF-16 coded bytes of the string
+     * @throws OutOfMemoryError if the length does not fit in an {@code int}
+     */
+    private static int encodedLengthUTF8_UTF16(byte[] val) {
+        // Accumulate in a long: at up to 3 bytes per char the total can
+        // exceed Integer.MAX_VALUE, and an int counter would silently wrap.
+        long dp = 0;
+        int sp = 0;
+        int sl = val.length >> 1;
+        // ascii fast loop
+        while (sp < sl && StringUTF16.getChar(val, sp) < '\u0080') {
+            dp++;
+            sp++;
+        }
+        while (sp < sl) {
+            char c = StringUTF16.getChar(val, sp++);
+            if (c < 0x80) {
+                dp++;
+            } else if (c < 0x800) {
+                dp += 2;
+            } else if (!Character.isSurrogate(c)) {
+                dp += 3;  // 3 bytes, 16 bits
+            } else if (Character.isHighSurrogate(c) && sp < sl &&
+                    Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
+                dp += 4;
+                sp++;  // 2 chars
+            } else {
+                dp++;  // unpaired surrogate
+            }
+        }
+        if (dp > Integer.MAX_VALUE) {
+            throw new OutOfMemoryError("Required length exceeds implementation limit");
+        }
+        return (int) dp;
+    }
+
     /**
      * {@return the exact size required to UTF_8 encode this UTF16 string}
      *
@@ -2016,6 +2076,24 @@ public byte[] getBytes() {
         return encode(Charset.defaultCharset(), coder(), value);
     }
 
+    /**
+     * Returns the number of bytes this string occupies when encoded with
+     * the given {@link Charset}.
+     *
+     * <p>The result is the same value as {@code getBytes(cs).length}.
+     *
+     * @param cs the charset used to compute the length
+     * @return the length in bytes of the encoded string
+     */
+    public int getBytesLength(Charset cs) {
+        if (cs != UTF_8.INSTANCE) {
+            // When bytesCompatible holds, the backing array's length is the
+            // answer; otherwise the string is fully encoded and measured.
+            return bytesCompatible(cs) ? value.length : getBytes(cs).length;
+        }
+        return encodedLengthUTF8(coder, value);
+    }
+
     boolean bytesCompatible(Charset charset) {
         if (isLatin1()) {
             if (charset == ISO_8859_1.INSTANCE) {
@@ -5117,5 +5195,4 @@ public Optional<String> describeConstable() {
     public String resolveConstantDesc(MethodHandles.Lookup lookup) {
         return this;
     }
-
 }
diff --git a/test/jdk/java/lang/String/Encodings.java b/test/jdk/java/lang/String/Encodings.java
@@ -106,6 +106,9 @@ static void go(String enc, String str, final byte[] bytes, boolean bidir)
         if (!equals(bs, bytes))
             throw new Exception(charset + ": String.getBytes failed");
 
+        if (bs.length != str.getBytesLength(charset))
+            throw new Exception(charset + ": String.getBytesLength failed");
+
         // Calls to String.getBytes(Charset) shouldn't automatically
         // use the cached thread-local encoder.
         if (charset.name().equals("UTF-16BE")) {

diff --git a/test/jdk/sun/nio/cs/TestStringCoding.java b/test/jdk/sun/nio/cs/TestStringCoding.java
@@ -169,6 +169,12 @@ static byte[] testGetBytes(Charset cs, String str) throws Throwable {
         if (!Arrays.equals(baSC, baNIO)) {
             throw new RuntimeException("getBytes(cs) failed  -> " + cs.name());
         }
+        //getBytesLength(cs);
+        int getBytesLength = str.getBytesLength(cs);
+        if (baSC.length != getBytesLength) {
+            throw new RuntimeException(String.format("getBytesLength failed (%d != %d) -> %s",
+                    baSC.length, getBytesLength, cs.name()));
+        }
         return baSC;
     }
 

diff --git a/test/micro/org/openjdk/bench/java/lang/foreign/StringLoopJmhBenchmark.java b/test/micro/org/openjdk/bench/java/lang/foreign/StringLoopJmhBenchmark.java
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2025, Google LLC. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.java.lang.foreign;
+
+import java.nio.charset.StandardCharsets;
+import java.util.concurrent.TimeUnit;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.annotations.State;
+
+/** Compares ways of computing the UTF-8 encoded length of a {@code String}. */
+@Warmup(time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(1)
+@State(Scope.Benchmark)
+public class StringLoopJmhBenchmark {
+  @Param({"10", "100", "1000", "100000"})
+  int stringLength;
+
+  @Param({"ASCII", "LATIN1", "UTF16"})
+  String encoding;
+
+  String stringData;
+
+  @Setup
+  public void setUp() {
+    // Character at the _end_ to affect if we hit
+    // - ASCII = compact strings and compatible with UTF-8
+    // - LATIN1 = compact strings but not compatible with UTF-8
+    // - UTF16 = 2-byte char storage and not compatible with UTF-8
+    String c;
+    if (encoding.equals("ASCII")) {
+      c = "a";
+    } else if (encoding.equals("LATIN1")) {
+      c = "\u00C4";
+    } else if (encoding.equals("UTF16")) {
+      c = "\u2603";
+    } else {
+      throw new IllegalArgumentException("Unknown encoding: " + encoding);
+    }
+
+    // Cast the whole sum: (char) x + 'a' is an int, which would append digits.
+    StringBuilder sb = new StringBuilder(stringLength + 1);
+    while (sb.length() < stringLength) {
+      sb.append((char) ('a' + (int) (Math.random() * 26)));
+    }
+    stringData = sb.append(c).toString();
+  }
+
+  @Benchmark
+  public int utf8LenByLoop() {
+    final String s = stringData;
+    final int len = s.length();
+
+    // ASCII prefix strings.
+    int idx = 0;
+    for (char c; idx < len && (c = s.charAt(idx)) < 0x80; ++idx) {}
+    // Entire string was ASCII.
+    if (idx == len) {
+      return len;
+    }
+
+    // The ASCII prefix is one byte per char; add the bytes of the rest.
+    int utf8Len = idx;
+    for (char c; idx < len; ++idx) {
+      c = s.charAt(idx);
+      if (c < 0x80) {
+        utf8Len++;
+      } else if (c < 0x800) {
+        utf8Len += 2;
+      } else if (!Character.isSurrogate(c)) {
+        utf8Len += 3;
+      } else {
+        if (Character.codePointAt(s, idx) < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
+          throw new RuntimeException("Unpaired surrogate");
+        }
+        utf8Len += 4;  // a surrogate pair is one 4-byte sequence
+        idx++;  // skip the low surrogate
+      }
+    }
+    return utf8Len;
+  }
+
+  @Benchmark
+  public int getBytes() throws Exception {
+    return stringData.getBytes(StandardCharsets.UTF_8).length;
+  }
+
+  @Benchmark
+  public int getBytesLength() throws Exception {
+    return stringData.getBytesLength(StandardCharsets.UTF_8);
+  }
+}