diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java
index 52f908c9e98bf..8ebb0fc432886 100644
--- a/src/java.base/share/classes/java/lang/String.java
+++ b/src/java.base/share/classes/java/lang/String.java
@@ -1438,6 +1438,37 @@ private static <E extends Exception> byte[] encodeUTF8(byte coder, byte[] val, C
         return Arrays.copyOf(dst, dp);
     }
 
+    /**
+     * {@return the number of bytes needed to encode the string held in
+     * {@code val} with UTF-8}
+     *
+     * @param coder the coder of {@code val}; {@code UTF16} selects the
+     *        two-byte path, any other value is handled as one byte per char
+     * @param val the internal byte array of the string
+     * @throws OutOfMemoryError if the encoded length exceeds
+     *         {@code Integer.MAX_VALUE}
+     */
+    private static int encodedLengthUTF8(byte coder, byte[] val) {
+        if (coder == UTF16) {
+            return encodedLengthUTF8_UTF16(val);
+        }
+        int positives = StringCoding.countPositives(val, 0, val.length);
+        if (positives == val.length) {
+            return positives;
+        }
+        // Accumulate in a long: negative bytes take two bytes each, so the
+        // total can exceed Integer.MAX_VALUE for very long strings.
+        long dp = positives;
+        for (int i = positives; i < val.length; i++) {
+            // a negative byte is a char >= 0x80, which needs two bytes
+            dp += (val[i] < 0) ? 2 : 1;
+        }
+        if (dp > (long)Integer.MAX_VALUE) {
+            throw new OutOfMemoryError("Required length exceeds implementation limit");
+        }
+        return (int) dp;
+    }
+
     /**
      * {@return the byte array obtained by first decoding {@code val} with
      * UTF-16, and then encoding the result with UTF-8}
@@ -1509,6 +1540,57 @@ private static <E extends Exception> byte[] encodeUTF8_UTF16(byte[] val, Class<E
         return Arrays.copyOf(dst, dp);
     }
 
+    /**
+     * {@return the number of bytes needed to encode the UTF-16 string held
+     * in {@code val} with UTF-8}
+     * An unpaired surrogate is counted as a single byte.
+     *
+     * @param val UTF16 encoded byte array
+     * @throws OutOfMemoryError if the encoded length exceeds
+     *         {@code Integer.MAX_VALUE}
+     */
+    private static int encodedLengthUTF8_UTF16(byte[] val) {
+        // A char can take up to three bytes, so accumulate in a long rather
+        // than letting an int wrap around for very long strings.
+        long dp = 0;
+        int sp = 0;
+        int sl = val.length >> 1;
+        while (sp < sl) {
+            // ascii fast loop;
+            char c = StringUTF16.getChar(val, sp);
+            if (c >= '\u0080') {
+                break;
+            }
+            dp++;
+            sp++;
+        }
+        while (sp < sl) {
+            char c = StringUTF16.getChar(val, sp++);
+            if (c < 0x80) {
+                dp++;
+            } else if (c < 0x800) {
+                dp += 2;
+            } else if (Character.isSurrogate(c)) {
+                if (Character.isHighSurrogate(c) && sp < sl &&
+                        Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
+                    // surrogate pair: 4 bytes for the 2 chars
+                    dp += 4;
+                    sp++;
+                } else {
+                    // unpaired surrogate
+                    dp++;
+                }
+            } else {
+                // 3 bytes, 16 bits
+                dp += 3;
+            }
+        }
+        if (dp > (long)Integer.MAX_VALUE) {
+            throw new OutOfMemoryError("Required length exceeds implementation limit");
+        }
+        return (int) dp;
+    }
+
     /**
      * {@return the exact size required to UTF_8 encode this UTF16 string}
      *
@@ -2016,6 +2098,27 @@ public byte[] getBytes() {
         return encode(Charset.defaultCharset(), coder(), value);
     }
 
+    /**
+     * Returns the length in bytes of this {@code String} encoded with the given {@link Charset}.
+     *
+     * <p>The result will be the same value as {@code getBytes(cs).length}.
+     *
+     * @param cs the charset used to compute the length
+     * @return length in bytes of the string
+     */
+    public int getBytesLength(Charset cs) {
+        if (cs == null) {
+            throw new NullPointerException();
+        }
+        if (cs == UTF_8.INSTANCE) {
+            return encodedLengthUTF8(coder(), value);
+        }
+        if (bytesCompatible(cs)) {
+            return value.length;
+        }
+        return getBytes(cs).length;
+    }
+
     boolean bytesCompatible(Charset charset) {
         if (isLatin1()) {
             if (charset == ISO_8859_1.INSTANCE) {
@@ -5117,5 +5220,4 @@ public Optional<String> describeConstable() {
     public String resolveConstantDesc(MethodHandles.Lookup lookup) {
         return this;
     }
-
 }
diff --git a/test/jdk/java/lang/String/Encodings.java b/test/jdk/java/lang/String/Encodings.java
index 4714815026ed4..ed45f10bbfd1c 100644
--- a/test/jdk/java/lang/String/Encodings.java
+++ b/test/jdk/java/lang/String/Encodings.java
@@ -106,6 +106,9 @@ static void go(String enc, String str, final byte[] bytes, boolean bidir)
         if (!equals(bs, bytes))
             throw new Exception(charset + ": String.getBytes failed");
 
+        if (bs.length != str.getBytesLength(charset))
+            throw new Exception(charset + ": String.getBytesLength failed");
+
         // Calls to String.getBytes(Charset) shouldn't automatically
         // use the cached thread-local encoder.
         if (charset.name().equals("UTF-16BE")) {
diff --git a/test/jdk/sun/nio/cs/TestStringCoding.java b/test/jdk/sun/nio/cs/TestStringCoding.java
index d708ef180a238..d4726def9d119 100644
--- a/test/jdk/sun/nio/cs/TestStringCoding.java
+++ b/test/jdk/sun/nio/cs/TestStringCoding.java
@@ -169,6 +169,12 @@ static byte[] testGetBytes(Charset cs, String str) throws Throwable {
         if (!Arrays.equals(baSC, baNIO)) {
             throw new RuntimeException("getBytes(cs) failed -> " + cs.name());
         }
+        //getBytesLength(cs);
+        int getBytesLength = str.getBytesLength(cs);
+        if (baSC.length != getBytesLength) {
+            throw new RuntimeException(String.format("getBytesLength failed (%d != %d) -> %s",
+                    baSC.length, getBytesLength, cs.name()));
+        }
         return baSC;
     }
 
diff --git a/test/micro/org/openjdk/bench/java/lang/foreign/StringLoopJmhBenchmark.java b/test/micro/org/openjdk/bench/java/lang/foreign/StringLoopJmhBenchmark.java
new file mode 100644
index 0000000000000..c9ab5a06b924e
--- /dev/null
+++ b/test/micro/org/openjdk/bench/java/lang/foreign/StringLoopJmhBenchmark.java
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2025, Google LLC. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.java.lang.foreign;
+
+import java.nio.charset.StandardCharsets;
+import java.util.concurrent.TimeUnit;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.annotations.State;
+
+/**
+ * Compares three ways of obtaining the UTF-8 encoded length of a String: a
+ * hand-written loop over the chars, {@code getBytes(UTF_8).length}, and
+ * {@code String.getBytesLength(UTF_8)}.
+ */
+@Warmup(time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(1)
+@State(Scope.Benchmark)
+public class StringLoopJmhBenchmark {
+    @Param({"10", "100", "1000", "100000"})
+    int stringLength;
+
+    @Param({"ASCII", "LATIN1", "UTF16"})
+    String encoding;
+
+    String stringData;
+
+    @Setup
+    public void setUp() {
+        // Character at the _end_ to affect if we hit
+        // - ASCII = compact strings and compatible with UTF-8
+        // - LATIN1 = compact strings but not compatible with UTF-8
+        // - UTF16 = 2-byte char storage and not compatible with UTF-8
+        String c;
+        if (encoding.equals("ASCII")) {
+            c = "a";
+        } else if (encoding.equals("LATIN1")) {
+            c = "\u00C4";
+        } else if (encoding.equals("UTF16")) {
+            c = "\u2603";
+        } else {
+            throw new IllegalArgumentException("Unknown encoding: " + encoding);
+        }
+
+        StringBuilder sb = new StringBuilder(stringLength + 1);
+        while (sb.length() < stringLength) {
+            // Cast the whole sum to char: casting only the random offset
+            // promotes the sum to int and appends its decimal digits.
+            sb.append((char) ('a' + (int) (Math.random() * 26)));
+        }
+        sb.append(c);
+        stringData = sb.toString();
+    }
+
+    @Benchmark
+    public int utf8LenByLoop() {
+        final String s = stringData;
+        final int len = s.length();
+
+        // ASCII prefix strings.
+        int idx = 0;
+        for (char c; idx < len && (c = s.charAt(idx)) < 0x80; ++idx) {}
+
+        // Entire string was ASCII.
+        if (idx == len) {
+            return len;
+        }
+
+        // The ASCII prefix takes one byte per char; the bytes of every
+        // remaining char are added in full below.
+        int utf8Len = idx;
+        for (char c; idx < len; ++idx) {
+            c = s.charAt(idx);
+            if (c < 0x80) {
+                utf8Len++;
+            } else if (c < 0x800) {
+                utf8Len += 2;
+            } else if (Character.isSurrogate(c)) {
+                int cp = Character.codePointAt(s, idx);
+                if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
+                    throw new RuntimeException("Unpaired surrogate");
+                }
+                // A surrogate pair is 4 bytes; skip its low surrogate.
+                utf8Len += 4;
+                idx++;
+            } else {
+                utf8Len += 3;
+            }
+        }
+        return utf8Len;
+    }
+
+    @Benchmark
+    public int getBytes() throws Exception {
+        return stringData.getBytes(StandardCharsets.UTF_8).length;
+    }
+
+    @Benchmark
+    public int getBytesLength() throws Exception {
+        return stringData.getBytesLength(StandardCharsets.UTF_8);
+    }
+}