Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 78 additions & 1 deletion src/java.base/share/classes/java/lang/String.java
Original file line number Diff line number Diff line change
Expand Up @@ -1438,6 +1438,26 @@ private static <E extends Exception> byte[] encodeUTF8(byte coder, byte[] val, C
return Arrays.copyOf(dst, dp);
}

/**
 * Computes the number of bytes needed to encode the string stored in
 * {@code val} as UTF-8, without building the encoded array.
 *
 * @param coder the coder of {@code val}; {@code UTF16} is delegated to
 *        {@code encodedLengthUTF8_UTF16}, any other value is scanned
 *        one byte per char
 * @param val the backing byte array of the string
 * @return the UTF-8 encoded length in bytes
 * @throws OutOfMemoryError if the encoded length does not fit in an
 *         {@code int}
 */
private static int encodedLengthUTF8(byte coder, byte[] val) {
    if (coder == UTF16) {
        return encodedLengthUTF8_UTF16(val);
    }
    int positives = StringCoding.countPositives(val, 0, val.length);
    if (positives == val.length) {
        // countPositives covered the whole array: one output byte per input byte.
        return positives;
    }
    // Accumulate in a long: an array longer than Integer.MAX_VALUE / 2 that
    // consists mostly of negative bytes would wrap an int accumulator.
    long dp = positives;
    for (int i = positives; i < val.length; i++) {
        // A negative byte is counted as two output bytes, any other as one.
        dp += (val[i] < 0) ? 2 : 1;
    }
    if (dp > Integer.MAX_VALUE) {
        throw new OutOfMemoryError("Required length exceeds implementation limit");
    }
    return (int) dp;
}

/**
* {@return the byte array obtained by first decoding {@code val} with
* UTF-16, and then encoding the result with UTF-8}
Expand Down Expand Up @@ -1509,6 +1529,46 @@ private static <E extends Exception> byte[] encodeUTF8_UTF16(byte[] val, Class<E
return Arrays.copyOf(dst, dp);
}

/**
 * Computes the number of bytes needed to encode the UTF-16 content of
 * {@code val} as UTF-8, without building the encoded array.
 *
 * <p>Per char: values below {@code 0x80} count one byte, values below
 * {@code 0x800} count two, a high surrogate immediately followed by a low
 * surrogate counts four for the pair, any other surrogate counts one
 * (presumably the single-byte replacement written by the encoder; verify
 * against {@code encodeUTF8_UTF16}), and every remaining char counts three.
 *
 * @param val the backing byte array, read two bytes per char through
 *        {@code StringUTF16.getChar}
 * @return the UTF-8 encoded length in bytes
 * @throws OutOfMemoryError if the encoded length does not fit in an
 *         {@code int}
 */
private static int encodedLengthUTF8_UTF16(byte[] val) {
    // Up to three output bytes per char can exceed Integer.MAX_VALUE for
    // very large strings, so the running total is kept in a long.
    long dp = 0;
    int sp = 0;
    int sl = val.length >> 1;

    // ASCII fast loop: stop at the first char that needs more than one byte.
    while (sp < sl && StringUTF16.getChar(val, sp) < '\u0080') {
        dp++;
        sp++;
    }

    while (sp < sl) {
        char c = StringUTF16.getChar(val, sp++);
        if (c < 0x80) {
            dp++;
        } else if (c < 0x800) {
            dp += 2;
        } else if (Character.isSurrogate(c)) {
            if (Character.isHighSurrogate(c) && sp < sl
                    && Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
                // Well-formed pair: four bytes, and the low surrogate is consumed too.
                dp += 4;
                sp++;
            } else {
                // Unpaired surrogate: counted as a single byte.
                dp++;
            }
        } else {
            // 3 bytes, 16 bits
            dp += 3;
        }
    }

    if (dp > Integer.MAX_VALUE) {
        throw new OutOfMemoryError("Required length exceeds implementation limit");
    }
    return (int) dp;
}

/**
* {@return the exact size required to UTF_8 encode this UTF16 string}
*
Expand Down Expand Up @@ -2016,6 +2076,24 @@ public byte[] getBytes() {
return encode(Charset.defaultCharset(), coder(), value);
}

/**
 * Returns the length in bytes of this string when encoded with the given
 * {@link Charset}.
 *
 * <p>The result is the same value as {@code getBytes(cs).length}. For UTF-8,
 * and for charsets that {@code bytesCompatible} reports as compatible with
 * the internal representation, the length is computed without calling
 * {@code getBytes}.
 *
 * @param cs the charset used to compute the length
 * @return the length in bytes of this string encoded with {@code cs}
 * @throws NullPointerException if {@code cs} is {@code null}
 */
public int getBytesLength(Charset cs) {
    if (cs == null) {
        throw new NullPointerException();
    }
    if (cs == UTF_8.INSTANCE) {
        // Use the coder() accessor, as getBytes() does, rather than the raw field.
        return encodedLengthUTF8(coder(), value);
    }
    if (bytesCompatible(cs)) {
        // The stored bytes already are the encoded form.
        return value.length;
    }
    // No shortcut available: encode and measure.
    return getBytes(cs).length;
}

boolean bytesCompatible(Charset charset) {
if (isLatin1()) {
if (charset == ISO_8859_1.INSTANCE) {
Expand Down Expand Up @@ -5117,5 +5195,4 @@ public Optional<String> describeConstable() {
public String resolveConstantDesc(MethodHandles.Lookup lookup) {
// Resolves to this same instance; the lookup argument is not consulted.
return this;
}

}
3 changes: 3 additions & 0 deletions test/jdk/java/lang/String/Encodings.java
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ static void go(String enc, String str, final byte[] bytes, boolean bidir)
if (!equals(bs, bytes))
throw new Exception(charset + ": String.getBytes failed");

if (bs.length != str.getBytesLength(charset))
throw new Exception(charset + ": String.getBytesLength failed");

// Calls to String.getBytes(Charset) shouldn't automatically
// use the cached thread-local encoder.
if (charset.name().equals("UTF-16BE")) {
Expand Down
6 changes: 6 additions & 0 deletions test/jdk/sun/nio/cs/TestStringCoding.java
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,12 @@ static byte[] testGetBytes(Charset cs, String str) throws Throwable {
if (!Arrays.equals(baSC, baNIO)) {
throw new RuntimeException("getBytes(cs) failed -> " + cs.name());
}
//getBytesLength(cs);
int getBytesLength = str.getBytesLength(cs);
if (baSC.length != getBytesLength) {
throw new RuntimeException(String.format("getBytesLength failed (%d != %d) -> %s",
baSC.length, getBytesLength, cs.name()));
}
return baSC;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
/*
* Copyright (c) 2025, Google LLC. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.java.lang.foreign;

import java.nio.charset.StandardCharsets;
import java.util.concurrent.TimeUnit;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.annotations.State;

/**
 * Compares three ways of obtaining the UTF-8 encoded length of a string:
 * a hand-written loop over {@code charAt}, {@code getBytes(UTF_8).length},
 * and {@code String.getBytesLength(UTF_8)}.
 *
 * <p>The test string is {@code stringLength} random lowercase ASCII letters
 * followed by one extra character selected by {@code encoding}.
 */
@Warmup(time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@State(Scope.Benchmark)
public class StringLoopJmhBenchmark {
    /** Number of random ASCII letters placed before the final character. */
    @Param({"10", "100", "1000", "100000"})
    int stringLength;

    /** Selects the final character of the test string. */
    @Param({"ASCII", "LATIN1", "UTF16"})
    String encoding;

    /** The string measured by every benchmark method; built in {@link #setUp()}. */
    String stringData;

    /**
     * Builds {@link #stringData} from the current parameter values.
     *
     * @throws IllegalArgumentException if {@code encoding} is not one of the
     *         known values
     */
    @Setup
    public void setUp() {
        // Character at the _end_ to affect if we hit
        // - ASCII = compact strings and compatible with UTF-8
        // - LATIN1 = compact strings but not compatible with UTF-8
        // - UTF16 = 2-byte char storage and not compatible with UTF-8
        final String c;
        if (encoding.equals("ASCII")) {
            c = "a";
        } else if (encoding.equals("LATIN1")) {
            c = "\u00C4";
        } else if (encoding.equals("UTF16")) {
            c = "\u2603";
        } else {
            throw new IllegalArgumentException("Unknown encoding: " + encoding);
        }

        // StringBuilder avoids quadratic copying for the 100000-char case.
        final StringBuilder sb = new StringBuilder(stringLength + c.length());
        while (sb.length() < stringLength) {
            // Parenthesize before the cast: "(char) x + 'a'" is an int and
            // would append its decimal digits instead of a letter.
            sb.append((char) ('a' + (int) (Math.random() * 26)));
        }
        sb.append(c);
        stringData = sb.toString();
    }

    /**
     * Computes the UTF-8 length of {@link #stringData} with an explicit loop.
     *
     * @return the number of bytes in the UTF-8 encoding of the string
     * @throws RuntimeException if the string contains an unpaired surrogate
     */
    @Benchmark
    public int utf8LenByLoop() {
        final String s = stringData;
        final int len = s.length();

        // ASCII prefix strings.
        int idx = 0;
        while (idx < len && s.charAt(idx) < 0x80) {
            idx++;
        }

        // Entire string was ASCII.
        if (idx == len) {
            return len;
        }

        // The ASCII prefix contributes exactly one byte per char; the
        // remainder is counted char by char below.
        int utf8Len = idx;
        for (; idx < len; ++idx) {
            final char c = s.charAt(idx);
            if (c < 0x80) {
                utf8Len++;
            } else if (c < 0x800) {
                utf8Len += 2;
            } else if (Character.isSurrogate(c)) {
                final int cp = Character.codePointAt(s, idx);
                if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
                    throw new RuntimeException("Unpaired surrogate");
                }
                // A surrogate pair encodes one supplementary code point: 4 bytes.
                utf8Len += 4;
                idx++; // skip the low surrogate
            } else {
                utf8Len += 3;
            }
        }
        return utf8Len;
    }

    /**
     * Baseline: encodes the string and returns the array length.
     *
     * @return the length of the UTF-8 encoded byte array
     */
    @Benchmark
    public int getBytes() throws Exception {
        return stringData.getBytes(StandardCharsets.UTF_8).length;
    }

    /**
     * Measures {@code String.getBytesLength} for UTF-8.
     *
     * @return the value returned by {@code getBytesLength(UTF_8)}
     */
    @Benchmark
    public int getBytesLength() throws Exception {
        return stringData.getBytesLength(StandardCharsets.UTF_8);
    }
}