Skip to content

Commit a81e90f

Browse files
authored
Merge pull request #60 from luneo7/JDK-8314794-backport
[Backport] svm: Adopt "JDK-8314794: Improve UTF8 String supports" [GR-58535]
2 parents db24370 + 77c9f98 commit a81e90f

File tree

2 files changed

+53
-24
lines changed
  • substratevm/src
    • com.oracle.objectfile/src/com/oracle/objectfile/io
    • com.oracle.svm.core/src/com/oracle/svm/core/util

2 files changed

+53
-24
lines changed

substratevm/src/com.oracle.objectfile/src/com/oracle/objectfile/io/Utf8.java

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,22 @@ public final class Utf8 {
3636
private Utf8() {
3737
}
3838

39+
private static int utf8Size(char c) {
40+
// Based on
41+
// https://github.com/openjdk/jdk21u/blob/jdk-21.0.4%2B7/src/hotspot/share/utilities/utf8.cpp#L409-L418
42+
if ((0x0001 <= c) && (c <= 0x007F)) {
43+
// ASCII character
44+
return 1;
45+
} else if (c <= 0x07FF) {
46+
return 2;
47+
} else {
48+
return 3;
49+
}
50+
}
51+
3952
/**
40-
* @return the length in bytes of the UTF8 representation of the string
53+
* @return the length as {@code int} in bytes of the UTF8 representation of the string. Might
54+
* return a truncated size if the value does not fit into {@code int} (see JDK-8328877).
4155
*/
4256
public static int utf8Length(String string) {
4357
return utf8Length(string, 0, string.length());
@@ -46,24 +60,26 @@ public static int utf8Length(String string) {
4660
/**
4761
* @param beginIndex first index that is part of the region, inclusive
4862
* @param endIndex index at the end of the region, exclusive
49-
* @return the length in bytes of the UTF8 representation of the string region
63+
* @return the length as {@code int} in bytes of the UTF8 representation of the string region.
64+
* Might return a truncated size if the value does not fit into {@code int} (see
65+
* JDK-8328877).
5066
*/
5167
public static int utf8Length(String s, int beginIndex, int endIndex) {
68+
// Based on
69+
// https://github.com/openjdk/jdk21u/blob/jdk-21.0.4%2B7/src/hotspot/share/utilities/utf8.cpp#L433-L444
5270
if (beginIndex < 0 || endIndex > s.length() || beginIndex > endIndex) {
5371
throw new StringIndexOutOfBoundsException();
5472
}
55-
int length = 0;
56-
for (int i = beginIndex; i < endIndex; i++) {
57-
final int c = s.charAt(i);
58-
if ((c >= 0x0001) && (c <= 0x007F)) {
59-
length++;
60-
} else if (c > 0x07FF) {
61-
length += 3;
62-
} else {
63-
length += 2;
73+
long result = 0;
74+
for (int index = beginIndex; index < endIndex; index++) {
75+
char c = s.charAt(index);
76+
long sz = utf8Size(c);
77+
if (result + sz > Integer.MAX_VALUE - 1) {
78+
break;
6479
}
80+
result += sz;
6581
}
66-
return length;
82+
return (int) result;
6783
}
6884

6985
/**

substratevm/src/com.oracle.svm.core/src/com/oracle/svm/core/util/Utf8.java

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,21 @@ public final class Utf8 {
3939
private Utf8() {
4040
}
4141

42+
// @BasedOnJDKFile("https://github.com/openjdk/jdk21u/blob/jdk-21.0.4%2B7/src/hotspot/share/utilities/utf8.cpp#L409-L418")
43+
private static int utf8Size(char c) {
44+
if ((0x0001 <= c) && (c <= 0x007F)) {
45+
// ASCII character
46+
return 1;
47+
} else if (c <= 0x07FF) {
48+
return 2;
49+
} else {
50+
return 3;
51+
}
52+
}
53+
4254
/**
43-
* @return the length in bytes of the UTF8 representation of the string
55+
* @return the length as {@code int} in bytes of the UTF8 representation of the string. Might
56+
* return a truncated size if the value does not fit into {@code int} (see JDK-8328877).
4457
*/
4558
public static int utf8Length(String string) {
4659
return utf8Length(string, 0, string.length());
@@ -49,24 +62,24 @@ public static int utf8Length(String string) {
4962
/**
5063
* @param beginIndex first index that is part of the region, inclusive
5164
* @param endIndex index at the end of the region, exclusive
52-
* @return the length in bytes of the UTF8 representation of the string region
65+
* @return the length as {@code int} in bytes of the UTF8 representation of the string region. Might
66+
* return a truncated size if the value does not fit into {@code int} (see JDK-8328877).
5367
*/
68+
// @BasedOnJDKFile("https://github.com/openjdk/jdk21u/blob/jdk-21.0.4%2B7/src/hotspot/share/utilities/utf8.cpp#L433-L444")
5469
public static int utf8Length(String s, int beginIndex, int endIndex) {
5570
if (beginIndex < 0 || endIndex > s.length() || beginIndex > endIndex) {
5671
throw new StringIndexOutOfBoundsException();
5772
}
58-
int length = 0;
59-
for (int i = beginIndex; i < endIndex; i++) {
60-
final int c = s.charAt(i);
61-
if ((c >= 0x0001) && (c <= 0x007F)) {
62-
length++;
63-
} else if (c > 0x07FF) {
64-
length += 3;
65-
} else {
66-
length += 2;
73+
long result = 0;
74+
for (int index = beginIndex; index < endIndex; index++) {
75+
char c = s.charAt(index);
76+
long sz = utf8Size(c);
77+
if (result + sz > Integer.MAX_VALUE - 1) {
78+
break;
6779
}
80+
result += sz;
6881
}
69-
return length;
82+
return (int) result;
7083
}
7184

7285
/**

0 commit comments

Comments
 (0)