Skip to content

Commit 0dc5a23

Browse files
Calculate text string length correctly for code points outside BMP (#132593) (#132598)
Strings parsed via the optimized UTF-8 path have their length calculated during parsing. This length must be the same as the length (in UTF-16 chars) the string would have if it were parsed via the non-optimized path. Specifically, code points outside the Basic Multilingual Plane take 4 bytes in UTF-8 and require 2 chars (a surrogate pair) in UTF-16, so each such code point must add 2 to the length rather than 1. (cherry picked from commit fa6e905)
1 parent 7768974 commit 0dc5a23

File tree

3 files changed

+47
-8
lines changed

3 files changed

+47
-8
lines changed

docs/changelog/132593.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 132593
2+
summary: Strings outside BMP have 2 chars per code point
3+
area: Mapping
4+
type: bug
5+
issues: []

libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParser.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,8 @@ protected Text _finishAndReturnText() throws IOException {
112112
return null;
113113
}
114114
ptr += bytesToSkip;
115-
++stringLength;
115+
// Code points that require 4 bytes in UTF-8 will use 2 chars in UTF-16.
116+
stringLength += (bytesToSkip == 4 ? 2 : 1);
116117
}
117118
default -> {
118119
return null;

libs/x-content/impl/src/test/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParserTests.java

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,18 @@ public void testGetValueAsText() throws IOException {
7676
assertThat(parser.getValueAsString(), Matchers.equalTo("bår"));
7777
});
7878

79+
testParseJson("{\"foo\": \"\uD83D\uDE0A\"}", parser -> {
80+
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
81+
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
82+
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
83+
84+
var text = parser.getValueAsText();
85+
assertThat(text, Matchers.notNullValue());
86+
var bytes = text.bytes();
87+
assertTextRef(bytes, "\uD83D\uDE0A");
88+
assertThat(text.stringLength(), Matchers.equalTo(2));
89+
});
90+
7991
testParseJson("{\"foo\": \"bår\"}", parser -> {
8092
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
8193
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
@@ -143,19 +155,37 @@ private record TestInput(String input, String result, boolean supportsOptimized)
143155
new TestInput("\\/", "/", true),
144156
new TestInput("\\\\", "\\", true) };
145157

146-
private int randomCodepoint(boolean includeAscii) {
158+
private int randomCodepointIncludeAscii() {
147159
while (true) {
148160
char val = Character.toChars(randomInt(0xFFFF))[0];
149-
if (val <= 0x7f && includeAscii == false) {
150-
continue;
151-
}
152161
if (val >= Character.MIN_SURROGATE && val <= Character.MAX_SURROGATE) {
153162
continue;
154163
}
155164
return val;
156165
}
157166
}
158167

168+
private int randomCodepointIncludeOutsideBMP(int remainingLength) {
169+
while (true) {
170+
int codePoint = randomInt(0x10FFFF);
171+
char[] val = Character.toChars(codePoint);
172+
// Don't include ascii
173+
if (val.length == 1 && val[0] <= 0x7F) {
174+
continue;
175+
}
176+
boolean surrogate = val[0] >= Character.MIN_SURROGATE && val[0] <= Character.MAX_SURROGATE;
177+
// Single surrogate is invalid
178+
if (val.length == 1 && surrogate) {
179+
continue;
180+
}
181+
// Not enough remaining space for a surrogate pair
182+
if (remainingLength < 2 && surrogate) {
183+
continue;
184+
}
185+
return codePoint;
186+
}
187+
}
188+
159189
private TestInput buildRandomInput(int length) {
160190
StringBuilder input = new StringBuilder(length);
161191
StringBuilder result = new StringBuilder(length);
@@ -171,13 +201,14 @@ private TestInput buildRandomInput(int length) {
171201
doesSupportOptimized = doesSupportOptimized && escape.supportsOptimized();
172202
}
173203
case 1 -> {
174-
int value = randomCodepoint(true);
204+
int value = randomCodepointIncludeAscii();
175205
input.append(String.format(Locale.ENGLISH, "\\u%04x", value));
176206
result.append(Character.toChars(value));
177207
doesSupportOptimized = false;
178208
}
179209
default -> {
180-
var value = Character.toChars(randomCodepoint(false));
210+
var remainingLength = length - i;
211+
var value = Character.toChars(randomCodepointIncludeOutsideBMP(remainingLength));
181212
input.append(value);
182213
result.append(value);
183214
}
@@ -222,7 +253,9 @@ public void testGetValueRandomized() throws IOException {
222253

223254
String currVal = inputs[i].result();
224255
if (inputs[i].supportsOptimized()) {
225-
assertTextRef(parser.getValueAsText().bytes(), currVal);
256+
var text = parser.getValueAsText();
257+
assertTextRef(text.bytes(), currVal);
258+
assertThat(text.stringLength(), Matchers.equalTo(currVal.length()));
226259
} else {
227260
assertThat(parser.getValueAsText(), Matchers.nullValue());
228261
assertThat(parser.getValueAsString(), Matchers.equalTo(currVal));

0 commit comments

Comments
 (0)