Skip to content

Commit 1ff4df4

Browse files
committed
Add support for some escape sequences in optimizedText
1 parent 080a280 commit 1ff4df4

File tree

1 file changed

+57
-14
lines changed

1 file changed

+57
-14
lines changed

libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParser.java

Lines changed: 57 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -21,9 +21,12 @@
2121

2222
import java.io.IOException;
2323
import java.io.InputStream;
24+
import java.util.ArrayList;
25+
import java.util.List;
2426

2527
public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser {
2628
protected int stringEnd = -1;
29+
protected int stringLength;
2730

2831
public ESUTF8StreamJsonParser(
2932
IOContext ctxt,
@@ -49,9 +52,7 @@ public Text getValueAsText() throws IOException {
4952
if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete) {
5053
if (stringEnd > 0) {
5154
final int len = stringEnd - 1 - _inputPtr;
52-
// For now, we can use `len` for `stringLength` because we only support ascii-encoded unescaped strings,
53-
// which means each character uses exactly 1 byte.
54-
return new Text(new XContentString.UTF8Bytes(_inputBuffer, _inputPtr, len), len);
55+
return new Text(new XContentString.UTF8Bytes(_inputBuffer, _inputPtr, len), stringLength);
5556
}
5657
return _finishAndReturnText();
5758
}
@@ -69,21 +70,63 @@ protected Text _finishAndReturnText() throws IOException {
6970
final int[] codes = INPUT_CODES_UTF8;
7071
final int max = _inputEnd;
7172
final byte[] inputBuffer = _inputBuffer;
72-
while (ptr < max) {
73+
stringLength = 0;
74+
List<Integer> backslashes = null;
75+
76+
loop: while (ptr < max) {
7377
int c = inputBuffer[ptr] & 0xFF;
74-
if (codes[c] != 0) {
75-
if (c == INT_QUOTE) {
76-
stringEnd = ptr + 1;
77-
final int len = ptr - startPtr;
78-
// For now, we can use `len` for `stringLength` because we only support ascii-encoded unescaped strings,
79-
// which means each character uses exactly 1 byte.
80-
return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, len), len);
78+
switch (codes[c]) {
79+
case 0 -> {
80+
++ptr;
81+
++stringLength;
82+
}
83+
case 1 -> {
84+
if (c == INT_QUOTE) {
85+
// End of the string
86+
break loop;
87+
}
88+
assert c == INT_BACKSLASH;
89+
if (backslashes == null) {
90+
backslashes = new ArrayList<>();
91+
}
92+
backslashes.add(ptr);
93+
++ptr;
94+
if (ptr >= max) {
95+
// Backslash is the last byte before _inputEnd: the escaped character cannot be read here, so give up on the optimized path
96+
return null;
97+
}
98+
c = inputBuffer[ptr] & 0xFF;
99+
if (c == '"' || c == '/' || c == '\\') {
100+
ptr += 1;
101+
stringLength += 1;
102+
} else {
103+
// Any other escaped sequence requires replacing the sequence with
104+
// a new character, which we don't support in the optimized path
105+
return null;
106+
}
107+
}
108+
default -> {
109+
return null;
81110
}
82-
return null;
83111
}
84-
++ptr;
85112
}
86-
return null;
113+
114+
stringEnd = ptr + 1;
115+
if (backslashes == null) {
116+
return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, ptr - startPtr), stringLength);
117+
} else {
118+
byte[] buff = new byte[ptr - startPtr - backslashes.size()];
119+
int copyPtr = startPtr;
120+
int destPtr = 0;
121+
for (Integer backslash : backslashes) {
122+
int length = backslash - copyPtr;
123+
System.arraycopy(inputBuffer, copyPtr, buff, destPtr, length);
124+
destPtr += length;
125+
copyPtr = backslash + 1;
126+
}
127+
System.arraycopy(inputBuffer, copyPtr, buff, destPtr, ptr - copyPtr);
128+
return new Text(new XContentString.UTF8Bytes(buff), stringLength);
129+
}
87130
}
88131

89132
@Override

0 commit comments

Comments
 (0)