2121
2222import java .io .IOException ;
2323import java .io .InputStream ;
24+ import java .util .ArrayList ;
25+ import java .util .List ;
2426
2527public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser {
2628 protected int stringEnd = -1 ;
29+ protected int stringLength ;
2730
2831 public ESUTF8StreamJsonParser (
2932 IOContext ctxt ,
@@ -49,9 +52,7 @@ public Text getValueAsText() throws IOException {
4952 if (_currToken == JsonToken .VALUE_STRING && _tokenIncomplete ) {
5053 if (stringEnd > 0 ) {
5154 final int len = stringEnd - 1 - _inputPtr ;
52- // For now, we can use `len` for `stringLength` because we only support ascii-encoded unescaped strings,
53- // which means each character uses exactly 1 byte.
54- return new Text (new XContentString .UTF8Bytes (_inputBuffer , _inputPtr , len ), len );
55+ return new Text (new XContentString .UTF8Bytes (_inputBuffer , _inputPtr , len ), stringLength );
5556 }
5657 return _finishAndReturnText ();
5758 }
@@ -69,21 +70,63 @@ protected Text _finishAndReturnText() throws IOException {
6970 final int [] codes = INPUT_CODES_UTF8 ;
7071 final int max = _inputEnd ;
7172 final byte [] inputBuffer = _inputBuffer ;
72- while (ptr < max ) {
73+ stringLength = 0 ;
74+ List <Integer > backslashes = null ;
75+
76+ loop : while (ptr < max ) {
7377 int c = inputBuffer [ptr ] & 0xFF ;
74- if (codes [c ] != 0 ) {
75- if (c == INT_QUOTE ) {
76- stringEnd = ptr + 1 ;
77- final int len = ptr - startPtr ;
78- // For now, we can use `len` for `stringLength` because we only support ascii-encoded unescaped strings,
79- // which means each character uses exactly 1 byte.
80- return new Text (new XContentString .UTF8Bytes (inputBuffer , startPtr , len ), len );
78+ switch (codes [c ]) {
79+ case 0 -> {
80+ ++ptr ;
81+ ++stringLength ;
82+ }
83+ case 1 -> {
84+ if (c == INT_QUOTE ) {
85+ // End of the string
86+ break loop ;
87+ }
88+ assert c == INT_BACKSLASH ;
89+ if (backslashes == null ) {
90+ backslashes = new ArrayList <>();
91+ }
92+ backslashes .add (ptr );
93+ ++ptr ;
94+ if (ptr >= max ) {
95+ // Backslash is the last byte of the buffered input (max == _inputEnd), so the escaped character is not available here
96+ return null ;
97+ }
98+ c = inputBuffer [ptr ] & 0xFF ;
99+ if (c == '"' || c == '/' || c == '\\' ) {
100+ ptr += 1 ;
101+ stringLength += 1 ;
102+ } else {
103+ // Any other escaped sequence requires replacing the sequence with
104+ // a new character, which we don't support in the optimized path
105+ return null ;
106+ }
107+ }
108+ default -> {
109+ return null ;
81110 }
82- return null ;
83111 }
84- ++ptr ;
85112 }
86- return null ;
113+
114+ stringEnd = ptr + 1 ;
115+ if (backslashes == null ) {
116+ return new Text (new XContentString .UTF8Bytes (inputBuffer , startPtr , ptr - startPtr ), stringLength );
117+ } else {
118+ byte [] buff = new byte [ptr - startPtr - backslashes .size ()];
119+ int copyPtr = startPtr ;
120+ int destPtr = 0 ;
121+ for (Integer backslash : backslashes ) {
122+ int length = backslash - copyPtr ;
123+ System .arraycopy (inputBuffer , copyPtr , buff , destPtr , length );
124+ destPtr += length ;
125+ copyPtr = backslash + 1 ;
126+ }
127+ System .arraycopy (inputBuffer , copyPtr , buff , destPtr , ptr - copyPtr );
128+ return new Text (new XContentString .UTF8Bytes (buff ), stringLength );
129+ }
87130 }
88131
89132 @ Override
0 commit comments