@@ -70,164 +70,243 @@ protected Text _finishAndReturnText() throws IOException {
7070 ptr = _inputPtr ;
7171 }
7272
73- final int startPtr = ptr ;
74- final byte [] inputBuffer = _inputBuffer ;
73+ int startPtr = ptr ;
74+ final int [] codes = INPUT_CODES_UTF8 ;
7575 final int max = _inputEnd ;
76- int codePointCount = 0 ;
76+ final byte [] inputBuffer = _inputBuffer ;
77+ stringLength = 0 ;
78+ backslashes .clear ();
7779
78- // Fast path: scan for quote or backslash first, counting code points as we go
79- while (ptr < max ) {
80- byte b = inputBuffer [ptr ];
81- if (b == INT_QUOTE ) {
82- // Found end quote - string has no escapes
83- int byteLength = ptr - startPtr ;
84- stringLength = codePointCount ;
85- stringEnd = ptr + 1 ;
86- return new Text (new XContentString .UTF8Bytes (inputBuffer , startPtr , byteLength ), codePointCount );
87- }
88- if (b == INT_BACKSLASH ) {
89- // Found escape - switch to escape handling
90- break ;
80+ loop : while (true ) {
81+ if (ptr >= max ) {
82+ return null ;
9183 }
92- // For bytes < 128 (ASCII), we can skip the codes table lookup
93- if (b >= 0 ) {
94- codePointCount ++;
95- ptr ++;
96- } else {
97- // Non-ASCII handling...
98- int c = b & 0xFF ;
99- int codeType = INPUT_CODES_UTF8 [c ];
100- if (codeType == 0 ) {
101- codePointCount ++;
102- ptr ++;
103- } else if (codeType >= 2 && codeType <= 4 ) {
104- if (ptr + codeType > max ) {
84+ int c = inputBuffer [ptr ] & 0xFF ;
85+ switch (codes [c ]) {
86+ case 0 -> {
87+ ++ptr ;
88+ ++stringLength ;
89+ }
90+ case 1 -> {
91+ if (c == INT_QUOTE ) {
92+ // End of the string
93+ break loop ;
94+ }
95+ assert c == INT_BACKSLASH ;
96+ backslashes .add (ptr );
97+ ++ptr ;
98+ if (ptr >= max ) {
99+ // Backslash is the last byte of the buffered input (ptr reached _inputEnd), so the escaped character is not available here
105100 return null ;
106101 }
107- // For 4-byte UTF-8 sequences (surrogate pairs in UTF-16)
108- if (codeType == 4 ) {
109- // Count as 2 UTF-16 code units
110- codePointCount += 2 ;
102+ c = inputBuffer [ ptr ] & 0xFF ;
103+ if (c == '"' || c == '/' || c == '\\' ) {
104+ ptr += 1 ;
105+ stringLength += 1 ;
111106 } else {
112- // 2-byte and 3-byte sequences = 1 UTF-16 code unit
113- codePointCount ++;
107+ // Any other escaped sequence requires replacing the sequence with
108+ // a new character, which we don't support in the optimized path
109+ return null ;
114110 }
115- ptr += codeType ;
116- } else {
117- return null ;
118- }
119- }
120- }
121-
122- // Escape handling path - continue counting code points during the scan
123- if (ptr >= max ) {
124- return null ;
125- }
126-
127- int [] escapePositions = new int [16 ]; // Small initial size
128-
129- int escapeCount = 0 ;
130- int scanPtr = ptr ;
131-
132- // Scan to find escapes and end quote, continuing to count code points
133- while (scanPtr < max ) {
134- byte b = inputBuffer [scanPtr ];
135- if (b == INT_QUOTE ) {
136- break ; // Found end
137- }
138-
139- if (b == INT_BACKSLASH ) {
140- // Grow array if needed
141- if (escapeCount >= escapePositions .length ) {
142- int [] newArray = new int [escapePositions .length * 2 ];
143- System .arraycopy (escapePositions , 0 , newArray , 0 , escapeCount );
144- escapePositions = newArray ;
145- }
146- escapePositions [escapeCount ++] = scanPtr ;
147-
148- scanPtr ++;
149- if (scanPtr >= max ) {
150- return null ;
151111 }
152- b = inputBuffer [scanPtr ];
153- if (b == '"' || b == '/' || b == '\\' ) {
154- codePointCount ++; // The escaped character counts as 1 code point
155- scanPtr ++;
156- } else {
157- return null ; // Unsupported escape
158- }
159- } else if (b >= 0 ) {
160- codePointCount ++;
161- scanPtr ++;
162- } else {
163- // Non-ASCII
164- int c = b & 0xFF ;
165- int codeType = INPUT_CODES_UTF8 [c ];
166- if (codeType == 0 ) {
167- codePointCount ++;
168- scanPtr ++;
169- } else if (codeType >= 2 && codeType <= 4 ) {
170- if (scanPtr + codeType > max ) {
112+ case 2 , 3 , 4 -> {
113+ int bytesToSkip = codes [c ];
114+ if (ptr + bytesToSkip > max ) {
171115 return null ;
172116 }
173- codePointCount ++;
174- scanPtr += codeType ;
175- } else {
117+ ptr += bytesToSkip ;
118+ // Code points that require 4 bytes in UTF-8 will use 2 chars in UTF-16.
119+ stringLength += (bytesToSkip == 4 ? 2 : 1 );
120+ }
121+ default -> {
176122 return null ;
177123 }
178124 }
179125 }
180126
181- if (scanPtr >= max ) {
182- return null ; // Didn't find closing quote
183- }
184-
185- stringEnd = scanPtr + 1 ;
186-
187- // Calculate exact byte size: total bytes minus number of backslashes
188- int exactByteSize = (scanPtr - startPtr ) - escapeCount ;
189-
190- // Allocate exact size buffer
191- byte [] resultBuffer = new byte [exactByteSize ];
192- int writePos = 0 ;
193-
194- // Copy everything before the first backslash
195- int beforeEscapeLength = ptr - startPtr ;
196- System .arraycopy (inputBuffer , startPtr , resultBuffer , 0 , beforeEscapeLength );
197- writePos = beforeEscapeLength ;
198-
199- // Second pass: process escapes (we already have the correct code point count)
200- while (ptr < scanPtr ) {
201- byte b = inputBuffer [ptr ];
202-
203- if (b == INT_BACKSLASH ) {
204- ptr ++; // Skip backslash
205- b = inputBuffer [ptr ]; // Get escaped character
206- resultBuffer [writePos ++] = b ;
207- ptr ++;
208- } else if (b >= 0 ) {
209- // ASCII
210- resultBuffer [writePos ++] = b ;
211- ptr ++;
212- } else {
213- // Non-ASCII - copy multi-byte sequence
214- int c = b & 0xFF ;
215- int codeType = INPUT_CODES_UTF8 [c ];
216- if (codeType == 0 ) {
217- resultBuffer [writePos ++] = b ;
218- ptr ++;
219- } else if (codeType >= 2 && codeType <= 4 ) {
220- System .arraycopy (inputBuffer , ptr , resultBuffer , writePos , codeType );
221- writePos += codeType ;
222- ptr += codeType ;
223- }
127+ stringEnd = ptr + 1 ;
128+ if (backslashes .isEmpty ()) {
129+ return new Text (new XContentString .UTF8Bytes (inputBuffer , startPtr , ptr - startPtr ), stringLength );
130+ } else {
131+ byte [] buff = new byte [ptr - startPtr - backslashes .size ()];
132+ int copyPtr = startPtr ;
133+ int destPtr = 0 ;
134+ for (Integer backslash : backslashes ) {
135+ int length = backslash - copyPtr ;
136+ System .arraycopy (inputBuffer , copyPtr , buff , destPtr , length );
137+ destPtr += length ;
138+ copyPtr = backslash + 1 ;
224139 }
140+ System .arraycopy (inputBuffer , copyPtr , buff , destPtr , ptr - copyPtr );
141+ return new Text (new XContentString .UTF8Bytes (buff ), stringLength );
225142 }
226-
227- stringLength = codePointCount ;
228- return new Text (new XContentString .UTF8Bytes (resultBuffer ), codePointCount );
229143 }
230144
145+ // protected Text _finishAndReturnText() throws IOException {
146+ // int ptr = _inputPtr;
147+ // if (ptr >= _inputEnd) {
148+ // _loadMoreGuaranteed();
149+ // ptr = _inputPtr;
150+ // }
151+ //
152+ // final int startPtr = ptr;
153+ // final byte[] inputBuffer = _inputBuffer;
154+ // final int max = _inputEnd;
155+ // int codePointCount = 0;
156+ //
157+ // // Fast path: scan for quote or backslash first, counting code points as we go
158+ // while (ptr < max) {
159+ // byte b = inputBuffer[ptr];
160+ // if (b == INT_QUOTE) {
161+ // // Found end quote - string has no escapes
162+ // int byteLength = ptr - startPtr;
163+ // stringLength = codePointCount;
164+ // stringEnd = ptr + 1;
165+ // return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, byteLength), codePointCount);
166+ // }
167+ // if (b == INT_BACKSLASH) {
168+ // // Found escape - switch to escape handling
169+ // break;
170+ // }
171+ // // For bytes < 128 (ASCII), we can skip the codes table lookup
172+ // if (b >= 0) {
173+ // codePointCount++;
174+ // ptr++;
175+ // } else {
176+ // // Non-ASCII handling...
177+ // int c = b & 0xFF;
178+ // int codeType = INPUT_CODES_UTF8[c];
179+ // if (codeType == 0) {
180+ // codePointCount++;
181+ // ptr++;
182+ // } else if (codeType >= 2 && codeType <= 4) {
183+ // if (ptr + codeType > max) {
184+ // return null;
185+ // }
186+ // // For 4-byte UTF-8 sequences (surrogate pairs in UTF-16)
187+ // if (codeType == 4) {
188+ // // Count as 2 UTF-16 code units
189+ // codePointCount += 2;
190+ // } else {
191+ // // 2-byte and 3-byte sequences = 1 UTF-16 code unit
192+ // codePointCount++;
193+ // }
194+ // ptr += codeType;
195+ // } else {
196+ // return null;
197+ // }
198+ // }
199+ // }
200+ //
201+ // // Escape handling path - continue counting code points during the scan
202+ // if (ptr >= max) {
203+ // return null;
204+ // }
205+ //
206+ // int[] escapePositions = new int[16]; // Small initial size
207+ //
208+ // int escapeCount = 0;
209+ // int scanPtr = ptr;
210+ //
211+ // // Scan to find escapes and end quote, continuing to count code points
212+ // while (scanPtr < max) {
213+ // byte b = inputBuffer[scanPtr];
214+ // if (b == INT_QUOTE) {
215+ // break; // Found end
216+ // }
217+ //
218+ // if (b == INT_BACKSLASH) {
219+ // // Grow array if needed
220+ // if (escapeCount >= escapePositions.length) {
221+ // int[] newArray = new int[escapePositions.length * 2];
222+ // System.arraycopy(escapePositions, 0, newArray, 0, escapeCount);
223+ // escapePositions = newArray;
224+ // }
225+ // escapePositions[escapeCount++] = scanPtr;
226+ //
227+ // scanPtr++;
228+ // if (scanPtr >= max) {
229+ // return null;
230+ // }
231+ // b = inputBuffer[scanPtr];
232+ // if (b == '"' || b == '/' || b == '\\') {
233+ // codePointCount++; // The escaped character counts as 1 code point
234+ // scanPtr++;
235+ // } else {
236+ // return null; // Unsupported escape
237+ // }
238+ // } else if (b >= 0) {
239+ // codePointCount++;
240+ // scanPtr++;
241+ // } else {
242+ // // Non-ASCII
243+ // int c = b & 0xFF;
244+ // int codeType = INPUT_CODES_UTF8[c];
245+ // if (codeType == 0) {
246+ // codePointCount++;
247+ // scanPtr++;
248+ // } else if (codeType >= 2 && codeType <= 4) {
249+ // if (scanPtr + codeType > max) {
250+ // return null;
251+ // }
252+ // codePointCount++;
253+ // scanPtr += codeType;
254+ // } else {
255+ // return null;
256+ // }
257+ // }
258+ // }
259+ //
260+ // if (scanPtr >= max) {
261+ // return null; // Didn't find closing quote
262+ // }
263+ //
264+ // stringEnd = scanPtr + 1;
265+ //
266+ // // Calculate exact byte size: total bytes minus number of backslashes
267+ // int exactByteSize = (scanPtr - startPtr) - escapeCount;
268+ //
269+ // // Allocate exact size buffer
270+ // byte[] resultBuffer = new byte[exactByteSize];
271+ // int writePos = 0;
272+ //
273+ // // Copy everything before the first backslash
274+ // int beforeEscapeLength = ptr - startPtr;
275+ // System.arraycopy(inputBuffer, startPtr, resultBuffer, 0, beforeEscapeLength);
276+ // writePos = beforeEscapeLength;
277+ //
278+ // // Second pass: process escapes (we already have the correct code point count)
279+ // while (ptr < scanPtr) {
280+ // byte b = inputBuffer[ptr];
281+ //
282+ // if (b == INT_BACKSLASH) {
283+ // ptr++; // Skip backslash
284+ // b = inputBuffer[ptr]; // Get escaped character
285+ // resultBuffer[writePos++] = b;
286+ // ptr++;
287+ // } else if (b >= 0) {
288+ // // ASCII
289+ // resultBuffer[writePos++] = b;
290+ // ptr++;
291+ // } else {
292+ // // Non-ASCII - copy multi-byte sequence
293+ // int c = b & 0xFF;
294+ // int codeType = INPUT_CODES_UTF8[c];
295+ // if (codeType == 0) {
296+ // resultBuffer[writePos++] = b;
297+ // ptr++;
298+ // } else if (codeType >= 2 && codeType <= 4) {
299+ // System.arraycopy(inputBuffer, ptr, resultBuffer, writePos, codeType);
300+ // writePos += codeType;
301+ // ptr += codeType;
302+ // }
303+ // }
304+ // }
305+ //
306+ // stringLength = codePointCount;
307+ // return new Text(new XContentString.UTF8Bytes(resultBuffer), codePointCount);
308+ // }
309+
231310 public boolean writeUTF8TextToStream (OutputStream out ) throws IOException {
232311 if (_currToken == JsonToken .VALUE_STRING && _tokenIncomplete ) {
233312 if (stringEnd > 0 ) {
0 commit comments