Skip to content

Commit ee9eb51

Browse files
committed
Fix
1 parent 98fb570 commit ee9eb51

File tree

2 files changed

+222
-143
lines changed

2 files changed

+222
-143
lines changed

libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParser.java

Lines changed: 218 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -70,164 +70,243 @@ protected Text _finishAndReturnText() throws IOException {
7070
ptr = _inputPtr;
7171
}
7272

73-
final int startPtr = ptr;
74-
final byte[] inputBuffer = _inputBuffer;
73+
int startPtr = ptr;
74+
final int[] codes = INPUT_CODES_UTF8;
7575
final int max = _inputEnd;
76-
int codePointCount = 0;
76+
final byte[] inputBuffer = _inputBuffer;
77+
stringLength = 0;
78+
backslashes.clear();
7779

78-
// Fast path: scan for quote or backslash first, counting code points as we go
79-
while (ptr < max) {
80-
byte b = inputBuffer[ptr];
81-
if (b == INT_QUOTE) {
82-
// Found end quote - string has no escapes
83-
int byteLength = ptr - startPtr;
84-
stringLength = codePointCount;
85-
stringEnd = ptr + 1;
86-
return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, byteLength), codePointCount);
87-
}
88-
if (b == INT_BACKSLASH) {
89-
// Found escape - switch to escape handling
90-
break;
80+
loop: while (true) {
81+
if (ptr >= max) {
82+
return null;
9183
}
92-
// For bytes < 128 (ASCII), we can skip the codes table lookup
93-
if (b >= 0) {
94-
codePointCount++;
95-
ptr++;
96-
} else {
97-
// Non-ASCII handling...
98-
int c = b & 0xFF;
99-
int codeType = INPUT_CODES_UTF8[c];
100-
if (codeType == 0) {
101-
codePointCount++;
102-
ptr++;
103-
} else if (codeType >= 2 && codeType <= 4) {
104-
if (ptr + codeType > max) {
84+
int c = inputBuffer[ptr] & 0xFF;
85+
switch (codes[c]) {
86+
case 0 -> {
87+
++ptr;
88+
++stringLength;
89+
}
90+
case 1 -> {
91+
if (c == INT_QUOTE) {
92+
// End of the string
93+
break loop;
94+
}
95+
assert c == INT_BACKSLASH;
96+
backslashes.add(ptr);
97+
++ptr;
98+
if (ptr >= max) {
99+
// Backslash is the last byte of the buffered input, so the escaped character is not available here
105100
return null;
106101
}
107-
// For 4-byte UTF-8 sequences (surrogate pairs in UTF-16)
108-
if (codeType == 4) {
109-
// Count as 2 UTF-16 code units
110-
codePointCount += 2;
102+
c = inputBuffer[ptr] & 0xFF;
103+
if (c == '"' || c == '/' || c == '\\') {
104+
ptr += 1;
105+
stringLength += 1;
111106
} else {
112-
// 2-byte and 3-byte sequences = 1 UTF-16 code unit
113-
codePointCount++;
107+
// Any other escaped sequence requires replacing the sequence with
108+
// a new character, which we don't support in the optimized path
109+
return null;
114110
}
115-
ptr += codeType;
116-
} else {
117-
return null;
118-
}
119-
}
120-
}
121-
122-
// Escape handling path - continue counting code points during the scan
123-
if (ptr >= max) {
124-
return null;
125-
}
126-
127-
int[] escapePositions = new int[16]; // Small initial size
128-
129-
int escapeCount = 0;
130-
int scanPtr = ptr;
131-
132-
// Scan to find escapes and end quote, continuing to count code points
133-
while (scanPtr < max) {
134-
byte b = inputBuffer[scanPtr];
135-
if (b == INT_QUOTE) {
136-
break; // Found end
137-
}
138-
139-
if (b == INT_BACKSLASH) {
140-
// Grow array if needed
141-
if (escapeCount >= escapePositions.length) {
142-
int[] newArray = new int[escapePositions.length * 2];
143-
System.arraycopy(escapePositions, 0, newArray, 0, escapeCount);
144-
escapePositions = newArray;
145-
}
146-
escapePositions[escapeCount++] = scanPtr;
147-
148-
scanPtr++;
149-
if (scanPtr >= max) {
150-
return null;
151111
}
152-
b = inputBuffer[scanPtr];
153-
if (b == '"' || b == '/' || b == '\\') {
154-
codePointCount++; // The escaped character counts as 1 code point
155-
scanPtr++;
156-
} else {
157-
return null; // Unsupported escape
158-
}
159-
} else if (b >= 0) {
160-
codePointCount++;
161-
scanPtr++;
162-
} else {
163-
// Non-ASCII
164-
int c = b & 0xFF;
165-
int codeType = INPUT_CODES_UTF8[c];
166-
if (codeType == 0) {
167-
codePointCount++;
168-
scanPtr++;
169-
} else if (codeType >= 2 && codeType <= 4) {
170-
if (scanPtr + codeType > max) {
112+
case 2, 3, 4 -> {
113+
int bytesToSkip = codes[c];
114+
if (ptr + bytesToSkip > max) {
171115
return null;
172116
}
173-
codePointCount++;
174-
scanPtr += codeType;
175-
} else {
117+
ptr += bytesToSkip;
118+
// Code points that require 4 bytes in UTF-8 will use 2 chars in UTF-16.
119+
stringLength += (bytesToSkip == 4 ? 2 : 1);
120+
}
121+
default -> {
176122
return null;
177123
}
178124
}
179125
}
180126

181-
if (scanPtr >= max) {
182-
return null; // Didn't find closing quote
183-
}
184-
185-
stringEnd = scanPtr + 1;
186-
187-
// Calculate exact byte size: total bytes minus number of backslashes
188-
int exactByteSize = (scanPtr - startPtr) - escapeCount;
189-
190-
// Allocate exact size buffer
191-
byte[] resultBuffer = new byte[exactByteSize];
192-
int writePos = 0;
193-
194-
// Copy everything before the first backslash
195-
int beforeEscapeLength = ptr - startPtr;
196-
System.arraycopy(inputBuffer, startPtr, resultBuffer, 0, beforeEscapeLength);
197-
writePos = beforeEscapeLength;
198-
199-
// Second pass: process escapes (we already have the correct code point count)
200-
while (ptr < scanPtr) {
201-
byte b = inputBuffer[ptr];
202-
203-
if (b == INT_BACKSLASH) {
204-
ptr++; // Skip backslash
205-
b = inputBuffer[ptr]; // Get escaped character
206-
resultBuffer[writePos++] = b;
207-
ptr++;
208-
} else if (b >= 0) {
209-
// ASCII
210-
resultBuffer[writePos++] = b;
211-
ptr++;
212-
} else {
213-
// Non-ASCII - copy multi-byte sequence
214-
int c = b & 0xFF;
215-
int codeType = INPUT_CODES_UTF8[c];
216-
if (codeType == 0) {
217-
resultBuffer[writePos++] = b;
218-
ptr++;
219-
} else if (codeType >= 2 && codeType <= 4) {
220-
System.arraycopy(inputBuffer, ptr, resultBuffer, writePos, codeType);
221-
writePos += codeType;
222-
ptr += codeType;
223-
}
127+
stringEnd = ptr + 1;
128+
if (backslashes.isEmpty()) {
129+
return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, ptr - startPtr), stringLength);
130+
} else {
131+
byte[] buff = new byte[ptr - startPtr - backslashes.size()];
132+
int copyPtr = startPtr;
133+
int destPtr = 0;
134+
for (Integer backslash : backslashes) {
135+
int length = backslash - copyPtr;
136+
System.arraycopy(inputBuffer, copyPtr, buff, destPtr, length);
137+
destPtr += length;
138+
copyPtr = backslash + 1;
224139
}
140+
System.arraycopy(inputBuffer, copyPtr, buff, destPtr, ptr - copyPtr);
141+
return new Text(new XContentString.UTF8Bytes(buff), stringLength);
225142
}
226-
227-
stringLength = codePointCount;
228-
return new Text(new XContentString.UTF8Bytes(resultBuffer), codePointCount);
229143
}
230144

145+
// protected Text _finishAndReturnText() throws IOException {
146+
// int ptr = _inputPtr;
147+
// if (ptr >= _inputEnd) {
148+
// _loadMoreGuaranteed();
149+
// ptr = _inputPtr;
150+
// }
151+
//
152+
// final int startPtr = ptr;
153+
// final byte[] inputBuffer = _inputBuffer;
154+
// final int max = _inputEnd;
155+
// int codePointCount = 0;
156+
//
157+
// // Fast path: scan for quote or backslash first, counting code points as we go
158+
// while (ptr < max) {
159+
// byte b = inputBuffer[ptr];
160+
// if (b == INT_QUOTE) {
161+
// // Found end quote - string has no escapes
162+
// int byteLength = ptr - startPtr;
163+
// stringLength = codePointCount;
164+
// stringEnd = ptr + 1;
165+
// return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, byteLength), codePointCount);
166+
// }
167+
// if (b == INT_BACKSLASH) {
168+
// // Found escape - switch to escape handling
169+
// break;
170+
// }
171+
// // For bytes < 128 (ASCII), we can skip the codes table lookup
172+
// if (b >= 0) {
173+
// codePointCount++;
174+
// ptr++;
175+
// } else {
176+
// // Non-ASCII handling...
177+
// int c = b & 0xFF;
178+
// int codeType = INPUT_CODES_UTF8[c];
179+
// if (codeType == 0) {
180+
// codePointCount++;
181+
// ptr++;
182+
// } else if (codeType >= 2 && codeType <= 4) {
183+
// if (ptr + codeType > max) {
184+
// return null;
185+
// }
186+
// // For 4-byte UTF-8 sequences (surrogate pairs in UTF-16)
187+
// if (codeType == 4) {
188+
// // Count as 2 UTF-16 code units
189+
// codePointCount += 2;
190+
// } else {
191+
// // 2-byte and 3-byte sequences = 1 UTF-16 code unit
192+
// codePointCount++;
193+
// }
194+
// ptr += codeType;
195+
// } else {
196+
// return null;
197+
// }
198+
// }
199+
// }
200+
//
201+
// // Escape handling path - continue counting code points during the scan
202+
// if (ptr >= max) {
203+
// return null;
204+
// }
205+
//
206+
// int[] escapePositions = new int[16]; // Small initial size
207+
//
208+
// int escapeCount = 0;
209+
// int scanPtr = ptr;
210+
//
211+
// // Scan to find escapes and end quote, continuing to count code points
212+
// while (scanPtr < max) {
213+
// byte b = inputBuffer[scanPtr];
214+
// if (b == INT_QUOTE) {
215+
// break; // Found end
216+
// }
217+
//
218+
// if (b == INT_BACKSLASH) {
219+
// // Grow array if needed
220+
// if (escapeCount >= escapePositions.length) {
221+
// int[] newArray = new int[escapePositions.length * 2];
222+
// System.arraycopy(escapePositions, 0, newArray, 0, escapeCount);
223+
// escapePositions = newArray;
224+
// }
225+
// escapePositions[escapeCount++] = scanPtr;
226+
//
227+
// scanPtr++;
228+
// if (scanPtr >= max) {
229+
// return null;
230+
// }
231+
// b = inputBuffer[scanPtr];
232+
// if (b == '"' || b == '/' || b == '\\') {
233+
// codePointCount++; // The escaped character counts as 1 code point
234+
// scanPtr++;
235+
// } else {
236+
// return null; // Unsupported escape
237+
// }
238+
// } else if (b >= 0) {
239+
// codePointCount++;
240+
// scanPtr++;
241+
// } else {
242+
// // Non-ASCII
243+
// int c = b & 0xFF;
244+
// int codeType = INPUT_CODES_UTF8[c];
245+
// if (codeType == 0) {
246+
// codePointCount++;
247+
// scanPtr++;
248+
// } else if (codeType >= 2 && codeType <= 4) {
249+
// if (scanPtr + codeType > max) {
250+
// return null;
251+
// }
252+
// codePointCount++;
253+
// scanPtr += codeType;
254+
// } else {
255+
// return null;
256+
// }
257+
// }
258+
// }
259+
//
260+
// if (scanPtr >= max) {
261+
// return null; // Didn't find closing quote
262+
// }
263+
//
264+
// stringEnd = scanPtr + 1;
265+
//
266+
// // Calculate exact byte size: total bytes minus number of backslashes
267+
// int exactByteSize = (scanPtr - startPtr) - escapeCount;
268+
//
269+
// // Allocate exact size buffer
270+
// byte[] resultBuffer = new byte[exactByteSize];
271+
// int writePos = 0;
272+
//
273+
// // Copy everything before the first backslash
274+
// int beforeEscapeLength = ptr - startPtr;
275+
// System.arraycopy(inputBuffer, startPtr, resultBuffer, 0, beforeEscapeLength);
276+
// writePos = beforeEscapeLength;
277+
//
278+
// // Second pass: process escapes (we already have the correct code point count)
279+
// while (ptr < scanPtr) {
280+
// byte b = inputBuffer[ptr];
281+
//
282+
// if (b == INT_BACKSLASH) {
283+
// ptr++; // Skip backslash
284+
// b = inputBuffer[ptr]; // Get escaped character
285+
// resultBuffer[writePos++] = b;
286+
// ptr++;
287+
// } else if (b >= 0) {
288+
// // ASCII
289+
// resultBuffer[writePos++] = b;
290+
// ptr++;
291+
// } else {
292+
// // Non-ASCII - copy multi-byte sequence
293+
// int c = b & 0xFF;
294+
// int codeType = INPUT_CODES_UTF8[c];
295+
// if (codeType == 0) {
296+
// resultBuffer[writePos++] = b;
297+
// ptr++;
298+
// } else if (codeType >= 2 && codeType <= 4) {
299+
// System.arraycopy(inputBuffer, ptr, resultBuffer, writePos, codeType);
300+
// writePos += codeType;
301+
// ptr += codeType;
302+
// }
303+
// }
304+
// }
305+
//
306+
// stringLength = codePointCount;
307+
// return new Text(new XContentString.UTF8Bytes(resultBuffer), codePointCount);
308+
// }
309+
231310
public boolean writeUTF8TextToStream(OutputStream out) throws IOException {
232311
if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete) {
233312
if (stringEnd > 0) {

server/src/main/java/org/elasticsearch/ingest/ESONFlat.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,10 @@ public BytesReference getSerializedKeyBytes() {
7575
streamOutput.writeVInt(keys.size());
7676
for (ESONEntry entry : keys) {
7777
String key = entry.key() == null ? "" : entry.key();
78-
// byte[] bytes = key == null ? EMPTY_KEY : key.getBytes(StandardCharsets.UTF_8);
79-
// streamOutput.writeVInt(bytes.length);
80-
// streamOutput.writeBytes(bytes, 0, bytes.length);
81-
streamOutput.writeUTF8String(key);
78+
byte[] bytes = key == null ? EMPTY_KEY : key.getBytes(StandardCharsets.UTF_8);
79+
streamOutput.writeVInt(bytes.length);
80+
streamOutput.writeBytes(bytes, 0, bytes.length);
81+
// streamOutput.writeUTF8String(key);
8282
streamOutput.writeByte(entry.type());
8383
streamOutput.writeInt(entry.offsetOrCount());
8484
}

0 commit comments

Comments
 (0)