Fix

Tim-Brooks · Tim-Brooks · commit ee9eb518714c · 2025-08-19T11:08:25.000-10:00
diff --git a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParser.java b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParser.java
@@ -70,164 +70,243 @@ protected Text _finishAndReturnText() throws IOException {
             ptr = _inputPtr;
         }
 
-        final int startPtr = ptr;
-        final byte[] inputBuffer = _inputBuffer;
+        int startPtr = ptr;
+        final int[] codes = INPUT_CODES_UTF8;
         final int max = _inputEnd;
-        int codePointCount = 0;
+        final byte[] inputBuffer = _inputBuffer;
+        stringLength = 0;
+        backslashes.clear();
 
-        // Fast path: scan for quote or backslash first, counting code points as we go
-        while (ptr < max) {
-            byte b = inputBuffer[ptr];
-            if (b == INT_QUOTE) {
-                // Found end quote - string has no escapes
-                int byteLength = ptr - startPtr;
-                stringLength = codePointCount;
-                stringEnd = ptr + 1;
-                return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, byteLength), codePointCount);
-            }
-            if (b == INT_BACKSLASH) {
-                // Found escape - switch to escape handling
-                break;
+        loop: while (true) {
+            if (ptr >= max) {
+                return null;
             }
-            // For bytes < 128 (ASCII), we can skip the codes table lookup
-            if (b >= 0) {
-                codePointCount++;
-                ptr++;
-            } else {
-                // Non-ASCII handling...
-                int c = b & 0xFF;
-                int codeType = INPUT_CODES_UTF8[c];
-                if (codeType == 0) {
-                    codePointCount++;
-                    ptr++;
-                } else if (codeType >= 2 && codeType <= 4) {
-                    if (ptr + codeType > max) {
+            int c = inputBuffer[ptr] & 0xFF;
+            switch (codes[c]) {
+                case 0 -> {
+                    ++ptr;
+                    ++stringLength;
+                }
+                case 1 -> {
+                    if (c == INT_QUOTE) {
+                        // End of the string
+                        break loop;
+                    }
+                    assert c == INT_BACKSLASH;
+                    backslashes.add(ptr);
+                    ++ptr;
+                    if (ptr >= max) {
+                        // Backslash at end of file
                         return null;
                     }
-                    // For 4-byte UTF-8 sequences (surrogate pairs in UTF-16)
-                    if (codeType == 4) {
-                        // Count as 2 UTF-16 code units
-                        codePointCount += 2;
+                    c = inputBuffer[ptr] & 0xFF;
+                    if (c == '"' || c == '/' || c == '\\') {
+                        ptr += 1;
+                        stringLength += 1;
                     } else {
-                        // 2-byte and 3-byte sequences = 1 UTF-16 code unit
-                        codePointCount++;
+                        // Any other escaped sequence requires replacing the sequence with
+                        // a new character, which we don't support in the optimized path
+                        return null;
                     }
-                    ptr += codeType;
-                } else {
-                    return null;
-                }
-            }
-        }
-
-        // Escape handling path - continue counting code points during the scan
-        if (ptr >= max) {
-            return null;
-        }
-
-        int[] escapePositions = new int[16]; // Small initial size
-
-        int escapeCount = 0;
-        int scanPtr = ptr;
-
-        // Scan to find escapes and end quote, continuing to count code points
-        while (scanPtr < max) {
-            byte b = inputBuffer[scanPtr];
-            if (b == INT_QUOTE) {
-                break; // Found end
-            }
-
-            if (b == INT_BACKSLASH) {
-                // Grow array if needed
-                if (escapeCount >= escapePositions.length) {
-                    int[] newArray = new int[escapePositions.length * 2];
-                    System.arraycopy(escapePositions, 0, newArray, 0, escapeCount);
-                    escapePositions = newArray;
-                }
-                escapePositions[escapeCount++] = scanPtr;
-
-                scanPtr++;
-                if (scanPtr >= max) {
-                    return null;
                 }
-                b = inputBuffer[scanPtr];
-                if (b == '"' || b == '/' || b == '\\') {
-                    codePointCount++; // The escaped character counts as 1 code point
-                    scanPtr++;
-                } else {
-                    return null; // Unsupported escape
-                }
-            } else if (b >= 0) {
-                codePointCount++;
-                scanPtr++;
-            } else {
-                // Non-ASCII
-                int c = b & 0xFF;
-                int codeType = INPUT_CODES_UTF8[c];
-                if (codeType == 0) {
-                    codePointCount++;
-                    scanPtr++;
-                } else if (codeType >= 2 && codeType <= 4) {
-                    if (scanPtr + codeType > max) {
+                case 2, 3, 4 -> {
+                    int bytesToSkip = codes[c];
+                    if (ptr + bytesToSkip > max) {
                         return null;
                     }
-                    codePointCount++;
-                    scanPtr += codeType;
-                } else {
+                    ptr += bytesToSkip;
+                    // Code points that require 4 bytes in UTF-8 will use 2 chars in UTF-16.
+                    stringLength += (bytesToSkip == 4 ? 2 : 1);
+                }
+                default -> {
                     return null;
                 }
             }
         }
 
-        if (scanPtr >= max) {
-            return null; // Didn't find closing quote
-        }
-
-        stringEnd = scanPtr + 1;
-
-        // Calculate exact byte size: total bytes minus number of backslashes
-        int exactByteSize = (scanPtr - startPtr) - escapeCount;
-
-        // Allocate exact size buffer
-        byte[] resultBuffer = new byte[exactByteSize];
-        int writePos = 0;
-
-        // Copy everything before the first backslash
-        int beforeEscapeLength = ptr - startPtr;
-        System.arraycopy(inputBuffer, startPtr, resultBuffer, 0, beforeEscapeLength);
-        writePos = beforeEscapeLength;
-
-        // Second pass: process escapes (we already have the correct code point count)
-        while (ptr < scanPtr) {
-            byte b = inputBuffer[ptr];
-
-            if (b == INT_BACKSLASH) {
-                ptr++; // Skip backslash
-                b = inputBuffer[ptr]; // Get escaped character
-                resultBuffer[writePos++] = b;
-                ptr++;
-            } else if (b >= 0) {
-                // ASCII
-                resultBuffer[writePos++] = b;
-                ptr++;
-            } else {
-                // Non-ASCII - copy multi-byte sequence
-                int c = b & 0xFF;
-                int codeType = INPUT_CODES_UTF8[c];
-                if (codeType == 0) {
-                    resultBuffer[writePos++] = b;
-                    ptr++;
-                } else if (codeType >= 2 && codeType <= 4) {
-                    System.arraycopy(inputBuffer, ptr, resultBuffer, writePos, codeType);
-                    writePos += codeType;
-                    ptr += codeType;
-                }
+        stringEnd = ptr + 1;
+        if (backslashes.isEmpty()) {
+            return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, ptr - startPtr), stringLength);
+        } else {
+            byte[] buff = new byte[ptr - startPtr - backslashes.size()];
+            int copyPtr = startPtr;
+            int destPtr = 0;
+            for (Integer backslash : backslashes) {
+                int length = backslash - copyPtr;
+                System.arraycopy(inputBuffer, copyPtr, buff, destPtr, length);
+                destPtr += length;
+                copyPtr = backslash + 1;
             }
+            System.arraycopy(inputBuffer, copyPtr, buff, destPtr, ptr - copyPtr);
+            return new Text(new XContentString.UTF8Bytes(buff), stringLength);
         }
-
-        stringLength = codePointCount;
-        return new Text(new XContentString.UTF8Bytes(resultBuffer), codePointCount);
     }
 
+    // protected Text _finishAndReturnText() throws IOException {
+    // int ptr = _inputPtr;
+    // if (ptr >= _inputEnd) {
+    // _loadMoreGuaranteed();
+    // ptr = _inputPtr;
+    // }
+    //
+    // final int startPtr = ptr;
+    // final byte[] inputBuffer = _inputBuffer;
+    // final int max = _inputEnd;
+    // int codePointCount = 0;
+    //
+    // // Fast path: scan for quote or backslash first, counting code points as we go
+    // while (ptr < max) {
+    // byte b = inputBuffer[ptr];
+    // if (b == INT_QUOTE) {
+    // // Found end quote - string has no escapes
+    // int byteLength = ptr - startPtr;
+    // stringLength = codePointCount;
+    // stringEnd = ptr + 1;
+    // return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, byteLength), codePointCount);
+    // }
+    // if (b == INT_BACKSLASH) {
+    // // Found escape - switch to escape handling
+    // break;
+    // }
+    // // For bytes < 128 (ASCII), we can skip the codes table lookup
+    // if (b >= 0) {
+    // codePointCount++;
+    // ptr++;
+    // } else {
+    // // Non-ASCII handling...
+    // int c = b & 0xFF;
+    // int codeType = INPUT_CODES_UTF8[c];
+    // if (codeType == 0) {
+    // codePointCount++;
+    // ptr++;
+    // } else if (codeType >= 2 && codeType <= 4) {
+    // if (ptr + codeType > max) {
+    // return null;
+    // }
+    // // For 4-byte UTF-8 sequences (surrogate pairs in UTF-16)
+    // if (codeType == 4) {
+    // // Count as 2 UTF-16 code units
+    // codePointCount += 2;
+    // } else {
+    // // 2-byte and 3-byte sequences = 1 UTF-16 code unit
+    // codePointCount++;
+    // }
+    // ptr += codeType;
+    // } else {
+    // return null;
+    // }
+    // }
+    // }
+    //
+    // // Escape handling path - continue counting code points during the scan
+    // if (ptr >= max) {
+    // return null;
+    // }
+    //
+    // int[] escapePositions = new int[16]; // Small initial size
+    //
+    // int escapeCount = 0;
+    // int scanPtr = ptr;
+    //
+    // // Scan to find escapes and end quote, continuing to count code points
+    // while (scanPtr < max) {
+    // byte b = inputBuffer[scanPtr];
+    // if (b == INT_QUOTE) {
+    // break; // Found end
+    // }
+    //
+    // if (b == INT_BACKSLASH) {
+    // // Grow array if needed
+    // if (escapeCount >= escapePositions.length) {
+    // int[] newArray = new int[escapePositions.length * 2];
+    // System.arraycopy(escapePositions, 0, newArray, 0, escapeCount);
+    // escapePositions = newArray;
+    // }
+    // escapePositions[escapeCount++] = scanPtr;
+    //
+    // scanPtr++;
+    // if (scanPtr >= max) {
+    // return null;
+    // }
+    // b = inputBuffer[scanPtr];
+    // if (b == '"' || b == '/' || b == '\\') {
+    // codePointCount++; // The escaped character counts as 1 code point
+    // scanPtr++;
+    // } else {
+    // return null; // Unsupported escape
+    // }
+    // } else if (b >= 0) {
+    // codePointCount++;
+    // scanPtr++;
+    // } else {
+    // // Non-ASCII
+    // int c = b & 0xFF;
+    // int codeType = INPUT_CODES_UTF8[c];
+    // if (codeType == 0) {
+    // codePointCount++;
+    // scanPtr++;
+    // } else if (codeType >= 2 && codeType <= 4) {
+    // if (scanPtr + codeType > max) {
+    // return null;
+    // }
+    // codePointCount++;
+    // scanPtr += codeType;
+    // } else {
+    // return null;
+    // }
+    // }
+    // }
+    //
+    // if (scanPtr >= max) {
+    // return null; // Didn't find closing quote
+    // }
+    //
+    // stringEnd = scanPtr + 1;
+    //
+    // // Calculate exact byte size: total bytes minus number of backslashes
+    // int exactByteSize = (scanPtr - startPtr) - escapeCount;
+    //
+    // // Allocate exact size buffer
+    // byte[] resultBuffer = new byte[exactByteSize];
+    // int writePos = 0;
+    //
+    // // Copy everything before the first backslash
+    // int beforeEscapeLength = ptr - startPtr;
+    // System.arraycopy(inputBuffer, startPtr, resultBuffer, 0, beforeEscapeLength);
+    // writePos = beforeEscapeLength;
+    //
+    // // Second pass: process escapes (we already have the correct code point count)
+    // while (ptr < scanPtr) {
+    // byte b = inputBuffer[ptr];
+    //
+    // if (b == INT_BACKSLASH) {
+    // ptr++; // Skip backslash
+    // b = inputBuffer[ptr]; // Get escaped character
+    // resultBuffer[writePos++] = b;
+    // ptr++;
+    // } else if (b >= 0) {
+    // // ASCII
+    // resultBuffer[writePos++] = b;
+    // ptr++;
+    // } else {
+    // // Non-ASCII - copy multi-byte sequence
+    // int c = b & 0xFF;
+    // int codeType = INPUT_CODES_UTF8[c];
+    // if (codeType == 0) {
+    // resultBuffer[writePos++] = b;
+    // ptr++;
+    // } else if (codeType >= 2 && codeType <= 4) {
+    // System.arraycopy(inputBuffer, ptr, resultBuffer, writePos, codeType);
+    // writePos += codeType;
+    // ptr += codeType;
+    // }
+    // }
+    // }
+    //
+    // stringLength = codePointCount;
+    // return new Text(new XContentString.UTF8Bytes(resultBuffer), codePointCount);
+    // }
+
     public boolean writeUTF8TextToStream(OutputStream out) throws IOException {
         if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete) {
             if (stringEnd > 0) {
diff --git a/server/src/main/java/org/elasticsearch/ingest/ESONFlat.java b/server/src/main/java/org/elasticsearch/ingest/ESONFlat.java
@@ -75,10 +75,10 @@ public BytesReference getSerializedKeyBytes() {
                 streamOutput.writeVInt(keys.size());
                 for (ESONEntry entry : keys) {
                     String key = entry.key() == null ? "" : entry.key();
-                    // byte[] bytes = key == null ? EMPTY_KEY : key.getBytes(StandardCharsets.UTF_8);
-                    // streamOutput.writeVInt(bytes.length);
-                    // streamOutput.writeBytes(bytes, 0, bytes.length);
-                    streamOutput.writeUTF8String(key);
+                    byte[] bytes = key == null ? EMPTY_KEY : key.getBytes(StandardCharsets.UTF_8);
+                    streamOutput.writeVInt(bytes.length);
+                    streamOutput.writeBytes(bytes, 0, bytes.length);
+                    // streamOutput.writeUTF8String(key);
                     streamOutput.writeByte(entry.type());
                     streamOutput.writeInt(entry.offsetOrCount());
                 }