From fbb56daa279e65ad6905ec1b73d94f1eda7e291b Mon Sep 17 00:00:00 2001 From: Balbino Gamboa Date: Fri, 21 Nov 2025 14:01:40 -0500 Subject: [PATCH] Fix: Enforce strict 4-digit limit on JSON Unicode escapes to prevent greedy parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The JSON lexer was incorrectly parsing Unicode escapes by consuming more characters than the standard 4 hex digits specified in RFC 8259. This could lead to incorrect parsing when valid hex characters followed a Unicode escape sequence. Changes: - Refactored appendHex() to strictly parse exactly 4 hex digits - Replaced fromHexChar() with a fast O(1) lookup table (HEX_TABLE) - Added explicit validation that fails on invalid hex digits - Added test case to verify correct parsing of \u00f3a as 'óa' --- .../serialization/json/JsonParserTest.kt | 8 ++++ .../json/internal/lexer/AbstractJsonLexer.kt | 48 ++++++++++++------- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonParserTest.kt b/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonParserTest.kt index 94f7052cd4..69e4c392ed 100644 --- a/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonParserTest.kt +++ b/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonParserTest.kt @@ -113,4 +113,12 @@ class JsonParserTest : JsonTestBase() { assertTrue { value.jsonPrimitive.isString } assertEquals("null", obj["k"]!!.jsonPrimitive.content) } + + @Test + fun testUnicodeEscapeWithFollowingHex() { + // Test case for greedy parsing bug + val input = "\"\\u00f3a\"" + val decoded = Json.decodeFromString<String>(input) + assertEquals("óa", decoded, "Should parse 'ó' then 'a', not try to consume 'a'") + } } diff --git a/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/AbstractJsonLexer.kt b/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/AbstractJsonLexer.kt index 
5f570a95ec..85a2704470 100644 --- a/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/AbstractJsonLexer.kt +++ b/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/AbstractJsonLexer.kt @@ -148,6 +148,17 @@ internal abstract class AbstractJsonLexer { protected abstract val source: CharSequence + // Lookup table for fast hex digit validation and conversion + companion object { + private val HEX_TABLE = IntArray(128) { -1 } + + init { + for (i in '0'..'9') HEX_TABLE[i.code] = i - '0' + for (i in 'a'..'f') HEX_TABLE[i.code] = i - 'a' + 10 + for (i in 'A'..'F') HEX_TABLE[i.code] = i - 'A' + 10 + } + } + @JvmField internal var currentPosition: Int = 0 // position in source @@ -498,6 +509,7 @@ internal abstract class AbstractJsonLexer { } private fun appendHex(source: CharSequence, startPos: Int): Int { + // Ensure we have at least 4 characters for the unicode sequence if (startPos + 4 >= source.length) { currentPosition = startPos ensureHaveChars() @@ -505,12 +517,25 @@ internal abstract class AbstractJsonLexer { fail("Unexpected EOF during unicode escape") return appendHex(source, currentPosition) } - escapedString.append( - ((fromHexChar(source, startPos) shl 12) + - (fromHexChar(source, startPos + 1) shl 8) + - (fromHexChar(source, startPos + 2) shl 4) + - fromHexChar(source, startPos + 3)).toChar() - ) + + var value = 0 + // Strict 4-iteration loop to prevent greedy parsing and comply with RFC 8259 + for (i in 0..3) { + val char = source[startPos + i] + val code = char.code + + // Fast O(1) lookup. 
Check range to avoid IndexOutOfBounds for non-ASCII chars + val digit = if (code < 128) HEX_TABLE[code] else -1 + + if (digit == -1) { + fail("Invalid Unicode escape sequence: expected hex digit, found '$char'") + } + + // Accumulate result + value = (value shl 4) or digit + } + + escapedString.append(value.toChar()) return startPos + 4 } @@ -518,15 +543,6 @@ internal abstract class AbstractJsonLexer { if (!condition) fail(message(), position) } - private fun fromHexChar(source: CharSequence, currentPosition: Int): Int { - return when (val character = source[currentPosition]) { - in '0'..'9' -> character.code - '0'.code - in 'a'..'f' -> character.code - 'a'.code + 10 - in 'A'..'F' -> character.code - 'A'.code + 10 - else -> fail("Invalid toHexChar char '$character' in unicode escape") - } - } - fun skipElement(allowLenientStrings: Boolean) { val tokenStack = mutableListOf<Byte>() var lastToken = peekNextToken() @@ -759,4 +775,4 @@ internal abstract class AbstractJsonLexer { currentPosition = snapshot } } -} +} \ No newline at end of file