@@ -117,7 +117,7 @@ public static function unescapeString(string $str, string $quoteChar): string
117117 't ' => "\t" ,
118118 'u ' => self ::unescapeUnicodeSequence ($ str , $ i ),
119119 $ quoteChar => $ quoteChar ,
120- default => throw new JsonCrawlerException ('' , \sprintf ('Invalid escape sequence " \\%s" in %s-quoted string ' , $ str [$ i + 1 ], "' " === $ quoteChar ? 'single ' : 'double ' )),
120+ default => throw new JsonCrawlerException ('' , \sprintf ('Invalid escape sequence " \\%s" in %s-quoted string. ' , $ str [$ i + 1 ], "' " === $ quoteChar ? 'single ' : 'double ' )),
121121 };
122122
123123 ++$ i ;
@@ -132,30 +132,33 @@ public static function unescapeString(string $str, string $quoteChar): string
/**
 * Decodes the JSON "\uXXXX" escape that starts at offset $i of $str.
 *
 * When the escape is a high surrogate (U+D800..U+DBFF) immediately followed
 * by a second "\uXXXX" escape holding a low surrogate (U+DC00..U+DFFF), both
 * escapes are combined into a single supplementary-plane character.
 *
 * @param string $str the string containing the escape sequence
 * @param int    $i   offset of the backslash that opens the escape; advanced
 *                    by 10 when a surrogate pair is consumed and by 4
 *                    otherwise (the caller is expected to perform the
 *                    remaining advance past the sequence)
 *
 * @return string the decoded character, UTF-8 encoded
 *
 * @throws JsonCrawlerException when the escape is truncated, is not made of
 *                              four hex digits, or denotes a codepoint that
 *                              cannot be encoded (e.g. an unpaired surrogate)
 */
private static function unescapeUnicodeSequence(string $str, int &$i): string
{
    // The four hex digits occupy offsets $i + 2 .. $i + 5.
    if (!isset($str[$i + 5]) || !ctype_xdigit($hex = substr($str, $i + 2, 4))) {
        throw new JsonCrawlerException('', 'Invalid unicode escape sequence.');
    }

    $codepoint = hexdec($hex);

    // A high surrogate only forms a pair when a complete second "\uXXXX"
    // escape follows directly (offsets $i + 6 .. $i + 11).
    $isSurrogatePair = false;
    $lowSurrogate = 0;
    if (0xD800 <= $codepoint
        && $codepoint <= 0xDBFF
        && isset($str[$i + 11])
        && '\\' === $str[$i + 6]
        && 'u' === $str[$i + 7]
    ) {
        $lowHex = substr($str, $i + 8, 4);
        if (ctype_xdigit($lowHex)) {
            $lowSurrogate = hexdec($lowHex);
            $isSurrogatePair = 0xDC00 <= $lowSurrogate && $lowSurrogate <= 0xDFFF;
        }
    }

    if ($isSurrogatePair) {
        // Each surrogate carries 10 payload bits of the supplementary codepoint.
        $codepoint = 0x10000 + (($codepoint & 0x3FF) << 10) + ($lowSurrogate & 0x3FF);
        $i += 10; // skip surrogate pair
    } else {
        // single Unicode character or invalid surrogate, skip the sequence
        $i += 4;
    }

    // mb_chr() returns false for codepoints that have no UTF-8 encoding
    // (such as a lone surrogate); report that instead of violating the
    // string return type.
    $chr = mb_chr($codepoint, 'UTF-8');
    if (false === $chr) {
        throw new JsonCrawlerException('', \sprintf('Invalid Unicode codepoint: U+%04X.', $codepoint));
    }

    return $chr;
}
160163
161164 /**
0 commit comments