11<?php
22
33/**
4- * Fallback mechanism for safely validating UTF-8 bytes.
4+ * Finds spans of valid and invalid UTF-8 bytes in a given string.
5+ *
6+ * This is a low-level tool to power various UTF-8 functionality.
7+ * It scans through a string until it finds invalid byte spans.
8+ * When it does this, it does three things:
9+ *
10+ * - Assigns `$at` to the position after the last successful code point.
11+ * - Assigns `$invalid_length` to the length of the maximal subpart of
12+ * the invalid bytes starting at `$at`.
13+ * - Returns how many code points were successfully scanned.
14+ *
15+ * This information is enough to build a number of useful UTF-8 functions.
16+ *
17+ * Example:
18+ *
19+ * // ñ is U+F1, which in `ISO-8859-1`/`latin1`/`Windows-1252`/`cp1252` is 0xF1.
20+ * "Pi\xF1a" === $pineapple = mb_convert_encoding( "Piña", 'Windows-1252', 'UTF-8' );
21+ * $at = $invalid_length = 0;
22+ *
23+ * // The first step finds the invalid 0xF1 byte.
24+ * 2 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
25+ * $at === 2; $invalid_length === 1;
526 *
6- * By implementing a raw method here the code will behave in the same way on
7- * all installed systems, regardless of what extensions are installed.
27+ * // After advancing `$at` by `$invalid_length`, the second step continues to the end of the string.
28+ * 1 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
29+ * $at === 4; $invalid_length === 0;
830 *
9- * @see wp_is_valid_utf8
31+ * Note! This function’s many arguments are passed without an “options”
32+ * array. This choice is based on the fact that this is a low-level function
33+ * and there’s no need to create an array of items on every invocation.
1034 *
1135 * @since 6.9.0
1236 * @access private
1337 *
14- * @param string $bytes String which might contain text encoded as UTF-8.
15- * @return bool Whether the provided bytes can decode as valid UTF-8.
38+ * @param string $bytes UTF-8 encoded string which might include invalid spans of bytes.
39+ * @param int $at Where to start scanning.
40+ * @param int $invalid_length Will be set to how many bytes are to be ignored after `$at`.
41+ * @param int|null $max_bytes Stop scanning after this many bytes have been seen.
42+ * @param int|null $max_code_points Stop scanning after this many code points have been seen.
43+ * @return int How many code points were successfully scanned.
1644 */
17- function _wp_is_valid_utf8_fallback ( string $ bytes ): bool {
18- $ end = strlen ( $ bytes );
19-
20- for ( $ i = 0 ; $ i < $ end ; $ i ++ ) {
45+ function _wp_scan_utf8 ( string $ bytes , int &$ at , int &$ invalid_length , ?int $ max_bytes = null , ?int $ max_code_points = null ): int {
46+ $ byte_length = strlen ( $ bytes );
47+ $ end = min ( $ byte_length , $ at + ( $ max_bytes ?? PHP_INT_MAX ) );
48+ $ invalid_length = 0 ;
49+ $ count = 0 ;
50+ $ max_count = $ max_code_points ?? PHP_INT_MAX ;
51+
52+ for ( $ i = $ at ; $ i < $ end && $ count <= $ max_count ; $ i ++ ) {
2153 /*
2254 * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
2355 *
2456 * This optimization step improves the speed by 10x to 100x
2557 * depending on whether the JIT has optimized the function.
2658 */
27- $ i + = strspn (
59+ $ ascii_byte_count = strspn (
2860 $ bytes ,
2961 "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
3062 "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
3163 " ! \"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[ \\]^_`abcdefghijklmnopqrstuvwxyz{|}~ \x7f" ,
32- $ i
64+ $ i ,
65+ $ end - $ i
3366 );
67+
68+ if ( $ count + $ ascii_byte_count >= $ max_count ) {
69+ $ at = $ i + ( $ max_count - $ count );
70+ $ count = $ max_count ;
71+ return $ count ;
72+ }
73+
74+ $ count += $ ascii_byte_count ;
75+ $ i += $ ascii_byte_count ;
76+
3477 if ( $ i >= $ end ) {
35- break ;
78+ $ at = $ end ;
79+ return $ count ;
3680 }
3781
3882 /**
3983 * The above fast-track handled all single-byte UTF-8 characters. What
4084 * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8.
4185 *
4286 * Therefore everything past here is checking those multibyte sequences.
87+ *
88+ * It may look like there’s a need to check against the max bytes here,
89+ * but since each match of a single character returns, this function will
90+ * bail already if crossing the max-bytes threshold. This function SHALL
91+ * NOT return in the middle of a multi-byte character, so if a character
92+ * falls on each side of the max bytes, the entire character will be scanned.
93+ *
4394 * Because it’s possible that there are truncated characters, the use of
4495 * the null-coalescing operator with "\xC0" is a convenience for skipping
4596 * length checks on every continuation byte. This works because 0xC0 is
4697 * always invalid in a UTF-8 string, meaning that if the string has been
4798 * truncated, it will find 0xC0 and reject as invalid UTF-8.
4899 *
49- * > [The following table] lists all of the byte sequences that are well-formed
100+ * > [The following table] lists all of the byte sequences that are well-formed
50101 * > in UTF-8. A range of byte values such as A0..BF indicates that any byte
51102 * > from A0 to BF (inclusive) is well-formed in that position. Any byte value
52103 * > outside of the ranges listed is ill-formed.
@@ -66,29 +117,24 @@ function _wp_is_valid_utf8_fallback( string $bytes ): bool {
66117 * │ U+100000..U+10FFFF │ F4 │ 80..8F │ 80..BF │ 80..BF │
67118 * ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯
68119 *
69- * Notice that all valid third and forth bytes are in the range 80..BF. This
70- * validator takes advantage of that to only check the range of those bytes once.
71- *
72- * @see https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
73120 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
74121 */
75122
123+ // Valid two-byte code points.
76124 $ b1 = ord ( $ bytes [ $ i ] );
77125 $ b2 = ord ( $ bytes [ $ i + 1 ] ?? "\xC0" );
78126
79- // Valid two-byte code points.
80-
81127 if ( $ b1 >= 0xC2 && $ b1 <= 0xDF && $ b2 >= 0x80 && $ b2 <= 0xBF ) {
128+ ++$ count ;
82129 ++$ i ;
83130 continue ;
84131 }
85132
86- $ b3 = ord ( $ bytes [ $ i + 2 ] ?? "\xC0" );
87-
88133 // Valid three-byte code points.
134+ $ b3 = ord ( $ bytes [ $ i + 2 ] ?? "\xC0" );
89135
90136 if ( $ b3 < 0x80 || $ b3 > 0xBF ) {
91- return false ;
137+ goto invalid_utf8 ;
92138 }
93139
94140 if (
@@ -97,31 +143,108 @@ function _wp_is_valid_utf8_fallback( string $bytes ): bool {
97143 ( 0xED === $ b1 && $ b2 >= 0x80 && $ b2 <= 0x9F ) ||
98144 ( $ b1 >= 0xEE && $ b1 <= 0xEF && $ b2 >= 0x80 && $ b2 <= 0xBF )
99145 ) {
146+ ++$ count ;
100147 $ i += 2 ;
101148 continue ;
102149 }
103150
104- $ b4 = ord ( $ bytes [ $ i + 3 ] ?? "\xC0" );
105-
106151 // Valid four-byte code points.
152+ $ b4 = ord ( $ bytes [ $ i + 3 ] ?? "\xC0" );
107153
108154 if ( $ b4 < 0x80 || $ b4 > 0xBF ) {
109- return false ;
155+ goto invalid_utf8 ;
110156 }
111157
112158 if (
113159 ( 0xF0 === $ b1 && $ b2 >= 0x90 && $ b2 <= 0xBF ) ||
114160 ( $ b1 >= 0xF1 && $ b1 <= 0xF3 && $ b2 >= 0x80 && $ b2 <= 0xBF ) ||
115161 ( 0xF4 === $ b1 && $ b2 >= 0x80 && $ b2 <= 0x8F )
116162 ) {
163+ ++$ count ;
117164 $ i += 3 ;
118165 continue ;
119166 }
120167
121- // Any other sequence is invalid.
122- return false ;
168+ /**
169+ * When encountering invalid byte sequences, Unicode suggests finding the
170+ * maximal subpart of a text and replacing that subpart with a single
171+ * replacement character.
172+ *
173+ * > This practice is more secure because it does not result in the
174+ * > conversion consuming parts of valid sequences as though they were
175+ * > invalid. It also guarantees at least one replacement character will
176+ * > occur for each instance of an invalid sequence in the original text.
177+ * > Furthermore, this practice can be defined consistently for better
178+ * > interoperability between different implementations of conversion.
179+ *
180+ * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40630
181+ */
182+ invalid_utf8:
183+ $ at = $ i ;
184+ $ invalid_length = 1 ;
185+
186+ // Single-byte and two-byte characters.
187+ if ( ( 0x00 === ( $ b1 & 0x80 ) ) || ( 0xC0 === ( $ b1 & 0xE0 ) ) ) {
188+ return $ count ;
189+ }
190+
191+ $ b2 = ord ( $ bytes [ $ i + 1 ] ?? "\xC0" );
192+ $ b3 = ord ( $ bytes [ $ i + 2 ] ?? "\xC0" );
193+
194+ // Find the maximal subpart and skip past it.
195+ if ( 0xE0 === ( $ b1 & 0xF0 ) ) {
196+ // Three-byte characters.
197+ $ b2_valid = (
198+ ( 0xE0 === $ b1 && $ b2 >= 0xA0 && $ b2 <= 0xBF ) ||
199+ ( $ b1 >= 0xE1 && $ b1 <= 0xEC && $ b2 >= 0x80 && $ b2 <= 0xBF ) ||
200+ ( 0xED === $ b1 && $ b2 >= 0x80 && $ b2 <= 0x9F ) ||
201+ ( $ b1 >= 0xEE && $ b1 <= 0xEF && $ b2 >= 0x80 && $ b2 <= 0xBF )
202+ );
203+
204+ $ invalid_length = min ( $ end - $ i , $ b2_valid ? 2 : 1 );
205+ return $ count ;
206+ } elseif ( 0xF0 === ( $ b1 & 0xF8 ) ) {
207+ // Four-byte characters.
208+ $ b2_valid = (
209+ ( 0xF0 === $ b1 && $ b2 >= 0x90 && $ b2 <= 0xBF ) ||
210+ ( $ b1 >= 0xF1 && $ b1 <= 0xF3 && $ b2 >= 0x80 && $ b2 <= 0xBF ) ||
211+ ( 0xF4 === $ b1 && $ b2 >= 0x80 && $ b2 <= 0x8F )
212+ );
213+
214+ $ b3_valid = $ b3 >= 0x80 && $ b3 <= 0xBF ;
215+
216+ $ invalid_length = min ( $ end - $ i , $ b2_valid ? ( $ b3_valid ? 3 : 2 ) : 1 );
217+ return $ count ;
218+ }
219+
220+ return $ count ;
221+ }
222+
223+ $ at = $ i ;
224+ return $ count ;
225+ }
226+
227+ /**
228+ * Fallback mechanism for safely validating UTF-8 bytes.
229+ *
230+ * @see wp_is_valid_utf8()
231+ *
232+ * @since 6.9.0
233+ * @access private
234+ *
235+ * @param string $bytes String which might contain text encoded as UTF-8.
236+ * @return bool Whether the provided bytes can decode as valid UTF-8.
237+ */
238+ function _wp_is_valid_utf8_fallback ( string $ bytes ): bool {
239+ $ bytes_length = strlen ( $ bytes );
240+ if ( 0 === $ bytes_length ) {
241+ return true ;
123242 }
124243
125- // Reaching the end implies validating every byte.
126- return true ;
244+ $ next_byte_at = 0 ;
245+ $ invalid_length = 0 ;
246+
247+ _wp_scan_utf8 ( $ bytes , $ next_byte_at , $ invalid_length );
248+
249+ return $ bytes_length === $ next_byte_at && 0 === $ invalid_length ;
127250}
0 commit comments