@@ -2128,134 +2128,6 @@ function sanitize_file_name( $filename ) {
21282128 return apply_filters ( 'sanitize_file_name ' , $ filename , $ filename_raw );
21292129}
21302130
/**
 * Validates that a string contains only characters from a single unicode script.
 *
 * The function only considers alphabetic characters. It returns true if a string
 * contains no more than one unicode script, and false if it contains two or more.
 * An empty string is considered to contain no scripts, and thus returns true.
 *
 * IntlChar does not support returning the script property defined by
 * https://www.unicode.org/reports/tr24/, so this implementation uses a workaround.
 * It maps the known extension blocks ("latin extended a" etc) to the first block
 * for that script, and then checks that the string uses only a single block.
 *
 * This works for the scripts currently in Unicode, and should continue to work for
 * future scripts as long as each new script needs a single code block. While older
 * scripts may have multiple blocks, the Unicode committee has grown better at
 * estimating sizes high enough so that only one block is needed.
 *
 * When `mb_str_split()` (PHP 7.4+ with mbstring) or the `IntlChar` class (intl
 * extension) is unavailable, the script of a character cannot be determined. In
 * that case the function falls back to a conservative check and returns true only
 * if the string consists entirely of ASCII letters, digits, spaces and `_.-@`.
 *
 * @since 6.9.0
 *
 * @param string $input A string to check.
 * @return bool True if all letters in the string belong to the same unicode
 *              script or if the string is empty.
 *              False if letters from two or more scripts are included.
 */
function uses_single_unicode_script( string $input ): bool {
	if ( '' === $input ) {
		return true;
	}

	/*
	 * Both mb_str_split() and IntlChar come from optional extensions (and
	 * mb_str_split() additionally requires PHP 7.4). Testing for the actual
	 * capabilities, rather than the PHP version, avoids a fatal error on hosts
	 * where either one is missing. Without them only ASCII can be verified.
	 */
	if ( ! function_exists( 'mb_str_split' ) || ! class_exists( 'IntlChar' ) ) {
		return (bool) preg_match( '/^[a-zA-Z0-9 _.\-@]+$/i', $input );
	}

	// Block code of the first letter seen; 0 means no letter has been seen yet.
	$block = 0;

	// IntlChar expects UTF-8 characters, so split as UTF-8 regardless of the configured internal encoding.
	// phpcs:ignore PHPCompatibility.FunctionUse.NewFunctions.mb_str_splitFound -- availability is checked above
	foreach ( mb_str_split( $input, 1, 'UTF-8' ) as $cp ) {
		// Digits, punctuation, whitespace and invalid sequences do not identify a script.
		if ( ! IntlChar::isalpha( $cp ) ) {
			continue;
		}

		$b = IntlChar::getBlockCode( $cp );

		// Fold each known extension block into the primary block of its script.
		switch ( $b ) {
			case IntlChar::BLOCK_CODE_LATIN_1_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_A:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_B:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_C:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_D:
			case IntlChar::BLOCK_CODE_IPA_EXTENSIONS: // used in Ghana etc
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_ADDITIONAL:
				$b = IntlChar::BLOCK_CODE_BASIC_LATIN;
				break;
			case IntlChar::BLOCK_CODE_GREEK_EXTENDED:
			case IntlChar::BLOCK_CODE_COPTIC:
			case IntlChar::BLOCK_CODE_COPTIC_EPACT_NUMBERS:
				/*
				 * Greek and Coptic overlap. Coptic looks like Greek upper case,
				 * so readers of Greek can read Coptic, but readers of Coptic
				 * can't necessarily read Greek. This led to an unfortunate
				 * situation in Unicode, where the two can't be properly
				 * distinguished by block. However, because of the overlap,
				 * this isn't really a problem.
				 */
				$b = IntlChar::BLOCK_CODE_GREEK;
				break;
			case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED:
			case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_ETHIOPIC_SUPPLEMENT:
				$b = IntlChar::BLOCK_CODE_ETHIOPIC;
				break;
			case IntlChar::BLOCK_CODE_ARABIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_ARABIC_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_A:
			case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_B:
				$b = IntlChar::BLOCK_CODE_ARABIC;
				break;
			case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_B:
				$b = IntlChar::BLOCK_CODE_CYRILLIC;
				break;
			case IntlChar::BLOCK_CODE_BOPOMOFO_EXTENDED:
				$b = IntlChar::BLOCK_CODE_BOPOMOFO;
				break;
			case IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED:
				$b = IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS;
				break;
			case IntlChar::BLOCK_CODE_DEVANAGARI_EXTENDED:
				$b = IntlChar::BLOCK_CODE_DEVANAGARI;
				break;
			case IntlChar::BLOCK_CODE_HANGUL_COMPATIBILITY_JAMO:
			case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_A:
			case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_B:
			case IntlChar::BLOCK_CODE_HANGUL_SYLLABLES:
				$b = IntlChar::BLOCK_CODE_HANGUL_JAMO;
				break;
			case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_A:
			case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_B:
				$b = IntlChar::BLOCK_CODE_MYANMAR;
				break;
			case IntlChar::BLOCK_CODE_CJK_STROKES:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS:
			case IntlChar::BLOCK_CODE_CJK_RADICALS_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_ENCLOSED_CJK_LETTERS_AND_MONTHS:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_FORMS:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT:
				$b = IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS;
				break;
		}

		// The first letter fixes the expected block for the rest of the string.
		if ( 0 === $block ) {
			$block = $b;
		}

		// Any later letter from a different (folded) block means mixed scripts.
		if ( $block !== $b ) {
			return false;
		}
	}

	return true;
}
2258-
22592131/**
22602132 * Sanitizes a username, stripping out unsafe characters.
22612133 *
@@ -2279,11 +2151,6 @@ function sanitize_user( $username, $strict = false ) {
22792151 // Remove HTML entities.
22802152 $ username = preg_replace ( '/&.+?;/ ' , '' , $ username );
22812153
2282- // If mixing different scripts, remove all but ASCII.
2283- if ( ! uses_single_unicode_script ( $ username ) ) {
2284- $ username = preg_replace ( '|[^a-z0-9 _.\-@]|i ' , '' , $ username );
2285- }
2286-
22872154 // If strict, reduce to letters and numbers.
22882155 if ( $ strict ) {
22892156 $ username = preg_replace ( '|[^a-z0-9 _.\-@\p{L}\p{N}]|iu ' , '' , $ username );
0 commit comments