General: Drop too-verbose protection against homograph usernames.

arnt · arnt · commit 2450987146eb · 2025-12-09T15:55:23.000+01:00
Props dmsnell
diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
@@ -2128,134 +2128,6 @@ function sanitize_file_name( $filename ) {
 	return apply_filters( 'sanitize_file_name', $filename, $filename_raw );
 }
 
-/**
- * Validates that a string contains only characters from a single unicode script.
- *
- * The function only considers alphabetic characters. It returns true if a string
- * contains no more than one unicode script, and false if it contains two or more.
- * An empty string is considered to contain no scripts, and thus returns true.
- *
- * IntlChar does not support returning the script property defined by
- * https://www.unicode.org/reports/tr24/, so this implementation uses a workaround.
- * It maps the known extension blocks ("latin extended a" etc) to the first block
- * for that script, and then checks that the string uses only a single block.
- *
- * This works for the scripts currently in Unicode, and should continue to work for
- * future scripts as long as each new script needs a single code block. While older
- * scripts may have multiple blocks, the Unicode committee has grown better at
- * estimating sizes high enough so that only one block is needed.
- *
- * @since 6.9.0
- *
- * @param string $input A string to check.
- * @return bool True if all letters in the string belong to the same unicode
- *              script or if the string is empty.
- *              False if letters from two more more scripts are included.
- */
-function uses_single_unicode_script( string $input ): bool {
-	if ( '' === $input ) {
-		return true;
-	}
-
-	if ( version_compare( PHP_VERSION, '7.4.0', '<' ) ) {
-		// Since mb_str_split is not available in PHP < 7.4 we can only check ASCII characters.
-		return (bool) preg_match( '/^[a-zA-Z0-9 _.\-@]+$/i', $input );
-	}
-
-	$block = 0;
-	// phpcs:ignore PHPCompatibility.FunctionUse.NewFunctions.mb_str_splitFound -- old versions of PHP are handled above
-	foreach ( mb_str_split( $input ) as $cp ) {
-		if ( IntlChar::isalpha( $cp ) ) {
-			$b = IntlChar::getBlockCode( $cp );
-			switch ( $b ) {
-				case IntlChar::BLOCK_CODE_LATIN_1_SUPPLEMENT:
-					// fall through
-				case IntlChar::BLOCK_CODE_LATIN_EXTENDED_A:
-					// fall through
-				case IntlChar::BLOCK_CODE_LATIN_EXTENDED_B:
-				case IntlChar::BLOCK_CODE_LATIN_EXTENDED_C:
-				case IntlChar::BLOCK_CODE_LATIN_EXTENDED_D:
-				case IntlChar::BLOCK_CODE_IPA_EXTENSIONS: // used in Ghana etc
-				case IntlChar::BLOCK_CODE_LATIN_EXTENDED_ADDITIONAL:
-					$b = IntlChar::BLOCK_CODE_BASIC_LATIN;
-					break;
-				case IntlChar::BLOCK_CODE_GREEK_EXTENDED:
-				case IntlChar::BLOCK_CODE_COPTIC:
-				case IntlChar::BLOCK_CODE_COPTIC_EPACT_NUMBERS:
-					// Greek and coptic overlap. Coptic
-					// looks like Greek upper case, so
-					// readers of Greek can read Coptic,
-					// but readers of Coptic can't
-					// necessarily read Greek. This led to
-					// an unfortunate situation in
-					// Unicode, where the two can't be
-					// properly distinguished by
-					// block. However, because of the
-					// overlap, this isn't really a
-					// problem.
-					$b = IntlChar::BLOCK_CODE_GREEK;
-					break;
-				case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED:
-				case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED_A:
-				case IntlChar::BLOCK_CODE_ETHIOPIC_SUPPLEMENT:
-					$b = IntlChar::BLOCK_CODE_ETHIOPIC;
-					break;
-				case IntlChar::BLOCK_CODE_ARABIC_EXTENDED_A:
-				case IntlChar::BLOCK_CODE_ARABIC_SUPPLEMENT:
-				case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_A:
-				case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_B:
-				case IntlChar::BLOCK_CODE_ARABIC_SUPPLEMENT:
-					$b = IntlChar::BLOCK_CODE_ARABIC;
-					break;
-				case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_A:
-				case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_B:
-					$b = IntlChar::BLOCK_CODE_CYRILLIC;
-					break;
-				case IntlChar::BLOCK_CODE_BOPOMOFO_EXTENDED:
-					$b = IntlChar::BLOCK_CODE_BOPOMOFO;
-					break;
-				case IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED:
-					$b = IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS;
-					break;
-				case IntlChar::BLOCK_CODE_DEVANAGARI_EXTENDED:
-					$b = IntlChar::BLOCK_CODE_DEVANAGARI;
-					break;
-				case IntlChar::BLOCK_CODE_HANGUL_COMPATIBILITY_JAMO:
-				case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_A:
-				case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_B:
-				case IntlChar::BLOCK_CODE_HANGUL_SYLLABLES:
-					$b = IntlChar::BLOCK_CODE_HANGUL_JAMO;
-					break;
-				case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_A:
-				case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_B:
-					$b = IntlChar::BLOCK_CODE_MYANMAR;
-					break;
-				case IntlChar::BLOCK_CODE_CJK_STROKES:
-				case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS:
-				case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
-				case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
-				case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C:
-				case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D:
-				case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS:
-				case IntlChar::BLOCK_CODE_CJK_RADICALS_SUPPLEMENT:
-				case IntlChar::BLOCK_CODE_ENCLOSED_CJK_LETTERS_AND_MONTHS:
-				case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_FORMS:
-				case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT:
-					$b = IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS;
-					break;
-			}
-			if ( 0 === $block ) {
-				$block = $b;
-			}
-			if ( $block !== $b ) {
-				return false;
-			}
-		}
-	}
-
-	return true;
-}
-
 /**
  * Sanitizes a username, stripping out unsafe characters.
  *
@@ -2279,11 +2151,6 @@ function sanitize_user( $username, $strict = false ) {
 	// Remove HTML entities.
 	$username = preg_replace( '/&.+?;/', '', $username );
 
-	// If mixing different scripts, remove all but ASCII.
-	if ( ! uses_single_unicode_script( $username ) ) {
-		$username = preg_replace( '|[^a-z0-9 _.\-@]|i', '', $username );
-	}
-
 	// If strict, remove reduce to letters and numbers.
 	if ( $strict ) {
 		$username = preg_replace( '|[^a-z0-9 _.\-@\p{L}\p{N}]|iu', '', $username );
diff --git a/tests/phpunit/tests/formatting/sanitizeUser.php b/tests/phpunit/tests/formatting/sanitizeUser.php
@@ -89,19 +89,4 @@ public function test_accepts_west_african_latin() {
 		$this->assertSame( $expected, sanitize_user( $expected ) );
 		$this->assertSame( $expected, sanitize_user( $encoded ) );
 	}
-
-	/*
-	 * Some people are worried about using letters that look alike
-	 * from different alphabets, for example the Cyrillic V looks
-	 * exactly like the Latin B. If any user names use confusable
-	 * letters like that pair, people are sure to have trouble
-	 * logging in, so we try to prevent people from painting
-	 * themselves into that corner.
-	 *
-	 * @ticket 31992
-	 */
-
-	public function test_blocks_latin_cyrillic_mixed_name() {
-		$this->assertSame( 'arn', sanitize_user( 'arn%D1%82' ) );
-	}
 }
diff --git a/tests/phpunit/tests/user.php b/tests/phpunit/tests/user.php
@@ -1118,8 +1118,6 @@ public function test_validate_utf8_usernames() {
 		}
 		/* WordPress approves of drab grey (grå) Norwegian weather */
 		$this->assertTrue( validate_username( 'grå' ) );
-		/* Latin I, Cyrillic V like latin B, Latin M */
-		$this->assertFalse( validate_username( 'IВM' ) );
 		/* Three Cyrillic letters */
 		$this->assertTrue( validate_username( 'ІВМ' ) );
 		/* A metal umlaut fails because validate_username is

Original file line number	Diff line number	Diff line change
`@@ -1118,8 +1118,6 @@ public function test_validate_utf8_usernames() {`
`1118`	`1118`	`}`
`1119`	`1119`	`/* WordPress approves of drab grey (grå) Norwegian weather */`
`1120`	`1120`	`$this->assertTrue( validate_username( 'grå' ) );`
`1121`		`- /* Latin I, Cyrillic V like latin B, Latin M */`
`1122`		`- $this->assertFalse( validate_username( 'IВM' ) );`
`1123`	`1121`	`/* Three Cyrillic letters */`
`1124`	`1122`	`$this->assertTrue( validate_username( 'ІВМ' ) );`
`1125`	`1123`	`/* A metal umlaut fails because validate_username is`