/**
 * Decodes an RFC 2047 encoded email subject line.
 *
 * Retained only as a backward-compatible wrapper: all decoding is delegated
 * to wp_decode_rfc2047().
 *
 * @since 1.2.0
 * @deprecated {WP_VERSION} Use {@see wp_decode_rfc2047()}.
 *
 * @param string $subject Subject line.
 * @return string Decoded subject line, or the unmodified subject when it could not be decoded.
 */
function wp_iso_descrambler( $subject ) {
	$decoded = wp_decode_rfc2047( $subject );

	/*
	 * The decoder has error paths that return null, but this function has
	 * always returned a string, handing back the input when nothing could
	 * be decoded. Preserve that contract for existing callers.
	 */
	return is_string( $decoded ) ? $decoded : $subject;
}
!== $encoded[ $after_charset ] ) { + $at = $after_charset; + continue; + } + + $encoding_at = $after_charset + 1; + $encoding_length = strspn( $encoded, "!#$%&'*+-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ\^_`abcdefghijklmnopqrstuvwxyz{|}~", $encoding_at ); + if ( $encoding_length < 1 ) { + $at = $encoding_at; + continue; + } + + $after_encoding = $encoding_at + $encoding_length; + if ( $after_encoding >= $end || '?' !== $encoded[ $after_encoding ] ) { + $at = $after_encoding; + continue; + } + + // > encoded-text = 1* + $chunk_at = $after_encoding + 1; + $chunk_length = strspn( $encoded, "!\"#$%&'()*+,-./0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~", $chunk_at ); + if ( $chunk_length < 1 ) { + $at = $chunk_at; + continue; + } + + $closer_at = $chunk_at + $chunk_length; + if ( $closer_at >= $end || '?' !== $encoded[ $closer_at ] || '=' !== $encoded[ $closer_at + 1 ] ) { + $at = $closer_at; + continue; + } + + $after_encoded_word = $closer_at + 2; + /* + * RFC2047 says the total length MUST be no more than 75 characters, + * but doesn’t indicate resolution when the length is greater than this. + * + * - Should this be treated as unencoded text? + * - Should this be corrupted and rejected? + * - Should this be decoded anyway? + * + * Given that the intent is to fit encoded-words within a single line of + * a header and to ensure parsers need not lookahead too far, this will + * be decoded if possible. The failure was in the encoder, not here. + */ + + if ( 1 !== $encoding_length || 1 !== strspn( $encoded, 'bBqQ', $encoding_at, 1 ) ) { + goto handle_invalid; + } + + /* + * > If the mail reader does not support the character set used, it may + * > (a) display the 'encoded-word' as ordinary text (i.e., as it appears + * > in the header), (b) make a "best effort" to display using such + * > characters as are available, or (c) substitute an appropriate message + * > indicating that the decoded text could not be displayed. 
+ * + * > For the ISO-8859-* character sets, the mail reading program must at + * > least be able to display the characters which are also in the ASCII set. + */ + + // Shorten the charset to ignore any RFC2184/RC2231 language tag. + $charset_length = strcspn( $encoded, '*', $charset_at, $charset_length ); + + /** + * Disregard over-long charset names. This value was chosen by inspecting the + * names returned by {@see mb_convert_encoding()} and {@see mb_encoding_aliases()}. + * + * The goal is to pragmatically balance supporting all possible charsets and + * over-eagerly allocating strings, only to disregard them immediately. + */ + if ( $charset_length > 32 ) { + goto handle_invalid; + } + + /* + * Only UTF-8 is supported without conversion mechanisms. When errors are + * preserved, the ISO-8859 family’s ASCII-compatible characters will remain. + */ + $charset = substr( $encoded, $charset_at, $charset_length ); + if ( + ! in_array( strtoupper( $charset ), array( 'ASCII', 'US-ASCII', 'UTF8', 'UTF-8' ), true ) && + ! function_exists( 'mb_convert_encoding' ) && + ! function_exists( 'iconv' ) + ) { + goto handle_invalid; + } + + /* + * > A mail reader need not attempt to display the text associated with an + * > 'encoded-word' that is incorrectly formed. However, a mail reader + * > MUST NOT prevent the display or handling of a message because an + * > 'encoded-word' is incorrectly formed. + */ + + $encoding = $encoded[ $encoding_at ]; + if ( 'b' === $encoding || 'B' === $encoding ) { + $decoded_chunk = base64_decode( substr( $encoded, $chunk_at, $chunk_length ), false ); + if ( false === $decoded_chunk ) { + goto handle_invalid; + } + } else { + // @todo There is no error-handling indication here for the Q decoding. + $failed_decode = false; + $decoded_chunk = substr( $encoded, $chunk_at, $chunk_length ); + $decoded_chunk = strtr( $decoded_chunk, '_', ' ' ); + $decoded_chunk = preg_replace_callback( + '/=[0-9A-F]{2}|=/', // Lower-case are not allowed. 
+ function ( $matches ) use ( &$failed_decode ) { + if ( '=' === $matches[0] ) { + $failed_decode = true; + return $matches[0]; + } + return hex2bin( substr( $matches[0], 1, 2 ) ); + }, + $decoded_chunk + ); + + if ( $failed_decode ) { + goto handle_invalid; + } + } + + // Re-encode into UTF-8. + if ( in_array( strtoupper( $charset ), array( 'ASCII', 'US-ASCII', 'UTF8', 'UTF-8' ), true ) ) { + // Skip re-encoding for this one. + } elseif ( function_exists( 'mb_convert_encoding' ) ) { + try { + $decoded_chunk = mb_convert_encoding( $decoded_chunk, 'UTF-8', $charset ); + } catch ( \Throwable $exception ) { + goto handle_invalid; + } + } elseif ( function_exists( 'iconv' ) ) { + $decoded_chunk = iconv( $charset, 'UTF-8', $decoded_chunk ); + } + + // Verify the encoding. + if ( false === $decoded_chunk || ! wp_is_valid_utf8( $decoded_chunk ) ) { + goto handle_invalid; + } + + // Append the decoded chunk. + $prefix_length = $encoded_word_at - $was_at; + if ( $prefix_length === 0 || rfc2047_only_LWS( $encoded, $was_at, $prefix_length ) ) { + $decoded .= $decoded_chunk; + } else { + $prefix = substr( $encoded, $was_at, $prefix_length ); + $decoded .= "{$prefix}{$decoded_chunk}"; + } + $was_at = $after_encoded_word; + $at = $was_at; + continue; + + handle_invalid: + $at = $after_encoded_word; + switch ( $errors ) { + case 'bail-on-error': + restore_error_handler(); + return null; + + case 'preserve-errors': + break; + + case 'replace-errors': + $prefix_length = $encoded_word_at - $was_at; + if ( $prefix_length === 0 || rfc2047_only_LWS( $encoded, $was_at, $prefix_length ) ) { + $decoded .= "\u{FFFD}"; + } else { + $prefix = substr( $encoded, $was_at, $prefix_length ); + $decoded .= "{$prefix}\u{FFFD}"; + } + $was_at = $after_encoded_word; + break; + + default: + _doing_it_wrong( + __FUNCTION__, + "Use only one of 'preserve-errors' or 'replace-errors' for error-handling.", + '{WP_VERSION}' + ); + restore_error_handler(); + return null; + } + } + + if ( $at === 0 ) { + 
/**
 * Reports whether a byte span consists solely of linear white space.
 *
 * The span must contain at least one SPACE or HTAB, and every CRLF inside
 * it must be immediately followed by at least one SPACE or HTAB. An empty
 * span, a bare CR or LF, or any other byte makes the span fail the check.
 *
 * @since {WP_VERSION}
 * @access private
 *
 * @param string $string Text containing the span to examine.
 * @param int    $start  Byte offset at which the span begins.
 * @param int    $length Byte length of the span.
 * @return bool Whether the span is non-empty and holds only linear white space.
 */
function rfc2047_only_LWS( $string, $start, $length ) {
	$limit     = $start + $length;
	$cursor    = $start;
	$saw_space = false;

	while ( $cursor < $limit ) {
		// At most one CRLF may introduce each run of horizontal white space.
		$is_crlf = (
			$cursor + 1 < $limit &&
			"\r" === $string[ $cursor ] &&
			"\n" === $string[ $cursor + 1 ]
		);

		if ( $is_crlf ) {
			$cursor += 2;
		}

		// Measure the run of SPACE / HTAB bytes remaining inside the span.
		$run = strspn( $string, " \t", $cursor, $limit - $cursor );

		/*
		 * No white space here means either a non-LWS byte was found or a
		 * CRLF was left dangling without its continuation white space.
		 */
		if ( 0 === $run ) {
			return false;
		}

		$saw_space = true;
		$cursor   += $run;
	}

	return $saw_space;
}
/**
 * Data provider.
 *
 * Each dataset is a pair: the raw RFC 2047 input, followed by the expected
 * decoded string, or null when decoding is expected to fail.
 *
 * @return array[]
 */
public static function data_rfc2047_strings() {
	return array(
		'simple_q_encoded_ascii'                  => array(
			'=?US-ASCII?Q?Keith_Moore?=',
			'Keith Moore',
		),
		'simple_b_encoded_ascii'                  => array(
			'=?US-ASCII?B?SGVsbG8gV29ybGQ=?=',
			'Hello World',
		),
		'utf8_q_encoded_text'                     => array(
			'=?UTF-8?Q?Caf=C3=A9?=',
			'Café',
		),
		'utf8_b_encoded_text'                     => array(
			'=?UTF-8?B?4piVIFN0cmluZw==?=',
			'☕ String',
		),
		'iso_8859_1_q_encoded'                    => array(
			'=?ISO-8859-1?Q?Andr=E9?=',
			'André',
		),
		'iso_8859_1_b_encoded'                    => array(
			'=?ISO-8859-1?B?QW5kcuk=?=',
			'André',
		),
		'shift_jis_q_encoded'                     => array(
			'=?SHIFT_JIS?Q?=93=FA=96=7B=8C=EA?=',
			'日本語',
		),
		'shift_jis_b_encoded'                     => array(
			'=?SHIFT_JIS?B?k/qWe4zqg2WDWINn?=',
			'日本語テスト',
		),
		'multiple_encodings_in_one'               => array(
			'=?UTF-8?Q?Caf=C3=A9?= and =?US-ASCII?B?SGVsbG8=?=',
			'Café and Hello',
		),
		'underscore_to_space_q'                   => array(
			'=?US-ASCII?Q?Mary_Johnson?=',
			'Mary Johnson',
		),
		'equals_in_q_encoding'                    => array(
			'=?US-ASCII?Q?foo=3Dbar?=',
			'foo=bar',
		),
		'question_mark_in_q_encoding'             => array(
			'=?US-ASCII?Q?What=3F?=',
			'What?',
		),
		'invalid_charset'                         => array(
			'=?INVALID-CHARSET?Q?Test?=',
			null,
		),
		'missing_encoding_type'                   => array(
			'=?UTF-8?Test?=',
			null,
		),
		'invalid_encoding_indicator'              => array(
			'=?UTF-8?X?VGVzdA==?=',
			null,
		),
		'malformed_b_encoding'                    => array(
			'=?UTF-8?B?Invalid_Base64???',
			null,
		),
		'missing_closing_delimiter'               => array(
			'=?UTF-8?Q?Missing_End',
			null,
		),
		'empty_encoded_word'                      => array(
			'=?UTF-8?Q??=',
			'',
		),
		'only_equals_signs'                       => array(
			'=?UTF-8?Q?=3D=3D?=',
			'==',
		),
		'lowercase_q_encoding'                    => array(
			'=?UTF-8?q?lowercase?=',
			'lowercase',
		),
		'lowercase_b_encoding'                    => array(
			'=?UTF-8?b?bG93ZXJjYXNl?=',
			'lowercase',
		),
		'mixed_case_encoding'                     => array(
			'=?UTF-8?Q?Mixed_Case?=',
			'Mixed Case',
		),
		// "=3F=3F" is two encoded question marks and nothing else.
		'nested_encoded_words'                    => array(
			'=?UTF-8?Q?=3F=3F?=',
			'??',
		),
		'whitespace_around_encoded'               => array(
			' =?UTF-8?Q?Padded?= ',
			' Padded ',
		),
		'consecutive_encoded_words'               => array(
			'=?UTF-8?Q?First?==?UTF-8?Q?Second?=',
			'FirstSecond',
		),
		'space_between_consecutive_encoded_words' => array(
			'=?UTF-8?Q?First?= =?UTF-8?Q?Second?=',
			'FirstSecond',
		),
		'newline_between_encoded_words'           => array(
			"=?UTF-8?Q?First?=\n =?UTF-8?Q?Second?=",
			'FirstSecond',
		),
		'tab_between_encoded_words'               => array(
			"=?UTF-8?Q?First?=\t =?UTF-8?Q?Second?=",
			'FirstSecond',
		),
		// The payload starts with bytes F0 9F 98 80, i.e. U+1F600.
		'long_utf8_string_b'                      => array(
			'=?UTF-8?B?8J+YgCBUaGlzIGlzIGEgbG9uZyBzdHJpbmcgdGhhdCBzaG91bGQgYmUgZW5jb2RlZCBpbiBCYXNlNjQ=?=',
			'😀 This is a long string that should be encoded in Base64',
		),
		'long_utf8_string_q'                      => array(
			'=?UTF-8?Q?Long_string_with_=C3=A9_and_=E2=98=95?=',
			'Long string with é and ☕',
		),
		'special_chars_in_q'                      => array(
			'=?UTF-8?Q?Special_Chars_=28=29_=2B_=2D_=5F?=',
			'Special Chars () + - _',
		),
		'space_at_end_of_q'                       => array(
			'=?US-ASCII?Q?Space_at_end_?=',
			'Space at end ',
		),
		'space_at_beginning_of_q'                 => array(
			'=?US-ASCII?Q?_Space_at_beginning?=',
			' Space at beginning',
		),
		'only_spaces_in_q'                        => array(
			'=?US-ASCII?Q?_?=',
			' ',
		),
		'control_characters_q'                    => array(
			'=?US-ASCII?Q?Line1=0ALine2?=',
			"Line1\nLine2",
		),
		'backslash_in_q'                          => array(
			'=?US-ASCII?Q?Backslash_=5C?=',
			'Backslash \\',
		),
		'percent_sign_in_q'                       => array(
			'=?US-ASCII?Q?Percent_=25?=',
			'Percent %',
		),
		'dollar_sign_in_q'                        => array(
			'=?US-ASCII?Q?Dollar_=24?=',
			'Dollar $',
		),
		'at_symbol_in_q'                          => array(
			'=?US-ASCII?Q?At_=40?=',
			'At @',
		),
		'hash_symbol_in_q'                        => array(
			'=?US-ASCII?Q?Hash_=23?=',
			'Hash #',
		),
		'ampersand_in_q'                          => array(
			'=?US-ASCII?Q?Ampersand_=26?=',
			'Ampersand &',
		),
		'asterisk_in_q'                           => array(
			'=?US-ASCII?Q?Asterisk_=2A?=',
			'Asterisk *',
		),
		'exclamation_in_q'                        => array(
			'=?US-ASCII?Q?Exclamation_=21?=',
			'Exclamation !',
		),
		'quote_in_q'                              => array(
			'=?US-ASCII?Q?Quote_=22?=',
			'Quote "',
		),
		'single_quote_in_q'                       => array(
			'=?US-ASCII?Q?Single_=27?=',
			"Single '",
		),
		'colon_in_q'                              => array(
			'=?US-ASCII?Q?Colon_=3A?=',
			'Colon :',
		),
		'semicolon_in_q'                          => array(
			'=?US-ASCII?Q?Semicolon_=3B?=',
			'Semicolon ;',
		),
		'comma_in_q'                              => array(
			'=?US-ASCII?Q?Comma_=2C?=',
			'Comma ,',
		),
		'period_in_q'                             => array(
			'=?US-ASCII?Q?Period_=2E?=',
			'Period .',
		),
		'slash_in_q'                              => array(
			'=?US-ASCII?Q?Slash_=2F?=',
			'Slash /',
		),
		'less_than_in_q'                          => array(
			'=?US-ASCII?Q?Less_Than_=3C?=',
			'Less Than <',
		),
		'greater_than_in_q'                       => array(
			'=?US-ASCII?Q?Greater_Than_=3E?=',
			'Greater Than >',
		),
		'brackets_in_q'                           => array(
			'=?US-ASCII?Q?Brackets_=5B_=5D?=',
			'Brackets [ ]',
		),
		'curly_braces_in_q'                       => array(
			'=?US-ASCII?Q?Curly_=7B_=7D?=',
			'Curly { }',
		),
		'pipe_in_q'                               => array(
			'=?US-ASCII?Q?Pipe_=7C?=',
			'Pipe |',
		),
		'tilde_in_q'                              => array(
			'=?US-ASCII?Q?Tilde_=7E?=',
			'Tilde ~',
		),
		'caret_in_q'                              => array(
			'=?US-ASCII?Q?Caret_=5E?=',
			'Caret ^',
		),
		'accent_in_q'                             => array(
			'=?US-ASCII?Q?Accent_=60?=',
			'Accent `',
		),
		'invalid_hex_sequence_q'                  => array(
			'=?US-ASCII?Q?Invalid=XX?=',
			null,
		),
		'incomplete_hex_sequence_q'               => array(
			'=?US-ASCII?Q?Incomplete=X?=',
			null,
		),
		'hex_sequence_with_lowercase_q'           => array(
			'=?US-ASCII?Q?Lowercase_hex=c3=a9?=',
			'Lowercase hexé',
		),
		// Bytes F0 9F 98 80 are U+1F600.
		'non_ascii_in_b_encoding'                 => array(
			'=?UTF-8?B?8J+YgA==?=',
			'😀',
		),
		'b_encoding_with_whitespace'              => array(
			'=?UTF-8?B?SGVsb G8=?=',
			null,
		),
		'b_encoding_with_invalid_chars'           => array(
			'=?UTF-8?B?SGVsbG8@V29ybGQ=?=',
			null,
		),
		'empty_charset'                           => array(
			'=?UTF-8?Q??=',
			'',
		),
		'missing_charset'                         => array(
			'?Q?Test?=',
			null,
		),
		'charset_with_spaces'                     => array(
			'=? UTF-8 ?Q?Test?=',
			null,
		),
		'charset_with_dashes'                     => array(
			'=?UTF-8-with-dashes?Q?Test?=',
			null,
		),
		'multiple_question_marks_in_data'         => array(
			'=?US-ASCII?Q?Multiple=3F=3F=3F?=',
			'Multiple???',
		),
		'encoded_word_at_end_of_string'           => array(
			'Start =?UTF-8?Q?End?=',
			'Start End',
		),
		'encoded_word_at_beginning_of_string'     => array(
			'=?UTF-8?Q?Start?= End',
			'Start End',
		),
		'only_encoded_word'                       => array(
			'=?UTF-8?Q?Only_Word?=',
			'Only Word',
		),
		'invalid_base64_padding_b'                => array(
			'=?UTF-8?B?SGVsbG8=?',
			null,
		),
		'extra_equals_in_b_encoding'              => array(
			'=?UTF-8?B?SGVsbG8gV29ybGQ====?=',
			null,
		),
		'mixed_valid_invalid_encoding'            => array(
			'=?UTF-8?Q?Valid?= and =?INVALID?Q?Invalid?=',
			null,
		),
		'unclosed_encoded_word'                   => array(
			'=?UTF-8?Q?Unclosed',
			null,
		),
		'unclosed_with_spaces'                    => array(
			' =?UTF-8?Q?Unclosed ',
			null,
		),
		'encoded_word_with_extra_equals'          => array(
			'=?UTF-8?Q?Extra=3D?=',
			'Extra=',
		),
		'q_encoding_with_line_break'              => array(
			"=?US-ASCII?Q?Line1=0ALine2?=",
			"Line1\nLine2",
		),
		'b_encoding_multiline'                    => array(
			'=?UTF-8?B?VGhpcyBpcyBhIHRlc3QKbXVsdGlsaW5l?=',
			"This is a test\nmultiline",
		),
		'utf8_emoji_b'                            => array(
			'=?UTF-8?B?4q2QIOKtkA==?=',
			'⭐ ⭐',
		),
		'utf8_emoji_q'                            => array(
			'=?UTF-8?Q?=F0=9F=98=8A_=F0=9F=98=8A?=',
			'😊 😊',
		),
		'chinese_characters_b'                    => array(
			'=?UTF-8?B?5Lit5paH5rWL6K+V?=',
			'中文测试',
		),
		'chinese_characters_q'                    => array(
			'=?UTF-8?Q?=E4=B8=AD=E6=96=87=E6=B5=8B=E8=AF=95?=',
			'中文测试',
		),
		'georgian_characters_b'                   => array(
			'=?UTF-8?B?4YOQ4YOR4YOS4YOT4YOU4YOV4YOW4YOX4YOY?=',
			'აბგდევზთი',
		),
		'georgian_characters_q'                   => array(
			'=?UTF-8?Q?=E1=83=90=E1=83=91=E1=83=92=E1=83=93=E1=83=94=E1=83=95=E1=83=96=E1=83=97=E1=83=98?=',
			'აბგდევზთი',
		),
	);
}