Skip to content

Commit 39fb139

Browse files
committed
MIME: Add RFC2047 text decoder.
Questions arise around unspecified failure behaviors. - What if the syntax is obviously supposed to be an encoded word but technically isn’t? For example, it’s missing a closing '?'. It may be computationally heavy to _guess_ whether something is broken syntax, so for some failures it is ambiguous whether the decoder should copy the input through as plaintext or return null. - What do other high-quality libraries do with errors?
1 parent 2ae6561 commit 39fb139

File tree

5 files changed

+725
-26
lines changed

5 files changed

+725
-26
lines changed

src/wp-includes/formatting.php

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3637,31 +3637,13 @@ function is_email( $email, $deprecated = false ) {
36373637
* Converts to ASCII from email subjects.
36383638
*
36393639
* @since 1.2.0
3640-
*
36413640
* @param string $subject Subject line.
36423641
* @return string Converted string to ASCII.
3643-
*/
3644-
function wp_iso_descrambler( $subject ) {
3645-
/* this may only work with iso-8859-1, I'm afraid */
3646-
if ( ! preg_match( '#\=\?(.+)\?Q\?(.+)\?\=#i', $subject, $matches ) ) {
3647-
return $subject;
3648-
}
3649-
3650-
$subject = str_replace( '_', ' ', $matches[2] );
3651-
return preg_replace_callback( '#\=([0-9a-f]{2})#i', '_wp_iso_convert', $subject );
3652-
}
3653-
3654-
/**
3655-
* Helper function to convert hex encoded chars to ASCII.
3656-
*
3657-
* @since 3.1.0
3658-
* @access private
3642+
* @deprecated {WP_VERSION} Use {@see wp_decode_rfc2047()}.
36593643
*
3660-
* @param array $matches The preg_replace_callback matches array.
3661-
* @return string Converted chars.
36623644
*/
3663-
function _wp_iso_convert( $matches ) {
3664-
return chr( hexdec( strtolower( $matches[1] ) ) );
3645+
function wp_iso_descrambler( $subject ) {
	// Announce the deprecation at runtime, pointing callers at the replacement.
	_deprecated_function( __FUNCTION__, '{WP_VERSION}', 'wp_decode_rfc2047()' );

	// Delegate with the default error mode.
	return wp_decode_rfc2047( $subject );
}
36663648

36673649
/**

src/wp-includes/rfc2047-mime.php

Lines changed: 336 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,336 @@
1+
<?php
2+
3+
/**
4+
* Implements the decoder from RFC 2047:
5+
* MIME Part 3: Message Header Extensions for Non-ASCII Text.
6+
*
7+
* This module contains decoding functions for supported MIME
8+
* encodings as are used with email servers which don’t support
9+
* or haven’t activated UTF-8 support.
10+
*
11+
* @see https://www.rfc-editor.org/rfc/rfc2047
12+
*
13+
* @package WordPress
14+
* @subpackage rfc2047
15+
*/
16+
17+
/**
 * Decodes text potentially containing RFC2047 MIME encoded words.
 *
 * Returns the decoded text as UTF-8. Returns `null` when an invalid encoded
 * word is found and `$errors` is `'bail-on-error'` or an unrecognized value.
 *
 * Example:
 *
 *     // Quoted forms have non-printable ASCII encoded as octets.
 *     'this is some text' === wp_decode_rfc2047( '=?iso-8859-1?q?this=20is_some=20text?=' );
 *     '👌' === wp_decode_rfc2047( '=?utf-8?q?=F0=9F=91=8C?=' );
 *
 *     // Binary forms are base64-encoded.
 *     '👌' === wp_decode_rfc2047( '=?utf-8?B?8J+RjA==?=' );
 *     'םולש ןב ילטפנ' === wp_decode_rfc2047( '=?iso-8859-8?b?7eXs+SDv4SDp7Oj08A==?=' );
 *
 *     // Character sets are re-encoded into UTF-8.
 *     '500¥' === wp_decode_rfc2047( '=?iso-8859-1?Q?500=A5?=' );
 *     '🏴󠁧󠁢󠁥󠁮󠁧󠁿' === wp_decode_rfc2047( '=?GB-18030?Q?=949=C82=D36=A01=D36=9F6=D36=9F9=D36=A08=D36=A01=D36=A25?=' );
 *
 *     // Linear white-space between adjacent encoded words is removed.
 *     'ab c d e' === wp_decode_rfc2047( '=?ASCII?Q?a?= =?ASCII?Q?b?= c d=?ASCII?Q?=20?==?ASCII?Q?e?=' );
 *
 *     // Error-handling is up to the call site (lower-case hex digits are invalid in Q-encoding).
 *     '=?UTF-8?Q?=6f?=' === wp_decode_rfc2047( '=?UTF-8?Q?=6f?=' );
 *     '=?UTF-8?Q?=6f?=' === wp_decode_rfc2047( '=?UTF-8?Q?=6f?=', 'preserve-errors' );
 *     '�' === wp_decode_rfc2047( '=?UTF-8?Q?=6f?=', 'replace-errors' );
 *     null === wp_decode_rfc2047( '=?UTF-8?Q?=6f?=', 'bail-on-error' );
 *
 *     // Invalid character encodings are errors.
 *     null === wp_decode_rfc2047( '=?UTF-8?Q?=C0?=', 'bail-on-error' );
 *
 * @see https://www.rfc-editor.org/rfc/rfc2047
 *
 * @since {WP_VERSION}
 *
 * @param string                                                $encoded US-ASCII text potentially containing MIME encoded words.
 * @param ?('preserve-errors'|'replace-errors'|'bail-on-error') $errors  Optional. How to handle invalid encoded words.
 *                                                                       Default is to preserve invalid encoded words as plaintext.
 * @return string|null Decoded string in UTF-8, or `null` when an invalid encoded word is
 *                     encountered and `$errors` is `'bail-on-error'` or an unrecognized value.
 */
function wp_decode_rfc2047( $encoded, $errors = 'preserve-errors' ) {
	/*
	 * iconv_mime_decode() is not used here because it does not give control
	 * over error-handling at the granularity necessary for this decoder.
	 */

	/*
	 * Text without an encoded-word opener has nothing to decode. Returning
	 * before the error handler is installed ensures this path cannot leave
	 * the handler registered.
	 */
	if ( false === strpos( $encoded, '=?' ) ) {
		return $encoded;
	}

	$decoded = '';
	$end     = strlen( $encoded );
	$at      = 0;
	$was_at  = 0;

	// Characters allowed in an RFC2047 `token` (used for charset and encoding).
	$token_chars = "!#$%&'*+-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ\^_`abcdefghijklmnopqrstuvwxyz{|}~";

	/*
	 * Convert warnings from the charset converters into exceptions so that
	 * a failed conversion can be caught below. Every return path after this
	 * point must call restore_error_handler().
	 */
	set_error_handler(
		static function ( $errno, $errstr ) {
			if (
				str_starts_with( $errstr, 'mb_convert_encoding():' ) ||
				str_starts_with( $errstr, 'iconv():' )
			) {
				throw new Error( $errstr );
			}

			return false;
		},
		E_WARNING
	);

	while ( $at < $end ) {
		$encoded_word_at = strpos( $encoded, '=?', $at );
		if ( false === $encoded_word_at ) {
			break;
		}

		/*
		 * > charset = token
		 * > token = 1*<Any CHAR except SPACE, CTLs, and especials>
		 * > especials = "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / <"> / "/" / "[" / "]" / "?" / "." / "="
		 * > CHAR = %00–%7F
		 * > CTL = %00–%1F
		 * > SPACE = %20
		 */
		$charset_at     = $encoded_word_at + 2;
		$charset_length = strspn( $encoded, $token_chars, $charset_at );
		if ( $charset_length < 1 ) {
			$at = $charset_at;
			continue;
		}

		$after_charset = $charset_at + $charset_length;
		if ( $after_charset >= $end || '?' !== $encoded[ $after_charset ] ) {
			$at = $after_charset;
			continue;
		}

		$encoding_at     = $after_charset + 1;
		$encoding_length = strspn( $encoded, $token_chars, $encoding_at );
		if ( $encoding_length < 1 ) {
			$at = $encoding_at;
			continue;
		}

		$after_encoding = $encoding_at + $encoding_length;
		if ( $after_encoding >= $end || '?' !== $encoded[ $after_encoding ] ) {
			$at = $after_encoding;
			continue;
		}

		// > encoded-text = 1*<Any printable ASCII character other than "?" or SPACE>
		$chunk_at     = $after_encoding + 1;
		$chunk_length = strspn( $encoded, "!\"#$%&'()*+,-./0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~", $chunk_at );
		if ( $chunk_length < 1 ) {
			$at = $chunk_at;
			continue;
		}

		/*
		 * The closer is the two bytes "?=". Both must lie inside the string;
		 * checking `$closer_at + 1` avoids reading past the end when the input
		 * is truncated right after the "?".
		 */
		$closer_at = $chunk_at + $chunk_length;
		if ( $closer_at + 1 >= $end || '?' !== $encoded[ $closer_at ] || '=' !== $encoded[ $closer_at + 1 ] ) {
			$at = $closer_at;
			continue;
		}

		$after_encoded_word = $closer_at + 2;

		/*
		 * RFC2047 says the total length MUST be no more than 75 characters,
		 * but doesn’t indicate resolution when the length is greater than this.
		 *
		 *  - Should this be treated as unencoded text?
		 *  - Should this be corrupted and rejected?
		 *  - Should this be decoded anyway?
		 *
		 * Given that the intent is to fit encoded-words within a single line of
		 * a header and to ensure parsers need not lookahead too far, this will
		 * be decoded if possible. The failure was in the encoder, not here.
		 */

		// Only the single-letter "B" and "Q" encodings are recognized.
		if ( 1 !== $encoding_length || 1 !== strspn( $encoded, 'bBqQ', $encoding_at, 1 ) ) {
			goto handle_invalid;
		}

		/*
		 * > If the mail reader does not support the character set used, it may
		 * > (a) display the 'encoded-word' as ordinary text (i.e., as it appears
		 * > in the header), (b) make a "best effort" to display using such
		 * > characters as are available, or (c) substitute an appropriate message
		 * > indicating that the decoded text could not be displayed.
		 *
		 * > For the ISO-8859-* character sets, the mail reading program must at
		 * > least be able to display the characters which are also in the ASCII set.
		 */

		// Shorten the charset to ignore any RFC2184/RFC2231 language tag.
		$charset_length = strcspn( $encoded, '*', $charset_at, $charset_length );

		/*
		 * Disregard empty and over-long charset names. The upper bound was chosen
		 * by inspecting the names returned by {@see mb_convert_encoding()} and
		 * {@see mb_encoding_aliases()}.
		 *
		 * The goal is to pragmatically balance supporting all possible charsets and
		 * over-eagerly allocating strings, only to disregard them immediately.
		 */
		if ( $charset_length < 1 || $charset_length > 32 ) {
			goto handle_invalid;
		}

		/*
		 * Only UTF-8 is supported without conversion mechanisms. When errors are
		 * preserved, the ISO-8859 family’s ASCII-compatible characters will remain.
		 */
		$charset          = substr( $encoded, $charset_at, $charset_length );
		$needs_conversion = ! in_array( strtoupper( $charset ), array( 'ASCII', 'US-ASCII', 'UTF8', 'UTF-8' ), true );
		if (
			$needs_conversion &&
			! function_exists( 'mb_convert_encoding' ) &&
			! function_exists( 'iconv' )
		) {
			goto handle_invalid;
		}

		/*
		 * > A mail reader need not attempt to display the text associated with an
		 * > 'encoded-word' that is incorrectly formed. However, a mail reader
		 * > MUST NOT prevent the display or handling of a message because an
		 * > 'encoded-word' is incorrectly formed.
		 */

		$encoding = $encoded[ $encoding_at ];
		if ( 'b' === $encoding || 'B' === $encoding ) {
			/*
			 * NOTE(review): in non-strict mode base64_decode() discards characters
			 * outside the base64 alphabet rather than failing, so this check is
			 * expected to trigger rarely. Confirm whether strict mode is intended.
			 */
			$decoded_chunk = base64_decode( substr( $encoded, $chunk_at, $chunk_length ), false );
			if ( false === $decoded_chunk ) {
				goto handle_invalid;
			}
		} else {
			// A bare "=" not followed by two upper-case hex digits marks the word invalid.
			$failed_decode = false;
			$decoded_chunk = substr( $encoded, $chunk_at, $chunk_length );
			$decoded_chunk = strtr( $decoded_chunk, '_', ' ' );
			$decoded_chunk = preg_replace_callback(
				'/=[0-9A-F]{2}|=/', // Lower-case are not allowed.
				static function ( $matches ) use ( &$failed_decode ) {
					if ( '=' === $matches[0] ) {
						$failed_decode = true;
						return $matches[0];
					}
					return hex2bin( substr( $matches[0], 1, 2 ) );
				},
				$decoded_chunk
			);

			if ( $failed_decode ) {
				goto handle_invalid;
			}
		}

		/*
		 * Re-encode into UTF-8. Converter warnings are turned into exceptions by
		 * the handler installed above, so both converters are guarded; a failure
		 * is recorded as `false` and rejected by the verification step below.
		 */
		if ( ! $needs_conversion ) {
			// Already UTF-8 compatible; skip re-encoding.
		} elseif ( function_exists( 'mb_convert_encoding' ) ) {
			try {
				$decoded_chunk = mb_convert_encoding( $decoded_chunk, 'UTF-8', $charset );
			} catch ( \Throwable $exception ) {
				$decoded_chunk = false;
			}
		} elseif ( function_exists( 'iconv' ) ) {
			try {
				$decoded_chunk = iconv( $charset, 'UTF-8', $decoded_chunk );
			} catch ( \Throwable $exception ) {
				$decoded_chunk = false;
			}
		}

		// Verify the encoding.
		if ( false === $decoded_chunk || ! wp_is_valid_utf8( $decoded_chunk ) ) {
			goto handle_invalid;
		}

		/*
		 * Append the decoded chunk. Text since the last consumed position is
		 * dropped when it is only linear white space, otherwise it is kept.
		 */
		$prefix_length = $encoded_word_at - $was_at;
		if ( 0 === $prefix_length || rfc2047_only_LWS( $encoded, $was_at, $prefix_length ) ) {
			$decoded .= $decoded_chunk;
		} else {
			$prefix   = substr( $encoded, $was_at, $prefix_length );
			$decoded .= "{$prefix}{$decoded_chunk}";
		}
		$was_at = $after_encoded_word;
		$at     = $was_at;
		continue;

		handle_invalid:
		$at = $after_encoded_word;
		switch ( $errors ) {
			case 'bail-on-error':
				restore_error_handler();
				return null;

			case 'preserve-errors':
				// Leave `$was_at` untouched so the raw text is copied through later.
				break;

			case 'replace-errors':
				$prefix_length = $encoded_word_at - $was_at;
				if ( 0 === $prefix_length || rfc2047_only_LWS( $encoded, $was_at, $prefix_length ) ) {
					$decoded .= "\u{FFFD}";
				} else {
					$prefix   = substr( $encoded, $was_at, $prefix_length );
					$decoded .= "{$prefix}\u{FFFD}";
				}
				$was_at = $after_encoded_word;
				break;

			default:
				_doing_it_wrong(
					__FUNCTION__,
					"Use only one of 'preserve-errors', 'replace-errors', or 'bail-on-error' for error-handling.",
					'{WP_VERSION}'
				);
				restore_error_handler();
				return null;
		}
	}

	// Copy through whatever follows the last consumed encoded word.
	$decoded .= substr( $encoded, $was_at );

	restore_error_handler();

	return $decoded;
}
298+
299+
/**
 * Determines if a span of text represents only linear white space.
 *
 * The span qualifies when it is non-empty and consists solely of one or
 * more runs, each being an optional CRLF followed by at least one SPACE
 * or HTAB. A CRLF that is not followed by SPACE/HTAB disqualifies the span.
 *
 * @since {WP_VERSION}
 * @access private
 *
 * @param string $string Text containing the span to examine.
 * @param int    $start  Byte offset at which the span begins.
 * @param int    $length Byte length of the span.
 * @return bool Whether the span contains only linear white space.
 */
function rfc2047_only_LWS( $string, $start, $length ) {
	$limit     = $start + $length;
	$saw_space = false;

	for ( $cursor = $start; $cursor < $limit; $cursor += $run ) {
		// A run may open with a single CRLF, provided both bytes are inside the span.
		if ( $cursor + 1 < $limit && "\r" === $string[ $cursor ] && "\n" === $string[ $cursor + 1 ] ) {
			$cursor += 2;
		}

		// Every run must then contain at least one SPACE or HTAB.
		$run = strspn( $string, " \t", $cursor, $limit - $cursor );
		if ( 0 === $run ) {
			return false;
		}

		$saw_space = true;
	}

	return $saw_space;
}

src/wp-mail.php

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -124,11 +124,7 @@
124124
$subject = trim( $line );
125125
$subject = substr( $subject, 9, strlen( $subject ) - 9 );
126126
// Captures any text in the subject before $phone_delim as the subject.
127-
if ( function_exists( 'iconv_mime_decode' ) ) {
128-
$subject = iconv_mime_decode( $subject, 2, get_option( 'blog_charset' ) );
129-
} else {
130-
$subject = wp_iso_descrambler( $subject );
131-
}
127+
$subject = wp_decode_rfc2047( $subject );
132128
$subject = explode( $phone_delim, $subject );
133129
$subject = $subject[0];
134130
}

src/wp-settings.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@
111111
// Load early WordPress files.
112112
require ABSPATH . WPINC . '/class-wp-list-util.php';
113113
require ABSPATH . WPINC . '/class-wp-token-map.php';
114+
require ABSPATH . WPINC . '/rfc2047-mime.php';
114115
require ABSPATH . WPINC . '/utf8.php';
115116
require ABSPATH . WPINC . '/formatting.php';
116117
require ABSPATH . WPINC . '/meta.php';

0 commit comments

Comments
 (0)