1+ <?php
2+
3+ /**
4+ * Implements the decoder from RFC 2047:
5+ * MIME Part 3: Message Header Extensions for Non-ASCII Text.
6+ *
7+ * This module contains decoding functions for supported MIME
8+ * encodings as are used with email servers which don’t support
9+ * or haven’t activated UTF-8 support.
10+ *
11+ * @see https://www.rfc-editor.org/rfc/rfc2047
12+ *
13+ * @package WordPress
14+ * @subpackage rfc2047
15+ */
16+
/**
 * Decodes text potentially containing RFC2047 MIME encoded words.
 *
 * Returns the decoded text as UTF-8. Text outside of encoded words is passed
 * through unchanged, except that linear white-space separating an encoded word
 * from the previously-consumed position is dropped.
 *
 * Example:
 *
 *     // Quoted forms have non-printable ASCII encoded as octets.
 *     'this is some text' === wp_decode_rfc2047( '=?iso-8859-1?q?this=20is_some=20text?=' );
 *     '👌' === wp_decode_rfc2047( '=?utf-8?q?=F0=9F=91=8C?=' );
 *
 *     // Binary forms are base64-encoded.
 *     '👌' === wp_decode_rfc2047( '=?utf-8?B?8J+RjA==?=' );
 *     'םולש ןב ילטפנ' === wp_decode_rfc2047( '=?iso-8859-8?b?7eXs+SDv4SDp7Oj08A==?=' );
 *
 *     // Character sets are re-encoded into UTF-8
 *     '500¥' === wp_decode_rfc2047( '=?iso-8859-1?Q?500=A5?=' );
 *     '🏴' === wp_decode_rfc2047( '=?GB-18030?Q?=949=C82=D36=A01=D36=9F6=D36=9F9=D36=A08=D36=A01=D36=A25?=' );
 *
 *     // Linear white-space is collapsed.
 *     'ab c d e' === wp_decode_rfc2047( '=?ASCII?Q?a?= =?ASCII?Q?b?= c d=?ASCII?Q?=20?==?ASCII?Q?e?=' )
 *
 *     // Error-handling is up to the call site.
 *     '=?UTF-8?Q?=6f?=' === wp_decode_rfc2047( '=?UTF-8?Q?=6f?=' );
 *     '=?UTF-8?Q?=6f?=' === wp_decode_rfc2047( '=?UTF-8?Q?=6f?=', 'preserve-errors' );
 *     '�' === wp_decode_rfc2047( '=?UTF-8?Q?=6f?=', 'replace-errors' );
 *     null === wp_decode_rfc2047( '=?UTF-8?Q?=6f?=', 'bail-on-error' );
 *
 *     // Invalid character encodings are errors.
 *     null === wp_decode_rfc2047( '=?UTF-8?Q?=C0?=', 'bail-on-error' );
 *
 * @see https://www.rfc-editor.org/rfc/rfc2047
 *
 * @since {WP_VERSION}
 *
 * @param string                                             $encoded US-ASCII text potentially containing MIME encoded words.
 * @param 'preserve-errors'|'replace-errors'|'bail-on-error' $errors  Optional. How to handle invalid encoded words:
 *                                                                    keep them as plaintext, replace each with U+FFFD,
 *                                                                    or abort decoding. Default 'preserve-errors'.
 * @return string|null Decoded string in UTF-8. `null` if an invalid encoded word was found
 *                     and `$errors` is 'bail-on-error' or is not one of the recognized values.
 */
function wp_decode_rfc2047( $encoded, $errors = 'preserve-errors' ) {
	/*
	 * {@see iconv_mime_decode()} is not used because it does not give control
	 * over error-handling at the granularity necessary for this decoder.
	 */

	/*
	 * > charset = token
	 * > token = 1*<Any CHAR except SPACE, CTLs, and especials>
	 * > especials = "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / <"> / "/" / "[" / "]" / "?" / "." / "="
	 * > CHAR = %00–%7F
	 * > CTL = %00–%1F
	 * > SPACE = %20
	 */
	$token_chars = '!#$%&\'*+-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ\\^_`abcdefghijklmnopqrstuvwxyz{|}~';

	// > encoded-text = 1*<Any printable ASCII character other than "?" or SPACE>
	$text_chars = '!"#$%&\'()*+,-./0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~';

	$decoded = '';
	$end     = strlen( $encoded );
	$at      = 0; // Where to resume scanning for the next "=?".
	$was_at  = 0; // Start of the input which has not yet been copied into $decoded.

	while ( $at < $end ) {
		$encoded_word_at = strpos( $encoded, '=?', $at );
		if ( false === $encoded_word_at ) {
			break;
		}

		$charset_at     = $encoded_word_at + 2;
		$charset_length = strspn( $encoded, $token_chars, $charset_at );
		if ( $charset_length < 1 ) {
			$at = $charset_at;
			continue;
		}

		$after_charset = $charset_at + $charset_length;
		if ( $after_charset >= $end || '?' !== $encoded[ $after_charset ] ) {
			$at = $after_charset;
			continue;
		}

		$encoding_at     = $after_charset + 1;
		$encoding_length = strspn( $encoded, $token_chars, $encoding_at );
		if ( $encoding_length < 1 ) {
			$at = $encoding_at;
			continue;
		}

		$after_encoding = $encoding_at + $encoding_length;
		if ( $after_encoding >= $end || '?' !== $encoded[ $after_encoding ] ) {
			$at = $after_encoding;
			continue;
		}

		$chunk_at     = $after_encoding + 1;
		$chunk_length = strspn( $encoded, $text_chars, $chunk_at );
		if ( $chunk_length < 1 ) {
			$at = $chunk_at;
			continue;
		}

		/*
		 * Both bytes of the closing "?=" must exist inside the input;
		 * checking the second byte's position first avoids reading
		 * past the end of the string when "?" is the final byte.
		 */
		$closer_at = $chunk_at + $chunk_length;
		if ( $closer_at + 1 >= $end || '?' !== $encoded[ $closer_at ] || '=' !== $encoded[ $closer_at + 1 ] ) {
			$at = $closer_at;
			continue;
		}

		$after_encoded_word = $closer_at + 2;

		/*
		 * RFC2047 says the total length MUST be no more than 75 characters,
		 * but doesn’t indicate resolution when the length is greater than this.
		 *
		 *  - Should this be treated as unencoded text?
		 *  - Should this be corrupted and rejected?
		 *  - Should this be decoded anyway?
		 *
		 * Given that the intent is to fit encoded-words within a single line of
		 * a header and to ensure parsers need not lookahead too far, this will
		 * be decoded if possible. The failure was in the encoder, not here.
		 */

		/*
		 * > A mail reader need not attempt to display the text associated with an
		 * > 'encoded-word' that is incorrectly formed. However, a mail reader
		 * > MUST NOT prevent the display or handling of a message because an
		 * > 'encoded-word' is incorrectly formed.
		 *
		 * A `null` chunk marks a syntactically-complete encoded word that could
		 * not be decoded; the `$errors` policy below decides what happens to it.
		 */
		$decoded_chunk = null;

		// Only the single-letter "B" and "Q" encodings exist.
		if ( 1 === $encoding_length && 1 === strspn( $encoded, 'bBqQ', $encoding_at, 1 ) ) {
			// Shorten the charset to ignore any RFC2184/RFC2231 language tag.
			$charset_length = strcspn( $encoded, '*', $charset_at, $charset_length );

			/*
			 * Disregard empty and over-long charset names. The upper bound was chosen by
			 * inspecting the names returned by {@see mb_list_encodings()} and
			 * {@see mb_encoding_aliases()}.
			 *
			 * The goal is to pragmatically balance supporting all possible charsets and
			 * over-eagerly allocating strings, only to disregard them immediately.
			 */
			if ( $charset_length >= 1 && $charset_length <= 32 ) {
				$decoded_chunk = _wp_decode_rfc2047_encoded_word(
					substr( $encoded, $charset_at, $charset_length ),
					$encoded[ $encoding_at ],
					substr( $encoded, $chunk_at, $chunk_length )
				);
			}
		}

		if ( null === $decoded_chunk ) {
			switch ( $errors ) {
				case 'bail-on-error':
					return null;

				case 'preserve-errors':
					// Leave $was_at alone so the raw encoded word is copied later as plaintext.
					$at = $after_encoded_word;
					continue 2;

				case 'replace-errors':
					$decoded_chunk = "\u{FFFD}";
					break;

				default:
					_doing_it_wrong(
						__FUNCTION__,
						"Use only one of 'preserve-errors', 'replace-errors', or 'bail-on-error' for error-handling.",
						'{WP_VERSION}'
					);
					return null;
			}
		}

		// Copy the text preceding the encoded word unless it is nothing but linear white-space.
		$prefix_length = $encoded_word_at - $was_at;
		if ( $prefix_length > 0 && ! rfc2047_only_LWS( $encoded, $was_at, $prefix_length ) ) {
			$decoded .= substr( $encoded, $was_at, $prefix_length );
		}

		$decoded .= $decoded_chunk;
		$was_at   = $after_encoded_word;
		$at       = $after_encoded_word;
	}

	// Nothing resembling an encoded word was found: return the input untouched.
	if ( 0 === $at ) {
		return $encoded;
	}

	return $decoded . substr( $encoded, $was_at );
}

/**
 * Decodes the payload of a single RFC2047 encoded word into UTF-8.
 *
 * While converting between character sets, warnings and notices raised by
 * `mb_convert_encoding()` and `iconv()` are turned into failures. The
 * temporary error handler is always removed before returning.
 *
 * @since {WP_VERSION}
 * @access private
 *
 * @param string $charset      Charset name from the encoded word, without any language tag.
 * @param string $encoding     Encoding indicator: one of "b", "B", "q", or "Q".
 * @param string $encoded_text The `encoded-text` portion of the encoded word.
 * @return string|null Decoded UTF-8 text, or `null` if the word cannot be decoded.
 */
function _wp_decode_rfc2047_encoded_word( $charset, $encoding, $encoded_text ) {
	/*
	 * Only UTF-8 and its ASCII subset are supported without conversion mechanisms.
	 *
	 * > If the mail reader does not support the character set used, it may
	 * > (a) display the 'encoded-word' as ordinary text (i.e., as it appears
	 * > in the header), (b) make a "best effort" to display using such
	 * > characters as are available, or (c) substitute an appropriate message
	 * > indicating that the decoded text could not be displayed.
	 */
	$is_utf8_compatible = in_array( strtoupper( $charset ), array( 'ASCII', 'US-ASCII', 'UTF8', 'UTF-8' ), true );
	if ( ! $is_utf8_compatible && ! function_exists( 'mb_convert_encoding' ) && ! function_exists( 'iconv' ) ) {
		return null;
	}

	if ( 'b' === $encoding || 'B' === $encoding ) {
		// Strict mode is required for invalid base64 to be reported instead of silently skipped.
		$bytes = base64_decode( $encoded_text, true );
		if ( false === $bytes ) {
			return null;
		}
	} else {
		$failed_decode = false;

		// Underscores stand for spaces; a literal underscore arrives as "=5F" and is decoded afterwards.
		$bytes = strtr( $encoded_text, '_', ' ' );
		$bytes = preg_replace_callback(
			'/=[0-9A-F]{2}|=/', // Lower-case are not allowed.
			static function ( $matches ) use ( &$failed_decode ) {
				// A lone "=" is one which was not followed by two upper-case hex digits.
				if ( '=' === $matches[0] ) {
					$failed_decode = true;
					return $matches[0];
				}

				return hex2bin( substr( $matches[0], 1, 2 ) );
			},
			$bytes
		);

		// `null` indicates that the regular expression engine itself failed.
		if ( $failed_decode || null === $bytes ) {
			return null;
		}
	}

	// Re-encode into UTF-8.
	if ( ! $is_utf8_compatible ) {
		/*
		 * iconv() reports illegal input sequences with E_NOTICE and unsupported
		 * conversions with E_WARNING, so both levels are intercepted. Messages
		 * from anything else fall through to normal error handling.
		 */
		set_error_handler(
			static function ( $errno, $errstr ) {
				if (
					str_starts_with( $errstr, 'mb_convert_encoding(): ' ) ||
					str_starts_with( $errstr, 'iconv(): ' )
				) {
					throw new Error( $errstr );
				}

				return false;
			},
			E_WARNING | E_NOTICE
		);

		try {
			$bytes = function_exists( 'mb_convert_encoding' )
				? mb_convert_encoding( $bytes, 'UTF-8', $charset )
				: iconv( $charset, 'UTF-8', $bytes );
		} catch ( \Throwable $exception ) {
			return null;
		} finally {
			restore_error_handler();
		}
	}

	// Verify the encoding.
	if ( ! is_string( $bytes ) || ! wp_is_valid_utf8( $bytes ) ) {
		return null;
	}

	return $bytes;
}
298+
/**
 * Reports whether a byte range of a string is made up solely of linear white space.
 *
 * The range must be a non-empty sequence of runs, where each run is one or
 * more SPACE / HTAB characters optionally introduced by a single CRLF.
 * A CRLF which is not followed by SPACE or HTAB does not qualify.
 *
 * @since {WP_VERSION}
 * @access private
 *
 * @param string $string Text containing the range to inspect.
 * @param int    $start  Byte offset at which the range begins.
 * @param int    $length Byte length of the range.
 * @return bool Whether the range is non-empty and contains only linear white space.
 */
function rfc2047_only_LWS( $string, $start, $length ) {
	$limit       = $start + $length;
	$found_space = false;

	for ( $cursor = $start; $cursor < $limit; $cursor += $run_length ) {
		// Each run may be introduced by a single CRLF.
		$starts_with_crlf = (
			$cursor + 1 < $limit &&
			"\r" === $string[ $cursor ] &&
			"\n" === $string[ $cursor + 1 ]
		);
		if ( $starts_with_crlf ) {
			$cursor += 2;
		}

		// A run needs at least one SPACE / HTAB; anything else ends the check.
		$run_length = strspn( $string, " \t", $cursor, $limit - $cursor );
		if ( 0 === $run_length ) {
			return false;
		}

		$found_space = true;
	}

	return $found_space;
}
0 commit comments