11<?php
22
33/**
4- * Core class for working with MIME type strings.
4+ * Core class for sniffing MIME types from various sources in standardized and secure ways.
5+ *
6+ * This class exists to harmonize content-type detection between the server and browsers.
7+ * See the following introduction from the WHATWG MIME Sniffing specification:
8+ *
9+ * > The HTTP Content-Type header field is intended to indicate the MIME type of an HTTP response.
10+ * > However, many HTTP servers supply a Content-Type header field value that does not match the
11+ * > actual contents of the response. Historically, web browsers have tolerated these servers by
12+ * > examining the content of HTTP responses in addition to the Content-Type header field in order
13+ * > to determine the effective MIME type of the response.
14+ * >
15+ * > Without a clear specification for how to "sniff" the MIME type, each user agent has been
16+ * > forced to reverse-engineer the algorithms of other user agents in order to maintain
17+ * > interoperability. Inevitably, these efforts have not been entirely successful, resulting
18+ * > in divergent behaviors among user agents. In some cases, these divergent behaviors have
19+ * > had security implications, as a user agent could interpret an HTTP response as a different
20+ * > MIME type than the server intended.
21+ * >
22+ * > These security issues are most severe when an "honest" server allows potentially malicious
23+ * > users to upload their own files and then serves the contents of those files with a low-privilege
24+ * > MIME type. For example, if a server believes that the client will treat a contributed file as an
25+ * > image (and thus treat it as benign), but a user agent believes the content to be HTML (and thus
26+ * > privileged to execute any scripts contained therein), an attacker might be able to steal the
27+ * > user’s authentication credentials and mount other cross-site scripting attacks. (Malicious
28+ * > servers, of course, can specify an arbitrary MIME type in the Content-Type header field.)
29+ * >
30+ * > This document describes a content sniffing algorithm that carefully balances the compatibility
31+ * > needs of user agent with the security constraints imposed by existing web content. The algorithm
32+ * > originated from research conducted by Adam Barth, Juan Caballero, and Dawn Song, based on content
33+ * > sniffing algorithms present in popular user agents, an extensive database of existing web content,
34+ * > and metrics collected from implementations deployed to a sizable number of users.
35+ * >
36+ * > - https://mimesniff.spec.whatwg.org/#introduction
37+ *
38+ * Some MIME types are inferred from string sources, such as HTTP headers and HTML meta values. These
39+ * are usually intentional declarations of a MIME type, and while not always accurate, they are meant
40+ * to explicitly convey content types.
541 *
642 * Example:
743 *
8- * $mime_type = WP_Mime_Type::from_string ( $headers['content-type'] );
44+ * $mime_type = WP_Mime_Sniffer::from_declaration ( $headers['content-type'] );
945 * if ( isset( $mime_type ) && $mime_type->is_json() ) {
1046 * echo '<script type="application/json">';
1147 * }
1248 *
13- * $mime_type = WP_Mime_Type::from_string ( 'text/HTML ; charset=utf8' );
49+ * $mime_type = WP_Mime_Sniffer::from_declaration ( 'text/HTML ; charset=utf8' );
1450 * 'text/html;charset=utf8' === $mime_type->serialize();
1551 *
16- * @todo Many of these feel like they would be better as static methods taking strings, but also there is
17- * some non-trivial parsing involved, so those static methods should still parse unless we duplicate
18- * all of the parsing logic.
19- * @todo Should the `supplied_type` be stored for further raw analysis?
52+ * In other cases the MIME types are inferred from _binary_ data, which may pose a higher security
53+ * risk due to the complexity of binary decoders. While strings and binary data in PHP are both
54+ * stored identically in a `string` type, the binary sniffing expects non-human-readable inputs,
55+ * like media files or archives, and operates on the data from inside the file.
56+ *
57+ * Example:
58+ *
59+ * $mime_type = WP_Mime_Sniffer::from_binary_file_contents( $uploaded_file_data );
60+ *
61+ * It is not necessary to read the file contents before sniffing the MIME type, however. It may
62+ * be preferable to pass a file path, in which case this class will only read as many bytes as
63+ * are necessary to perform the sniff. This can prevent out-of-memory crashing when working with
64+ * large files, such as video content.
65+ *
66+ * Example:
67+ *
68+ * $mime_type = WP_Mime_Sniffer::from_file( $tempfile );
2069 *
2170 * @see https://mimesniff.spec.whatwg.org/
2271 * @see https://www.rfc-editor.org/rfc/rfc2045#section-5.1
2372 * @see https://www.rfc-editor.org/rfc/rfc9110#name-media-type
2473 * @see https://www.iana.org/assignments/media-types/media-types.xhtml
2574 *
26- * @since {WP_VERSION}
75+ * @since 7.0.0
2776 */
28- class WP_Mime_Type {
29- const BINARY_BYTES = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1C\x1D\x1E\x1F" ;
30-
31- const TOKEN_CODE_POINTS = "!#$%&'*+-.0123456789^_`ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz|~ " ;
32-
33- /*
34- * > An HTTP quoted-string token code point is U+0009 TAB, a code point in the range U+0020 SPACE to U+007E (~),
35- * > inclusive, or a code point in the range U+0080 through U+00FF (ÿ), inclusive.
36- *
37- * This list includes the inverse set of the above.
38- *
39- * @since {WP_VERSION}
40- */
41- const QUOTED_STRING_FORBIDDEN = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F" ;
42-
77+ class WP_Mime_Sniffer {
4378 /**
44- * @since {WP_VERSION}
79+ * @since 7.0.0
4580 *
4681 * @var string
4782 */
4883 protected $ type ;
4984
5085 /**
51- * @since {WP_VERSION}
86+ * @since 7.0.0
5287 *
5388 * @var string
5489 */
5590 protected $ subtype ;
5691
5792 /**
58- * @since {WP_VERSION}
93+ * @since 7.0.0
5994 *
6095 * @var Array<string, string>
6196 */
6297 protected $ parameters = array ();
6398
99+ /**
100+ * @since 7.0.0
101+ *
102+ * @var string|null
103+ */
104+ private static $ last_error = null ;
105+
64106 /**
65107 * @todo Not sure what to do with this, or if it’s necessary. It could be masquerading HTML content types.
66108 *
@@ -69,7 +111,7 @@ class WP_Mime_Type {
69111 *
70112 * @see https://web.archive.org/web/20250511162054/https://bz.apache.org/bugzilla/show_bug.cgi?id=13986
71113 *
72- * @since {WP_VERSION}
114+ * @since 7.0.0
73115 *
74116 * @var bool
75117 */
@@ -86,11 +128,28 @@ public function __construct( string $type, string $subtype, ?array $parameters =
86128 }
87129
88130 /**
89- * @todo Rename to `WP_Mime_Type::sniff()`?
131+ * Parses a supplied MIME type declaration, if valid, otherwise returns `null`.
132+ *
133+ * Example:
134+ *
135+ * $mime_type = WP_Mime_Sniffer::from_declaration( 'text/html; charset=utf8' );
136+ * true === $mime_type->is_html();
137+ * 'utf8' === $mime_type->indicated_charset();
138+ *
139+ * null === WP_Mime_Sniffer::from_declaration( 'html' );
140+ *
141+ * $mime_type = WP_Mime_Sniffer::from_declaration( 'text/json' );
142+ * true === $mime_type->is_json();
143+ * 'application/json' === $mime_type->essence();
144+ *
145+ * @since 7.0.0
146+ *
147+ * @param string $supplied_type Provided text of MIME type, e.g. from HTTP "Content-Type" header.
148+ * @return self|null MIME type instance if valid, otherwise `null`.
90149 */
91- public static function from_string ( string $ supplied_type ): ?self {
150+ public static function from_declaration ( string $ supplied_type ): ?self {
92151 // 1. Remove any leading and trailing HTTP whitespace from input.
93- $ input = trim ( $ supplied_type , " \t\r\n" );
152+ $ input = trim ( $ supplied_type , self :: HTTP_WHITESPACE );
94153
95154 // 2. Let position be a position variable for input, initially pointing at the start of input.
96155 $ position = 0 ;
@@ -120,7 +179,7 @@ public static function from_string( string $supplied_type ): ?self {
120179
121180 // 8. Remove any trailing HTTP whitespace from subtype.
122181 $ subtype = substr ( $ input , $ subtype_start , $ subtype_length );
123- $ subtype = rtrim ( $ subtype , " \t\r\n" );
182+ $ subtype = rtrim ( $ subtype , self :: HTTP_WHITESPACE );
124183
125184 // 9. If subtype is the empty string or does not solely contain HTTP token code points, then return failure.
126185 if ( '' === $ subtype || strspn ( $ subtype , self ::TOKEN_CODE_POINTS ) !== strlen ( $ subtype ) ) {
@@ -137,7 +196,7 @@ public static function from_string( string $supplied_type ): ?self {
137196 ++$ position ;
138197
139198 // 2. Collect a sequence of code points that are HTTP whitespace from input given position.
140- $ position += strspn ( $ input , " \t\r\n" , $ position );
199+ $ position += strspn ( $ input , self :: HTTP_WHITESPACE , $ position );
141200
142201 // 3. Let parameterName be the result of collecting a sequence of code points that are not U+003B (;) or U+003D (=) from input, given position.
143202 $ parameter_start = $ position ;
@@ -187,7 +246,7 @@ public static function from_string( string $supplied_type ): ?self {
187246 $ position = $ value_start + $ value_length ;
188247
189248 // 2. Remove any trailing HTTP whitespace from parameterValue.
190- $ value = rtrim ( substr ( $ input , $ value_start , $ value_length ), " \t\r\n" );
249+ $ value = rtrim ( substr ( $ input , $ value_start , $ value_length ), self :: HTTP_WHITESPACE );
191250
192251 // 3. If parameterValue is the empty string, then continue.
193252 if ( '' === $ value ) {
@@ -223,9 +282,36 @@ public static function from_string( string $supplied_type ): ?self {
223282 return $ self ;
224283 }
225284
226- public static function from_binary ( string $ resource_header ): ?self {
227- if ( strlen ( $ resource_header ) === 1455 ) {
228- $ resource_header = substr ( $ resource_header , 0 , 1455 );
/**
 * Sniffs a MIME type from the leading bytes of a file on disk, if possible, otherwise returns `null`.
 *
 * At most 1445 bytes are read from the file, so large files are never loaded
 * fully into memory. When the file cannot be opened or read, a short message
 * is stored in the class's static last-error property and `null` is returned.
 *
 * Example:
 *
 *     $mime_type = WP_Mime_Sniffer::from_file( $tempfile );
 *
 * @since 7.0.0
 *
 * @param string $file_path Path to the file, with or without a leading `file://` scheme.
 * @return self|null MIME type instance if detected, otherwise `null`.
 */
public static function from_file( string $file_path ): ?self {
	/*
	 * Paths without a `file://` prefix get one prepended before opening.
	 *
	 * NOTE(review): presumably this pins the open to the local filesystem
	 * wrapper so that inputs like `phar://` or `http://` are not honored;
	 * confirm how relative paths are expected to behave with this prefix.
	 */
	$is_file_scheme = 0 === substr_compare( $file_path, 'file://', 0, 7, false );
	$filename       = $is_file_scheme ? $file_path : "file://{$file_path}";

	$handle = fopen( $filename, 'rb' );
	if ( false === $handle ) {
		self::$last_error = 'File not found.';
		return null;
	}

	$resource_header = fread( $handle, 1445 );

	// Release the handle on every path; only the bytes already read are needed from here on.
	fclose( $handle );

	if ( false === $resource_header ) {
		self::$last_error = 'Could not read file.';
		return null;
	}

	return self::from_binary_file_contents( $resource_header );
}
303+
304+ /**
305+ * Sniffs a MIME type from the contents of a binary file, if possible, otherwise returns `null`.
306+ *
307+ * @since 7.0.0
308+ *
309+ * @param string $resource_header Contents of a file, of which only a maximum of 1445 bytes will be analyzed.
310+ * @return self|null MIME type instance if detected, otherwise `null`.
311+ */
312+ public static function from_binary_file_contents ( string $ resource_header ): ?self {
313+ if ( strlen ( $ resource_header ) > 1445 ) {
314+ $ resource_header = substr ( $ resource_header , 0 , 1445 );
229315 }
230316
231317 $ length = strlen ( $ resource_header );
@@ -619,13 +705,13 @@ public function is_zip(): bool {
619705 /**
620706 * Returns a parsed MIME media type if the given string represents a JavaScript media type.
621707 *
622- * @since {WP_VERSION}
708+ * @since 7.0.0
623709 *
624710 * @param string $supplied_type
625711 * @return self|null
626712 */
627713 public static function sniff_javascript ( string $ supplied_type ): ?self {
628- $ mime_type = self ::from_string ( $ supplied_type );
714+ $ mime_type = self ::from_declaration ( $ supplied_type );
629715
630716 return isset ( $ mime_type ) && $ mime_type ->is_javascript ()
631717 ? $ mime_type
@@ -635,16 +721,53 @@ public static function sniff_javascript( string $supplied_type ): ?self {
/**
 * Parses the given declaration and returns it only when it names a JSON media type.
 *
 * @since 7.0.0
 *
 * @param string $supplied_type Text of a MIME type declaration to parse.
 * @return self|null Parsed MIME type when it is a JSON type, otherwise `null`.
 */
public static function sniff_json( string $supplied_type ): ?self {
	$parsed = self::from_declaration( $supplied_type );

	// Reject both unparseable declarations and valid non-JSON types.
	if ( null === $parsed || ! $parsed->is_json() ) {
		return null;
	}

	return $parsed;
}
736+
737+ /**
738+ * > A binary data byte is a byte in the range 0x00 to 0x08 (NUL to BS), the byte 0x0B (VT),
739+ * > a byte in the range 0x0E to 0x1A (SO to SUB), or a byte in the range 0x1C to 0x1F (FS to US).
740+ *
741+ * @since 7.0.0
742+ */
743+ const BINARY_BYTES = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1C\x1D\x1E\x1F" ;
744+
/**
 * Bytes treated as HTTP whitespace when trimming and scanning MIME type declarations.
 *
 * > HTTP whitespace is U+000A LF, U+000D CR, or an HTTP tab or space.
 * > An HTTP tab or space is U+0009 TAB or U+0020 SPACE.
 *
 * U+000C FORM FEED is deliberately absent: it is ASCII whitespace
 * but it is not HTTP whitespace, so it must not be trimmed or skipped.
 *
 * @see https://fetch.spec.whatwg.org/#http-whitespace
 *
 * @since 7.0.0
 */
const HTTP_WHITESPACE = " \t\r\n";
754+
755+ /**
756+ * > An HTTP quoted-string token code point is U+0009 TAB, a code point in the range U+0020 SPACE to U+007E (~),
757+ * > inclusive, or a code point in the range U+0080 through U+00FF (ÿ), inclusive.
758+ *
759+ * This is the inverse set of the above code points for ease of use as a shorter string.
760+ *
761+ * @since 7.0.0
762+ */
763+ const QUOTED_STRING_FORBIDDEN = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F" ;
764+
765+ /**
766+ * > An HTTP token code point is U+0021 (!), U+0023 (#), U+0024 ($), U+0025 (%), U+0026 (&), U+0027 ('),
767+ * > U+002A (*), U+002B (+), U+002D (-), U+002E (.), U+005E (^), U+005F (_), U+0060 (`), U+007C (|),
768+ * > U+007E (~), or an ASCII alphanumeric.
769+ *
770+ * @since 7.0.0
771+ */
772+ const TOKEN_CODE_POINTS = "!#$%&'*+-.0123456789^_`ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz|~ " ;
650773}
0 commit comments