Skip to content

Commit 75854fd

Browse files
dmsnell and sirreal committed
MIME: Introduce MIME type parser (#10640)
Various applications require making decisions based on a MIME type from an HTTP header, such as is found in the “Content-Type” header. This patch introduces a new utility class for parsing MIME media types compliant with the WHATWG MIME Sniffing algorithm. Co-authored-by: Jon Surrell <[email protected]>
1 parent 2ae6561 commit 75854fd

File tree

1 file changed

+366
-0
lines changed

1 file changed

+366
-0
lines changed
Lines changed: 366 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,366 @@
1+
<?php
2+
3+
/**
 * Core class for working with MIME type strings.
 *
 * Parses a MIME type string (for example, the value of a `Content-Type`
 * header) following the WHATWG MIME Sniffing "parse a MIME type" algorithm,
 * and exposes helpers for classifying and re-serializing the parsed value.
 *
 * Example:
 *
 *     $mime_type = WP_Mime_Type::from_string( $headers['content-type'] );
 *     if ( isset( $mime_type ) && $mime_type->is_json() ) {
 *         echo '<script type="application/json">';
 *     }
 *
 *     $mime_type = WP_Mime_Type::from_string( 'text/HTML ; charset=utf8' );
 *     'text/html;charset=utf8' === $mime_type->serialize();
 *
 * @todo Many of these feel like they would be better as static methods taking strings, but also there is
 *       some non-trivial parsing involved, so those static methods should still parse unless we duplicate
 *       all of the parsing logic.
 * @todo Should the `supplied_type` be stored for further raw analysis?
 *
 * @see https://mimesniff.spec.whatwg.org/
 * @see https://www.rfc-editor.org/rfc/rfc2045#section-5.1
 * @see https://www.rfc-editor.org/rfc/rfc9110#name-media-type
 * @see https://www.iana.org/assignments/media-types/media-types.xhtml
 *
 * @since {WP_VERSION}
 */
class WP_Mime_Type {
	/**
	 * Every byte which counts as an HTTP token code point.
	 *
	 * Used with `strspn()` to verify that a string consists solely of token code points.
	 *
	 * @since {WP_VERSION}
	 */
	const TOKEN_CODE_POINTS = "!#$%&'*+-.0123456789^_`ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz|~";

	/*
	 * > An HTTP quoted-string token code point is U+0009 TAB, a code point in the range U+0020 SPACE to U+007E (~),
	 * > inclusive, or a code point in the range U+0080 through U+00FF (ÿ), inclusive.
	 *
	 * This list is the inverse of the above set within the single-byte range: every byte
	 * listed here is forbidden inside a parameter value. It is used with `strcspn()`.
	 *
	 * @since {WP_VERSION}
	 */
	const QUOTED_STRING_FORBIDDEN = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F";

	/**
	 * Lower-cased type, e.g. "text" in "text/html".
	 *
	 * @since {WP_VERSION}
	 *
	 * @var string
	 */
	protected $type;

	/**
	 * Lower-cased subtype, e.g. "html" in "text/html".
	 *
	 * @since {WP_VERSION}
	 *
	 * @var string
	 */
	protected $subtype;

	/**
	 * Parsed parameters, keyed by lower-cased parameter name, in the order they were first seen.
	 *
	 * @since {WP_VERSION}
	 *
	 * @var array<string, string>
	 */
	protected $parameters = array();

	/**
	 * Whether the supplied string exactly matched one of the values emitted by a known Apache bug.
	 *
	 * @todo Not sure what to do with this, or if it’s necessary. It could be masquerading HTML content types.
	 *
	 * > some older installations of Apache contain a bug that causes them to supply one
	 * > of these Content-Type headers when serving files with unrecognized MIME types
	 *
	 * @see https://web.archive.org/web/20250511162054/https://bz.apache.org/bugzilla/show_bug.cgi?id=13986
	 *
	 * @since {WP_VERSION}
	 *
	 * @var bool
	 */
	protected $detected_apache_bug = false;

	/**
	 * Parses a MIME type string into a WP_Mime_Type instance.
	 *
	 * The numbered comments in the body follow the steps of the
	 * WHATWG "parse a MIME type" algorithm.
	 *
	 * @todo Rename to `WP_Mime_Type::sniff()`?
	 *
	 * @since {WP_VERSION}
	 *
	 * @param string $supplied_type Raw MIME type string, e.g. from a "Content-Type" header.
	 * @return self|null Parsed MIME type, or `null` if the type or subtype is missing or invalid.
	 */
	public static function from_string( string $supplied_type ): ?self {
		// 1. Remove any leading and trailing HTTP whitespace from input.
		$input = trim( $supplied_type, " \t\r\n" );

		// 2. Let position be a position variable for input, initially pointing at the start of input.
		$position = 0;
		$end      = strlen( $input );

		// 3. Let type be the result of collecting a sequence of code points that are not U+002F (/) from input, given position.
		$type_start  = $position;
		$type_length = strcspn( $input, '/', $type_start );
		$type        = substr( $input, $type_start, $type_length );

		// 4. If type is the empty string or does not solely contain HTTP token code points, then return failure.
		// 5. If position is past the end of the input, then return failure.
		if (
			'' === $type ||
			( $position + $type_length >= $end ) ||
			strspn( $type, self::TOKEN_CODE_POINTS ) !== strlen( $type )
		) {
			return null;
		}

		// 6. Advance position by 1. (This skips past U+002F (/).)
		$position = $type_start + $type_length + 1;

		// 7. Let subtype be the result of collecting a sequence of code points that are not U+003B (;) from input, given position.
		$subtype_start  = $position;
		$subtype_length = strcspn( $input, ';', $subtype_start );

		// 8. Remove any trailing HTTP whitespace from subtype.
		$subtype = substr( $input, $subtype_start, $subtype_length );
		$subtype = rtrim( $subtype, " \t\r\n" );

		// 9. If subtype is the empty string or does not solely contain HTTP token code points, then return failure.
		if ( '' === $subtype || strspn( $subtype, self::TOKEN_CODE_POINTS ) !== strlen( $subtype ) ) {
			return null;
		}

		// 10. Let mimeType be a new MIME type record whose type is type, in ASCII lowercase, and subtype is subtype, in ASCII lowercase.
		$self          = new self();
		$self->type    = strtolower( $type );
		$self->subtype = strtolower( $subtype );

		// 11. While position is not past the end of input:
		$position = $subtype_start + $subtype_length;
		while ( $position < $end ) {
			// 1. Advance position by 1. (This skips past U+003B (;).)
			++$position;

			// 2. Collect a sequence of code points that are HTTP whitespace from input given position.
			$position += strspn( $input, " \t\r\n", $position );

			// 3. Let parameterName be the result of collecting a sequence of code points that are not U+003B (;) or U+003D (=) from input, given position.
			$parameter_start  = $position;
			$parameter_length = strcspn( $input, ';=', $parameter_start );

			// 4. Set parameterName to parameterName, in ASCII lowercase.
			$parameter_name = strtolower( substr( $input, $parameter_start, $parameter_length ) );

			// 5. If position is not past the end of input, then:
			$position = $parameter_start + $parameter_length;
			if ( $position < $end ) {
				// 1. If the code point at position within input is U+003B (;), then continue.
				if ( ';' === $input[ $position ] ) {
					continue;
				}

				// 2. Advance position by 1. (This skips past U+003D (=).)
				++$position;
			}

			// 6. If position is past the end of input, then break.
			if ( $position >= $end ) {
				break;
			}

			/*
			 * 7. Let parameterValue be null.
			 * 8. If the code point at position within input is U+0022 ("), then:
			 */
			if ( '"' === $input[ $position ] ) {
				// 1. Set parameterValue to the result of collecting an HTTP quoted string from input, given position and true.
				list( $value, $position ) = self::collect_http_quoted_string( $input, $position, $end );

				/*
				 * 2. Collect a sequence of code points that are not U+003B (;) from input, given position.
				 *
				 * This must be numeric addition: anything between the closing quote and
				 * the next semicolon is discarded, and the loop resumes at that semicolon.
				 */
				$position += strcspn( $input, ';', $position );
			} else { // 9. Otherwise:
				// 1. Set parameterValue to the result of collecting a sequence of code points that are not U+003B (;) from input, given position.
				$value_start  = $position;
				$value_length = strcspn( $input, ';', $value_start );
				$position     = $value_start + $value_length;

				// 2. Remove any trailing HTTP whitespace from parameterValue.
				$value = rtrim( substr( $input, $value_start, $value_length ), " \t\r\n" );

				// 3. If parameterValue is the empty string, then continue.
				if ( '' === $value ) {
					continue;
				}
			}

			// 10. If all of the following are true…then set mimeType’s parameters[parameterName] to parameterValue.
			if (
				'' !== $parameter_name &&
				strspn( $parameter_name, self::TOKEN_CODE_POINTS ) === strlen( $parameter_name ) &&
				strcspn( $value, self::QUOTED_STRING_FORBIDDEN ) === strlen( $value ) &&
				! isset( $self->parameters[ $parameter_name ] )
			) {
				$self->parameters[ $parameter_name ] = $value;
			}
		}

		// Only the exact, untrimmed strings below are treated as the Apache bug signature.
		$self->detected_apache_bug = (
			'text' === $type &&
			in_array(
				$supplied_type,
				array(
					'text/plain',
					'text/plain; charset=ISO-8859-1',
					'text/plain; charset=iso-8859-1',
					'text/plain; charset=UTF-8',
				),
				true
			)
		);

		return $self;
	}

	/**
	 * Collects an HTTP quoted string, returning its unescaped value.
	 *
	 * Follows the "collect an HTTP quoted string" algorithm with the extract-value
	 * flag set: a backslash escapes the byte which follows it, an unescaped quote
	 * ends the string, and a trailing lone backslash is kept literally.
	 *
	 * @since {WP_VERSION}
	 *
	 * @param string $input    Full input being parsed.
	 * @param int    $position Byte offset of the opening U+0022 (") in `$input`.
	 * @param int    $end      Byte length of `$input`.
	 * @return array {
	 *     @type string $0 Unescaped value of the quoted string.
	 *     @type int    $1 Byte offset just past the closing quote, or `$end` if unterminated.
	 * }
	 */
	private static function collect_http_quoted_string( string $input, int $position, int $end ): array {
		// Skip the opening quote.
		++$position;
		$value = '';

		while ( true ) {
			// Copy everything up to the next quote or backslash verbatim.
			$run_length = strcspn( $input, "\"\x5C", $position );
			if ( $run_length > 0 ) {
				$value    .= substr( $input, $position, $run_length );
				$position += $run_length;
			}

			// Unterminated quoted string: everything collected so far is the value.
			if ( $position >= $end ) {
				break;
			}

			$quote_or_backslash = $input[ $position ];
			++$position;

			// An unescaped quote terminates the string.
			if ( '"' === $quote_or_backslash ) {
				break;
			}

			// A backslash at the very end of input is preserved literally.
			if ( $position >= $end ) {
				$value .= "\x5C";
				break;
			}

			/*
			 * Otherwise the backslash escapes the following byte. Appending a single
			 * byte is safe for multi-byte UTF-8 sequences since continuation bytes are
			 * never quotes or backslashes and get collected by the next run.
			 */
			$value .= $input[ $position ];
			++$position;
		}

		return array( $value, $position );
	}

	/**
	 * Serializes the MIME type as its essence followed by each parameter.
	 *
	 * Parameter values which are empty or contain anything other than HTTP token
	 * code points are wrapped in double quotes, with any contained quote or
	 * backslash escaped by a preceding backslash.
	 *
	 * @since {WP_VERSION}
	 *
	 * @return string Serialized MIME type, e.g. "text/html;charset=utf8".
	 */
	public function serialize(): string {
		$serialization = $this->essence();

		foreach ( $this->parameters as $name => $value ) {
			$serialized_value = $value;
			if (
				'' === $value ||
				strspn( $value, self::TOKEN_CODE_POINTS ) !== strlen( $value )
			) {
				$serialized_value = strtr(
					$serialized_value,
					array(
						'"'    => '\"',
						"\x5C" => "\x5C\x5C",
					)
				);
				$serialized_value = "\"{$serialized_value}\"";
			}

			$serialization .= ";{$name}={$serialized_value}";
		}

		return $serialization;
	}

	/**
	 * Returns the essence of the MIME type: "type/subtype" without parameters.
	 *
	 * @since {WP_VERSION}
	 *
	 * @return string Essence, e.g. "text/html".
	 */
	public function essence(): string {
		return "{$this->type}/{$this->subtype}";
	}

	/**
	 * Returns the value of the "charset" parameter, if one was supplied.
	 *
	 * The value is returned as parsed; it is not validated or normalized.
	 *
	 * @since {WP_VERSION}
	 *
	 * @return string|null Charset parameter value, or `null` if absent.
	 */
	public function indicated_charset(): ?string {
		return $this->parameters['charset'] ?? null;
	}

	/**
	 * Indicates whether this is an archive MIME type.
	 *
	 * @since {WP_VERSION}
	 *
	 * @return bool Whether the essence is one of the recognized "application/*" archive types.
	 */
	public function is_archive(): bool {
		return (
			'application' === $this->type &&
			in_array(
				$this->subtype,
				array(
					'x-rar-compressed',
					'zip',
					'x-gzip',
				),
				true
			)
		);
	}

	/**
	 * Indicates whether this is a font MIME type.
	 *
	 * Matches any "font/*" type as well as the legacy "application/*" font types.
	 * "font-otf" is retained alongside "font-off" for backward compatibility.
	 *
	 * @since {WP_VERSION}
	 *
	 * @return bool Whether this is a font MIME type.
	 */
	public function is_font(): bool {
		if ( 'font' === $this->type ) {
			return true;
		}

		return (
			'application' === $this->type &&
			in_array(
				$this->subtype,
				array(
					'font-cff',
					'font-off',
					'font-otf',
					'font-sfnt',
					'font-ttf',
					'font-woff',
					'vnd.ms-fontobject',
					'vnd.ms-opentype',
				),
				true
			)
		);
	}

	/**
	 * Indicates whether this is the HTML MIME type.
	 *
	 * @since {WP_VERSION}
	 *
	 * @return bool Whether the essence is "text/html".
	 */
	public function is_html(): bool {
		return 'text' === $this->type && 'html' === $this->subtype;
	}

	/**
	 * Indicates whether this is an image MIME type.
	 *
	 * @since {WP_VERSION}
	 *
	 * @return bool Whether the type is "image".
	 */
	public function is_image(): bool {
		return 'image' === $this->type;
	}

	/**
	 * Indicates whether this is a JavaScript MIME type.
	 *
	 * @since {WP_VERSION}
	 *
	 * @return bool Whether the essence is one of the recognized JavaScript types.
	 */
	public function is_javascript(): bool {
		if ( 'application' === $this->type ) {
			return in_array(
				$this->subtype,
				array(
					'ecmascript',
					'javascript',
					'x-ecmascript',
					'x-javascript',
				),
				true
			);
		}

		if ( 'text' === $this->type ) {
			return in_array(
				$this->subtype,
				array(
					'ecmascript',
					'javascript',
					'javascript1.0',
					'javascript1.1',
					'javascript1.2',
					'javascript1.3',
					'javascript1.4',
					'javascript1.5',
					'jscript',
					'livescript',
					'x-ecmascript',
					'x-javascript',
				),
				true
			);
		}

		return false;
	}

	/**
	 * Indicates whether this is a JSON MIME type.
	 *
	 * @since {WP_VERSION}
	 *
	 * @return bool Whether the essence is "application/json" or "text/json",
	 *              or the subtype ends in "+json".
	 */
	public function is_json(): bool {
		return (
			( 'application' === $this->type && 'json' === $this->subtype ) ||
			( 'text' === $this->type && 'json' === $this->subtype ) ||
			str_ends_with( $this->subtype, '+json' )
		);
	}

	/**
	 * Indicates whether this is an audio or video MIME type.
	 *
	 * @since {WP_VERSION}
	 *
	 * @return bool Whether the type is "audio" or "video", or the essence is "application/ogg".
	 */
	public function is_media(): bool {
		return (
			'audio' === $this->type ||
			'video' === $this->type ||
			( 'application' === $this->type && 'ogg' === $this->subtype )
		);
	}

	/**
	 * Indicates whether this is a scriptable MIME type.
	 *
	 * @since {WP_VERSION}
	 *
	 * @return bool Whether this is an XML or HTML MIME type, or the essence is "application/pdf".
	 */
	public function is_scriptable(): bool {
		return (
			$this->is_xml() ||
			$this->is_html() ||
			( 'application' === $this->type && 'pdf' === $this->subtype )
		);
	}

	/**
	 * Indicates whether this is an XML MIME type.
	 *
	 * @since {WP_VERSION}
	 *
	 * @return bool Whether the essence is "text/xml" or "application/xml",
	 *              or the subtype ends in "+xml".
	 */
	public function is_xml(): bool {
		return (
			( 'xml' === $this->subtype && in_array( $this->type, array( 'text', 'application' ), true ) ) ||
			str_ends_with( $this->subtype, '+xml' )
		);
	}

	/**
	 * Indicates whether this is a ZIP-based MIME type.
	 *
	 * @since {WP_VERSION}
	 *
	 * @return bool Whether the essence is "application/zip" or the subtype ends in "+zip".
	 */
	public function is_zip(): bool {
		return (
			( 'application' === $this->type && 'zip' === $this->subtype ) ||
			str_ends_with( $this->subtype, '+zip' )
		);
	}
}

0 commit comments

Comments
 (0)