Skip to content

Commit 399411b

Browse files
committed
HTML API: Replace PCRE in set_attribute() with new UTF-8 utility.
The HTML API has relied upon a single PCRE to determine whether to allow setting certain attribute names. This was because those names aren’t allowed to contain Unicode noncharacters, but detecting noncharacters without a UTF-8 parser is nontrivial. In this change the direct PCRE has been replaced with a number of `strcpn()` calls and a call to the newer `wp_has_noncharacters()` function. Under the hood, this function will still defer to a PCRE if Unicode support is available, but otherwise will fall back to the UTF-8 pipeline in Core. This change removes the platform variability, making the HTML API more reliable when Unicode support for PCRE is lacking. Developed in #9798 Discussed in https://core.trac.wordpress.org/ticket/63863 See #63863. git-svn-id: https://develop.svn.wordpress.org/trunk@61003 602fd350-edb4-49c9-b593-d223f7449a82
1 parent 5347c69 commit 399411b

File tree

2 files changed

+82
-29
lines changed

2 files changed

+82
-29
lines changed

src/wp-includes/html-api/class-wp-html-tag-processor.php

Lines changed: 20 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -3930,41 +3930,32 @@ public function set_attribute( $name, $value ): bool {
39303930
return false;
39313931
}
39323932

3933-
/*
3933+
$name_length = strlen( $name );
3934+
3935+
/**
39343936
* WordPress rejects more characters than are strictly forbidden
39353937
* in HTML5. This is to prevent additional security risks deeper
3936-
* in the WordPress and plugin stack. Specifically the
3937-
* less-than (<) greater-than (>) and ampersand (&) aren't allowed.
3938+
* in the WordPress and plugin stack. Specifically the following
3939+
* are not allowed to be set as part of an HTML attribute name:
39383940
*
3939-
* The use of a PCRE match enables looking for specific Unicode
3940-
* code points without writing a UTF-8 decoder. Whereas scanning
3941-
* for one-byte characters is trivial (with `strcspn`), scanning
3942-
* for the longer byte sequences would be more complicated. Given
3943-
* that this shouldn't be in the hot path for execution, it's a
3944-
* reasonable compromise in efficiency without introducing a
3945-
* noticeable impact on the overall system.
3941+
* - greater-than “>”
3942+
* - ampersand “&”
39463943
*
39473944
* @see https://html.spec.whatwg.org/#attributes-2
3948-
*
3949-
* @todo As the only regex pattern maybe we should take it out?
3950-
* Are Unicode patterns available broadly in Core?
39513945
*/
3952-
if ( preg_match(
3953-
'~[' .
3954-
// Syntax-like characters.
3955-
'"\'>&</ =' .
3956-
// Control characters.
3957-
'\x{00}-\x{1F}' .
3958-
// HTML noncharacters.
3959-
'\x{FDD0}-\x{FDEF}' .
3960-
'\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}' .
3961-
'\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}' .
3962-
'\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}' .
3963-
'\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}' .
3964-
'\x{10FFFE}\x{10FFFF}' .
3965-
']~Ssu',
3966-
$name
3967-
) ) {
3946+
if (
3947+
0 === $name_length ||
3948+
// Syntax-like characters.
3949+
strcspn( $name, '"\'>&</ =' ) !== $name_length ||
3950+
// Control characters.
3951+
strcspn(
3952+
$name,
3953+
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F" .
3954+
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F"
3955+
) !== $name_length ||
3956+
// Unicode noncharacters.
3957+
wp_has_noncharacters( $name )
3958+
) {
39683959
_doing_it_wrong(
39693960
__METHOD__,
39703961
__( 'Invalid attribute name.' ),

tests/phpunit/tests/html-api/wpHtmlTagProcessor.php

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,68 @@ public function test_set_attribute_is_case_insensitive() {
311311
$this->assertSame( '<div data-enabled="abc">Test</div>', $processor->get_updated_html(), 'A case-insensitive set_attribute call did not update the existing attribute' );
312312
}
313313

314+
/**
315+
* Ensures that set_attribute doesn’t allow setting an
316+
* attribute with an invalid name and thus break syntax.
317+
*
318+
* @ticket 63863
319+
*
320+
* @expectedIncorrectUsage WP_HTML_Tag_Processor::set_attribute
321+
*
322+
* @dataProvider data_invalid_attribute_names
323+
*
324+
* @param string $invalid_name Invalid attribute name.
325+
*/
326+
public function test_set_attribute_rejects_invalid_names( $invalid_name ) {
327+
$processor = new WP_HTML_Tag_Processor( '<div>' );
328+
$processor->next_tag();
329+
330+
$this->assertFalse(
331+
$processor->set_attribute( $invalid_name, true ),
332+
'Should have rejected invalid attribute name.'
333+
);
334+
}
335+
336+
/**
337+
* Data provider.
338+
*
339+
* @return array[]
340+
*/
341+
public static function data_invalid_attribute_names() {
342+
$invalid_names = array(
343+
'Empty' => array( '' ),
344+
);
345+
346+
// Syntax-like characters.
347+
foreach ( str_split( '"\'>&</ =' ) as $c ) {
348+
$invalid_names[ $c ] = array( "too{$c}late" );
349+
}
350+
351+
// C0 controls.
352+
for ( $i = 0; $i <= 0x1F; $i++ ) {
353+
$c = chr( $i );
354+
$invalid_names[ "C0 Controls: {$i}" ] = array( "shut{$c}down" );
355+
}
356+
357+
// Noncharacters.
358+
for ( $i = 0xFDD0; $i <= 0xFDEF; $i++ ) {
359+
$h = dechex( $i );
360+
$c = mb_chr( $i );
361+
$invalid_names[ "Noncharacter: U+{$h}" ] = array( "shut{$c}down" );
362+
}
363+
364+
for ( $b = 0; $b <= 16; $b++ ) {
365+
for ( $x = 0xFFFE; $x <= 0xFFFF; $x++ ) {
366+
$i = ( $b << 16 ) + $x;
367+
$h = dechex( $i );
368+
$c = mb_chr( $i );
369+
$invalid_names[ "Noncharacter: U+{$h}" ] = array( "shut{$c}down" );
370+
}
371+
}
372+
373+
return $invalid_names;
374+
}
375+
314376
/**
315377
* @ticket 56299
316378
*

0 commit comments

Comments
 (0)