Skip to content

Commit 71a52ce

Browse files
committed
HTML API: Add method to create fragment at node.
HTML Fragment parsing always happens with a context node, which may impact how a fragment of HTML is parsed. HTML Fragment Processors can be instantiated with a `BODY` context node via `WP_HTML_Processor::create_fragment( $html )`.

This changeset adds an instance method called `create_fragment_at_current_node( string $html )`. It can only be called when the processor is paused at a `#tag`, with some additional constraints:

- The opening and closing tags must appear in the HTML input (no virtual tokens).
- No "self-contained" elements are allowed (`IFRAME`, `SCRIPT`, `TITLE`, etc.).

If successful, the method will return a `WP_HTML_Processor` instance whose context is inherited from the node that the method was called from.

Props jonsurrell, bernhard-reiter, gziolo.
Fixes #62357.

git-svn-id: https://develop.svn.wordpress.org/trunk@59444 602fd350-edb4-49c9-b593-d223f7449a82
1 parent 99dd184 commit 71a52ce

File tree

3 files changed

+245
-14
lines changed

3 files changed

+245
-14
lines changed

src/wp-includes/html-api/class-wp-html-processor.php

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,120 @@ function ( WP_HTML_Token $token ): void {
424424
};
425425
}
426426

427+
/**
 * Creates a fragment processor at the current node.
 *
 * HTML Fragment parsing always happens with a context node. HTML Fragment Processors can be
 * instantiated with a `BODY` context node via `WP_HTML_Processor::create_fragment( $html )`.
 *
 * The context node may impact how a fragment of HTML is parsed. For example, consider the HTML
 * fragment `<td />Inside TD?</td>`.
 *
 * A BODY context node will produce the following tree:
 *
 *     └─#text Inside TD?
 *
 * Notice that the `<td>` tags are completely ignored.
 *
 * Compare that with an SVG context node that produces the following tree:
 *
 *     ├─svg:td
 *     └─#text Inside TD?
 *
 * Here, a `td` node in the `svg` namespace is created, and its self-closing flag is respected.
 * This is a peculiarity of parsing HTML in foreign content like SVG.
 *
 * Finally, consider the tree produced with a TABLE context node:
 *
 *     └─TBODY
 *       └─TR
 *         └─TD
 *           └─#text Inside TD?
 *
 * These examples demonstrate how important the context node may be when processing an HTML
 * fragment. Special care must be taken when processing fragments that are expected to appear
 * in specific contexts. SVG and TABLE are good examples, but there are others.
 *
 * No fragment processor is created, and `null` is returned, when:
 *
 *  - The processor is not currently matched on a tag.
 *  - The processor is matched on a tag closer.
 *  - The current node is an HTML element whose contents require a special tokenizer
 *    state (`IFRAME`, `NOEMBED`, `NOFRAMES`, `SCRIPT`, `STYLE`, `TEXTAREA`, `TITLE`,
 *    `XMP`, `PLAINTEXT`).
 *  - The underlying fragment processor could not be created.
 *
 * @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm
 *
 * @param string $html Input HTML fragment to process.
 * @return static|null The created processor if successful, otherwise null.
 */
public function create_fragment_at_current_node( string $html ) {
	/*
	 * Only an element's opening tag can act as a fragment context. A tag closer
	 * marks the end of an element, so there is nothing to parse a fragment into.
	 */
	if ( '#tag' !== $this->get_token_type() || $this->is_tag_closer() ) {
		return null;
	}

	$context_token = $this->current_element->token;
	$namespace     = $context_token->namespace;

	/*
	 * Prevent creating fragments at nodes that require a special tokenizer state.
	 * This is unsupported by the HTML Processor.
	 */
	if (
		'html' === $namespace &&
		in_array( $context_token->node_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true )
	) {
		return null;
	}

	$fragment_processor = static::create_fragment( $html );
	if ( null === $fragment_processor ) {
		return null;
	}

	$fragment_processor->compat_mode = $this->compat_mode;

	/*
	 * The context node is a detached copy of the current token: it gets its own
	 * bookmark name and must not trigger the original token's destroy handler.
	 */
	$fragment_processor->context_node                = clone $this->state->current_token;
	$fragment_processor->context_node->bookmark_name = 'context-node';
	$fragment_processor->context_node->on_destroy    = null;

	// The state tracks the context node as a pair of [ node name, attributes ].
	$fragment_processor->state->context_node = array( $fragment_processor->context_node->node_name, array() );

	// An empty prefix matches every attribute on the current tag.
	$attribute_names = $this->get_attribute_names_with_prefix( '' );
	if ( null !== $attribute_names ) {
		foreach ( $attribute_names as $name ) {
			$fragment_processor->state->context_node[1][ $name ] = $this->get_attribute( $name );
		}
	}

	$fragment_processor->breadcrumbs = array( 'HTML', $fragment_processor->context_node->node_name );

	if ( 'TEMPLATE' === $fragment_processor->context_node->node_name ) {
		$fragment_processor->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE;
	}

	$fragment_processor->reset_insertion_mode_appropriately();

	/*
	 * > Set the parser's form element pointer to the nearest node to the context element that
	 * > is a form element (going straight up the ancestor chain, and including the element
	 * > itself, if it is a form element), if any. (If there is no such form element, the
	 * > form element pointer keeps its initial value, null.)
	 */
	foreach ( $this->state->stack_of_open_elements->walk_up() as $element ) {
		if ( 'FORM' === $element->node_name && 'html' === $element->namespace ) {
			$fragment_processor->state->form_element                = clone $element;
			$fragment_processor->state->form_element->bookmark_name = null;
			$fragment_processor->state->form_element->on_destroy    = null;
			break;
		}
	}

	$fragment_processor->state->encoding_confidence = 'irrelevant';

	/*
	 * Update the parsing namespace near the end of the process.
	 * This is important so that any push/pop from the stack of open
	 * elements does not change the parsing namespace.
	 *
	 * When the context node is an integration point, its contents are
	 * parsed in the HTML namespace rather than the node's own namespace.
	 */
	$fragment_processor->change_parsing_namespace(
		$context_token->integration_node_type ? 'html' : $namespace
	);

	return $fragment_processor;
}
540+
427541
/**
428542
* Stops the parser and terminates its execution when encountering unsupported markup.
429543
*

tests/phpunit/tests/html-api/wpHtmlProcessor.php

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1043,6 +1043,66 @@ public function test_ensure_next_token_method_extensibility( $html, $expected_to
10431043
$this->assertEquals( $expected_token_counts, $processor->token_seen_count, 'Snapshot: ' . var_export( $processor->token_seen_count, true ) );
10441044
}
10451045

1046+
/**
 * Ensures a fragment created at an SVG node is parsed as foreign content.
 *
 * @ticket 62357
 */
public function test_create_fragment_at_current_node_in_foreign_content() {
	$processor = WP_HTML_Processor::create_full_parser( '<svg>' );
	$this->assertTrue( $processor->next_tag( 'SVG' ), 'Failed to find the SVG context tag.' );

	$fragment = $processor->create_fragment_at_current_node( "\0preceded-by-nul-byte<rect /><circle></circle><foreignobject><div></div></foreignobject><g>" );

	// Fail with an assertion instead of a fatal error if no fragment processor was created.
	$this->assertNotNull( $fragment, 'Failed to create a fragment processor at the SVG node.' );

	$this->assertSame( 'svg', $fragment->get_namespace() );
	$this->assertTrue( $fragment->next_token() );

	/*
	 * In HTML parsing, a nul byte would be ignored.
	 * In SVG it should be replaced with a replacement character.
	 */
	$this->assertSame( '#text', $fragment->get_token_type() );
	$this->assertSame( "\u{FFFD}", $fragment->get_modifiable_text() );

	$this->assertTrue( $fragment->next_tag( 'RECT' ) );
	$this->assertSame( 'svg', $fragment->get_namespace() );

	$this->assertTrue( $fragment->next_tag( 'CIRCLE' ) );
	$this->assertSame( array( 'HTML', 'SVG', 'CIRCLE' ), $fragment->get_breadcrumbs() );
	$this->assertTrue( $fragment->next_tag( 'foreignObject' ) );
	$this->assertSame( 'svg', $fragment->get_namespace() );
}
1073+
1074+
/**
 * Ensures a fragment created at an HTML integration point is parsed in the HTML namespace.
 *
 * @ticket 62357
 */
public function test_create_fragment_at_current_node_in_foreign_content_integration_point() {
	$processor = WP_HTML_Processor::create_full_parser( '<svg><foreignObject>' );
	$this->assertTrue( $processor->next_tag( 'foreignObject' ), 'Failed to find the foreignObject context tag.' );

	$fragment = $processor->create_fragment_at_current_node( "<image>\0not-preceded-by-nul-byte<rect />" );

	// Fail with an assertion instead of a fatal error if no fragment processor was created.
	$this->assertNotNull( $fragment, 'Failed to create a fragment processor at the foreignObject node.' );

	// Nothing has been processed, the html namespace should be used for parsing as an integration point.
	$this->assertSame( 'html', $fragment->get_namespace() );

	// HTML parsing transforms IMAGE into IMG.
	$this->assertTrue( $fragment->next_tag( 'IMG' ) );

	$this->assertTrue( $fragment->next_token() );

	// In HTML parsing, the nul byte is ignored and the text is reached.
	$this->assertSame( '#text', $fragment->get_token_type() );
	$this->assertSame( 'not-preceded-by-nul-byte', $fragment->get_modifiable_text() );

	/*
	 * svg:foreignObject is an HTML integration point, so the processor should be in the HTML namespace.
	 * RECT is an HTML element here, meaning it may have the self-closing flag but does not self-close.
	 */
	$this->assertTrue( $fragment->next_tag( 'RECT' ) );
	$this->assertSame( array( 'HTML', 'FOREIGNOBJECT', 'RECT' ), $fragment->get_breadcrumbs() );
	$this->assertSame( 'html', $fragment->get_namespace() );
	$this->assertTrue( $fragment->has_self_closing_flag() );
	$this->assertTrue( $fragment->expects_closer() );
}
10461106
/**
10471107
* Ensure that lowercased tag_name query matches tags case-insensitively.
10481108
*

tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php

Lines changed: 71 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -138,10 +138,6 @@ public function data_external_html5lib_tests() {
138138
* @return bool True if the test case should be skipped. False otherwise.
139139
*/
140140
private static function should_skip_test( ?string $test_context_element, string $test_name ): bool {
141-
if ( null !== $test_context_element && 'body' !== $test_context_element ) {
142-
return true;
143-
}
144-
145141
if ( array_key_exists( $test_name, self::SKIP_TESTS ) ) {
146142
return true;
147143
}
@@ -157,18 +153,79 @@ private static function should_skip_test( ?string $test_context_element, string
157153
* @return string|null Tree structure of parsed HTML, if supported, else null.
158154
*/
159155
private static function build_tree_representation( ?string $fragment_context, string $html ) {
160-
$processor = $fragment_context
161-
? WP_HTML_Processor::create_fragment( $html, "<{$fragment_context}>" )
162-
: WP_HTML_Processor::create_full_parser( $html );
163-
if ( null === $processor ) {
164-
throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() );
156+
$processor = null;
157+
if ( $fragment_context ) {
158+
if ( 'body' === $fragment_context ) {
159+
$processor = WP_HTML_Processor::create_fragment( $html );
160+
} else {
161+
162+
/*
163+
* If the string of characters starts with "svg ", the context
164+
* element is in the SVG namespace and the substring after
165+
* "svg " is the local name. If the string of characters starts
166+
* with "math ", the context element is in the MathML namespace
167+
* and the substring after "math " is the local name.
168+
* Otherwise, the context element is in the HTML namespace and
169+
* the string is the local name.
170+
*/
171+
if ( str_starts_with( $fragment_context, 'svg ' ) ) {
172+
$tag_name = substr( $fragment_context, 4 );
173+
if ( 'svg' === $tag_name ) {
174+
$parent_processor = WP_HTML_Processor::create_full_parser( '<!DOCTYPE html><svg>' );
175+
} else {
176+
$parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><svg><{$tag_name}>" );
177+
}
178+
$parent_processor->next_tag( $tag_name );
179+
} elseif ( str_starts_with( $fragment_context, 'math ' ) ) {
180+
$tag_name = substr( $fragment_context, 5 );
181+
if ( 'math' === $tag_name ) {
182+
$parent_processor = WP_HTML_Processor::create_full_parser( '<!DOCTYPE html><math>' );
183+
} else {
184+
$parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><math><{$tag_name}>" );
185+
}
186+
$parent_processor->next_tag( $tag_name );
187+
} else {
188+
if ( in_array(
189+
$fragment_context,
190+
array(
191+
'caption',
192+
'col',
193+
'colgroup',
194+
'tbody',
195+
'td',
196+
'tfoot',
197+
'th',
198+
'thead',
199+
'tr',
200+
),
201+
true
202+
) ) {
203+
$parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><table><{$fragment_context}>" );
204+
$parent_processor->next_tag();
205+
} else {
206+
$parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><{$fragment_context}>" );
207+
}
208+
$parent_processor->next_tag( $fragment_context );
209+
}
210+
if ( null !== $parent_processor->get_unsupported_exception() ) {
211+
throw $parent_processor->get_unsupported_exception();
212+
}
213+
if ( null !== $parent_processor->get_last_error() ) {
214+
throw new Exception( $parent_processor->get_last_error() );
215+
}
216+
$processor = $parent_processor->create_fragment_at_current_node( $html );
217+
}
218+
219+
if ( null === $processor ) {
220+
throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() );
221+
}
222+
} else {
223+
$processor = WP_HTML_Processor::create_full_parser( $html );
224+
if ( null === $processor ) {
225+
throw new Exception( 'Could not create a full parser.' );
226+
}
165227
}
166228

167-
/*
168-
* The fragment parser will start in 2 levels deep at: html > body > [position]
169-
* and requires adjustment to initial parameters.
170-
* The full parser will not.
171-
*/
172229
$output = '';
173230
$indent_level = 0;
174231
$was_text = null;

0 commit comments

Comments
 (0)