@@ -611,11 +611,44 @@ function wpautop( $text, $br = true ) {
611611 * No effort is made to clean up, sanitize, or normalize the segments
612612 * of HTML. {@see WP_HTML_Processor::normalize()} for normalization.
613613 *
614+ * Consider using the HTML API directly instead of relying on this
615+ * legacy function: it bloats memory by default and provides a text
616+ * interface for working with HTML whereas the HTML API provides a
617+ * low-overhead and convenient structural interface.
618+ *
619+ * ## Output format:
620+ *
621+ * To maintain legacy behaviors with this function from when it
622+ * operated via {@see preg_split()}, the output array injects text
623+ * nodes which do not appear in the source HTML. That is, the original
624+ * array, relying on {@see PREG_SPLIT_DELIM_CAPTURE}, included a text
625+ * span on each side of every tag-like or comment-like “delimiter” in
626+ * the matched string.
627+ *
628+ * Therefore, the output array will always start and end with text nodes
629+ * and will separate every non-text node with a text node. If there is no
630+ * actual content in the interstitial space between tokens in the source
631+ * document, an empty text node will be created.
632+ *
633+ * Example:
634+ *
635+ * array( '', '<img>', '' ) === wp_html_split( '<img>' );
636+ * array( 'test' ) === wp_html_split( 'test' );
637+ * array( '', '<p>', 'test' ) === wp_html_split( '<p>test' );
638+ * array( 'test', '</p>', '' ) === wp_html_split( 'test</p>' );
639+ *
640+ * array( '', '<br>', '', '<!-- comment -->', '' ) === wp_html_split( '<br><!-- comment -->' );
641+ *
642+ * // To avoid ambiguity, leading less-than signs (<) in text nodes are encoded.
643+ * array( '<3' ) === wp_html_split( '<3' );
644+ *
614645 * @since 4.2.4
615- * @since {WP_VERSION} Reliably parses HTML via the HTML API.
646+ * @since 6.9.0 Reliably parses HTML via the HTML API.
616647 *
617- * @param string $input The text which has to be formatted.
618- * @return string[] Array of the formatted text.
648+ * @param string $input HTML document to split, one item for every token.
649+ * These can be text nodes, tags, comments, or doctype declarations.
650+ * @return string[] Tokens from input; starting and ending in a text node, and with text
651+ * nodes between every non-text node (see docblock note).
619652 */
620653function wp_html_split ( $ input ) {
621654 $ token_reporter = new class ( $ input ) extends WP_HTML_Tag_Processor {
@@ -627,9 +660,33 @@ public function extract_raw_token() {
627660 }
628661 };
629662
630- $ tokens = array ();
663+ $ tokens = array ();
664+ $ was_text = false ;
631665 while ( $ token_reporter ->next_token () ) {
632- $ tokens [] = $ token_reporter ->extract_raw_token ();
666+ $ raw_token = $ token_reporter ->extract_raw_token ();
667+ $ is_text = '#text ' === $ token_reporter ->get_token_name ();
668+
669+ if ( ! $ is_text && ! $ was_text ) {
670+ $ tokens [] = '' ;
671+ }
672+
673+ /*
674+ * Some legacy code assumes that text nodes will never start with a
675+ * less-than sign (<) but this isn’t the case, as some text nodes do
676+ * if the less-than sign doesn’t introduce a syntax token. To avoid
677+ * further corruption a leading less-than sign is replaced by its
678+ * encoded equivalent numeric character reference.
679+ */
680+ if ( $ is_text && '< ' === ( $ raw_token [0 ] ?? '' ) ) {
681+ $ raw_token = '< ' . substr ( $ raw_token , 1 );
682+ }
683+
684+ $ tokens [] = $ raw_token ;
685+ $ was_text = $ is_text ;
686+ }
687+
688+ if ( ! $ was_text ) {
689+ $ tokens [] = '' ;
633690 }
634691
635692 return $ tokens ;
0 commit comments