@@ -605,21 +605,41 @@ function wpautop( $text, $br = true ) {
605605}
606606
/**
 * Splits an HTML input into an array of raw strings, where each token
 * represents a tag, a comment, a text node, etc…
 *
 * No effort is made to clean up, sanitize, or normalize the segments
 * of HTML. {@see WP_HTML_Processor::normalize()} for normalization.
 *
 * If tokenization stops before the end of the input, the bytes following
 * the last reported token are returned as one final segment instead of
 * being discarded.
 *
 * @since 4.2.4
 * @since {WP_VERSION} Reliably parses HTML via the HTML API.
 *
 * @param string $input The text which has to be formatted.
 * @return string[] Array of the formatted text.
 */
function wp_html_split( $input ) {
	$token_reporter = new class( $input ) extends WP_HTML_Tag_Processor {
		/**
		 * Byte offset in the HTML just past the most recently extracted token.
		 *
		 * @var int
		 */
		private $raw_token_end = 0;

		/**
		 * Returns the raw source text of the token the processor is currently on.
		 *
		 * A bookmark is used to learn the byte span of the current token;
		 * the same bookmark name is reused on every call.
		 *
		 * @return string Raw HTML of the current token.
		 */
		public function extract_raw_token() {
			$this->set_bookmark( 'here' );
			$here = $this->bookmarks['here'];

			$this->raw_token_end = $here->start + $here->length;

			return substr( $this->html, $here->start, $here->length );
		}

		/**
		 * Returns whatever input follows the last extracted token.
		 *
		 * @return string Remaining raw HTML, or an empty string if none is left.
		 */
		public function extract_unparsed_tail() {
			if ( $this->raw_token_end >= strlen( $this->html ) ) {
				return '';
			}

			return substr( $this->html, $this->raw_token_end );
		}
	};

	/*
	 * NOTE(review): the previous preg_split() implementation emitted empty
	 * strings between adjacent delimiters; this loop only emits tokens.
	 * Confirm no caller depends on text/tag segments alternating by index.
	 */
	$tokens = array();
	while ( $token_reporter->next_token() ) {
		$tokens[] = $token_reporter->extract_raw_token();
	}

	// Keep any input the tokenizer did not reach so no part of $input is lost.
	$unparsed_tail = $token_reporter->extract_unparsed_tail();
	if ( '' !== $unparsed_tail ) {
		$tokens[] = $unparsed_tail;
	}

	return $tokens;
}
618637
619638/**
620639 * Retrieves the regular expression for an HTML element.
621640 *
622641 * @since 4.4.0
642+ * @deprecated {WP_VERSION} Use the HTML API instead.
623643 *
624644 * @return string The regular expression.
625645 */
0 commit comments