diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php
index ab0b88693ab2e..05b8a021733cc 100644
--- a/src/wp-includes/html-api/class-wp-html-tag-processor.php
+++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -317,6 +317,16 @@ class WP_HTML_Tag_Processor {
 	 */
 	private $stop_on_tag_closers;
 
+	/**
+	 * Whether to visit funky comments, e.g. </1>, when walking an input document.
+	 *
+	 * These are funny because they are errors.
+	 *
+	 * @since 6.3.0
+	 * @var bool
+	 */
+	private $stop_on_funky_comments;
+
 	/**
 	 * Holds updated HTML as updates are applied.
 	 *
@@ -538,6 +548,25 @@ class WP_HTML_Tag_Processor {
 	 */
 	protected $seek_count = 0;
 
+	/**
+	 * Byte offsets of the funky comment the processor is currently stopped on.
+	 *
+	 * Holds `array( $start, $end )` while a funky comment is matched and
+	 * `null` otherwise.
+	 *
+	 * @since 6.3.0
+	 * @var int[]|null
+	 */
+	private $funky_comment_content = null;
+
+	/**
+	 * Number of `__placeholder_N` bookmarks currently set by declarative matching.
+	 *
+	 * @since 6.3.0
+	 * @var int
+	 */
+	private $placeholders = 0;
+
 	/**
 	 * Constructor.
 	 *
@@ -971,6 +1000,7 @@ private function skip_script_data() {
 	 * closing `>`; these are left for other methods.
 	 *
 	 * @since 6.2.0
+	 * @since 6.2.1 Passes over invalid-tag-closer-comments like "</3 this is a comment>".
 	 *
 	 * @return bool Whether a tag was found before the end of the document.
 	 */
@@ -1035,17 +1065,42 @@ private function parse_next_tag() {
 			 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
 			 */
 			if (
-				strlen( $html ) > $at + 3 &&
+				strlen( $html ) > $at + 4 &&
 				'-' === $html[ $at + 2 ] &&
 				'-' === $html[ $at + 3 ]
 			) {
-				$closer_at = strpos( $html, '-->', $at + 4 );
-				if ( false === $closer_at ) {
-					return false;
+				$closer_at = $at + 4;
+
+				// Abruptly-closed empty comments are a sequence of dashes followed by `>`; the dashes may run to the end of the input.
+				$span_of_dashes = strspn( $html, '-', $closer_at );
+				if ( $closer_at + $span_of_dashes < strlen( $html ) && '>' === $html[ $closer_at + $span_of_dashes ] ) {
+					$at = $closer_at + $span_of_dashes + 1;
+					continue;
 				}
 
-				$at = $closer_at + 3;
-				continue;
+				/*
+				 * Comments may be closed by either a --> or an invalid --!>.
+				 * The first occurrence closes the comment.
+				 *
+				 * See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment
+				 */
+				$closer_at--; // Pre-increment inside condition below reduces risk of accidental infinite looping.
+				while ( ++$closer_at < strlen( $html ) ) {
+					$closer_at = strpos( $html, '--', $closer_at );
+					if ( false === $closer_at ) {
+						return false;
+					}
+
+					if ( $closer_at + 2 < strlen( $html ) && '>' === $html[ $closer_at + 2 ] ) {
+						$at = $closer_at + 3;
+						continue 2;
+					}
+
+					if ( $closer_at + 3 < strlen( $html ) && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) {
+						$at = $closer_at + 4;
+						continue 2;
+					}
+				}
 			}
 
 			/*
@@ -1104,9 +1159,19 @@ private function parse_next_tag() {
 				continue;
 			}
 
+			/*
+			 * </> is a missing end tag name, which is ignored.
+			 *
+			 * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name
+			 */
+			if ( '>' === $html[ $at + 1 ] ) {
+				$at++;
+				continue;
+			}
+
 			/*
 			 * <? transitions to a bogus comment state – skip to the nearest >
-			 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
+			 * See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
 			 */
 			if ( '?' === $html[ $at + 1 ] ) {
 				$closer_at = strpos( $html, '>', $at + 2 );
@@ -1118,6 +1183,32 @@ private function parse_next_tag() {
 				continue;
 			}
 
+			/*
+			 * If a non-alpha starts the tag name in a tag closer it's a comment.
+			 * Find the first `>`, which closes the comment.
+			 *
+			 * See https://github.com/WordPress/wordpress-develop/pull/4256
+			 */
+			if ( $this->is_closing_tag ) {
+				$closer_at = strpos( $html, '>', $at );
+				if ( false === $closer_at ) {
+					return false;
+				}
+
+				if ( $this->stop_on_funky_comments ) {
+					++$at;
+					$this->tag_name_length       = 0;
+					$this->tag_name_starts_at    = $at;
+					$this->bytes_already_parsed  = $closer_at;
+					$this->funky_comment_content = array( $at, $closer_at );
+
+					return true;
+				}
+
+				$at = $closer_at + 1;
+				continue;
+			}
+
 			++$at;
 		}
 
@@ -1249,11 +1340,12 @@ private function skip_whitespace() {
 	private function after_tag() {
 		$this->class_name_updates_to_attributes_updates();
 		$this->apply_attributes_updates();
-		$this->tag_name_starts_at = null;
-		$this->tag_name_length    = null;
-		$this->tag_ends_at        = null;
-		$this->is_closing_tag     = null;
-		$this->attributes         = array();
+		$this->tag_name_starts_at    = null;
+		$this->tag_name_length       = null;
+		$this->tag_ends_at           = null;
+		$this->is_closing_tag        = null;
+		$this->attributes            = array();
+		$this->funky_comment_content = null;
 	}
 
 	/**
@@ -1500,7 +1592,7 @@ public function seek( $bookmark_name ) {
 		if ( ! array_key_exists( $bookmark_name, $this->bookmarks ) ) {
 			_doing_it_wrong(
 				__METHOD__,
-				__( 'Unknown bookmark name.' ),
+				__( 'Unknown bookmark name.' ) . ' ' . $bookmark_name,
 				'6.2.0'
 			);
 			return false;
@@ -1525,6 +1617,21 @@ public function seek( $bookmark_name ) {
 		return $this->next_tag( array( 'tag_closers' => 'visit' ) );
 	}
 
+	/**
+	 * Moves the processor back to the start of the document.
+	 *
+	 * Clears the current tag state via `after_tag()`, then resets the
+	 * parse and copy cursors and empties the output buffer.
+	 *
+	 * @since 6.3.0
+	 */
+	public function rewind() {
+		$this->after_tag();
+		$this->bytes_already_parsed = 0;
+		$this->bytes_already_copied = 0;
+		$this->output_buffer        = '';
+	}
+
 	/**
 	 * Compare two WP_HTML_Text_Replacement objects.
 	 *
@@ -1805,6 +1912,17 @@ public function is_tag_closer() {
 		return $this->is_closing_tag;
 	}
 
+	/**
+	 * Indicates whether the processor is currently stopped on a funky comment.
+	 *
+	 * @since 6.3.0
+	 *
+	 * @return bool Whether a funky comment is currently matched.
+	 */
+	public function is_funky_comment() {
+		return null !== $this->funky_comment_content;
+	}
+
 	/**
 	 * Updates or creates a new attribute on the currently matched tag with the passed value.
 	 *
@@ -2061,6 +2179,24 @@ public function __toString() {
 		return $this->get_updated_html();
 	}
 
+	/**
+	 * Returns the text of the currently-matched funky comment.
+	 *
+	 * The text is the span recorded when the comment was matched; it stops before the closing `>`.
+	 *
+	 * @since 6.3.0
+	 *
+	 * @return string|null Comment text, or null when not stopped on a funky comment.
+	 */
+	public function get_funky_content() {
+		if ( null === $this->funky_comment_content ) {
+			return null;
+		}
+
+		list( $start, $end ) = $this->funky_comment_content;
+		return substr( $this->html, $start, $end - $start );
+	}
+
 	/**
 	 * Returns the string representation of the HTML Tag Processor.
 	 *
@@ -2152,11 +2288,12 @@ private function parse_query( $query ) {
 			return;
 		}
 
-		$this->last_query          = $query;
-		$this->sought_tag_name     = null;
-		$this->sought_class_name   = null;
-		$this->sought_match_offset = 1;
-		$this->stop_on_tag_closers = false;
+		$this->last_query             = $query;
+		$this->sought_tag_name        = null;
+		$this->sought_class_name      = null;
+		$this->sought_match_offset    = 1;
+		$this->stop_on_tag_closers    = false;
+		$this->stop_on_funky_comments = false;
 
 		// A single string value means "find the tag of this name".
 		if ( is_string( $query ) ) {
@@ -2194,8 +2331,200 @@ private function parse_query( $query ) {
 
 		if ( isset( $query['tag_closers'] ) ) {
 			$this->stop_on_tag_closers = 'visit' === $query['tag_closers'];
 		}
+
+		if ( isset( $query['funky_comments'] ) ) {
+			$this->stop_on_funky_comments = 'visit' === $query['funky_comments'];
+		}
+	}
+
+	/**
+	 * Advances to the next run of tokens in the document that matches a pattern.
+	 *
+	 * Funky comments in the pattern act as placeholders: each one matches any
+	 * token and leaves a "__placeholder_N" bookmark on the token it matched.
+	 *
+	 * When a multi-token pattern matches, a "match_end" bookmark is set on the last
+	 * matched token and the processor seeks back to the first one. When the pattern
+	 * holds a single token the processor simply stays on the matching token.
+	 *
+	 * @since 6.3.0
+	 *
+	 * @param string $pattern_html HTML whose sequence of tokens is searched for.
+	 * @return bool Whether a match was found.
+	 */
+	public function declarative_match( $pattern_html ) {
+		// Release placeholder bookmarks left behind by a previous match.
+		while ( $this->placeholders > 0 ) {
+			$this->release_bookmark( "__placeholder_{$this->placeholders}" );
+			$this->placeholders--;
+		}
+
+		$pattern          = new WP_HTML_Tag_Processor( $pattern_html );
+		$visit_everything = array( 'tag_closers' => 'visit', 'funky_comments' => 'visit' );
+
+		$same_thing = function ( WP_HTML_Tag_Processor $pattern, WP_HTML_Tag_Processor $test ) {
+			if ( $pattern->is_funky_comment() ) {
+				$this->placeholders++;
+				$this->set_bookmark( "__placeholder_{$this->placeholders}" );
+				return true;
+			}
+
+			if ( ! (
+				$pattern->get_tag() === $test->get_tag() &&
+				$pattern->is_tag_closer() === $test->is_tag_closer() &&
+				$pattern->is_funky_comment() === $test->is_funky_comment()
+			) ) {
+				return false;
+			}
+
+			$attribute_constraints = $pattern->get_attribute_names_with_prefix( '' );
+			if ( null === $attribute_constraints ) {
+				return true;
+			}
+
+			foreach ( $attribute_constraints as $name ) {
+				if ( $pattern->get_attribute( $name ) !== $test->get_attribute( $name ) ) {
+					return false;
+				}
+			}
+
+			return true;
+		};
+
+		step_one: // find the next spot the patterns start the same.
+		if ( ! $pattern->next_tag( $visit_everything ) ) {
+			return false;
+		}
+
+		while ( $this->placeholders > 0 ) {
+			$this->release_bookmark( "__placeholder_{$this->placeholders}" );
+			$this->placeholders--;
+		}
+		while ( $this->next_tag( $visit_everything ) ) {
+			if ( $same_thing( $pattern, $this ) ) {
+				goto step_two;
+			}
+		}
+		return false;
+
+		step_two: // see if the subsequent tokens in the pattern and test match.
+		$this->set_bookmark( 'match_start' );
+		if ( ! $pattern->next_tag( $visit_everything ) ) {
+			$this->release_bookmark( 'match_start' );
+			return true;
+		}
+
+		while ( true ) {
+			if ( ! $this->next_tag( $visit_everything ) ) {
+				$this->release_bookmark( 'match_start' );
+				return false;
+			}
+
+			if ( ! $same_thing( $pattern, $this ) ) {
+				// Drop the bookmark for this failed attempt so it cannot leak if no later match is found.
+				$this->release_bookmark( 'match_start' );
+				$pattern->rewind();
+				goto step_one;
+			}
+
+			if ( ! $pattern->next_tag( $visit_everything ) ) {
+				break;
+			}
+		}
+
+		$this->set_bookmark( 'match_end' );
+		$this->seek( 'match_start' );
+		$this->release_bookmark( 'match_start' );
+		return true;
 	}
+
+	/**
+	 * Finds the next match of a pattern and removes the matched tokens that
+	 * do not line up with the corresponding tokens of a transformer template.
+	 *
+	 * @since 6.3.0
+	 *
+	 * @param string $pattern_html     HTML whose sequence of tokens is searched for.
+	 * @param string $transformer_html HTML whose tokens are compared against the pattern's to decide what is kept.
+	 * @return false|void False when either input holds no tokens or no match is found; otherwise nothing is returned.
+	 */
+	public function transform( $pattern_html, $transformer_html ) {
+		$visit_everything = array( 'tag_closers' => 'visit', 'funky_comments' => 'visit' );
+		$pattern          = new WP_HTML_Tag_Processor( $pattern_html );
+		$transform        = new WP_HTML_Tag_Processor( $transformer_html );
+
+		if (
+			! $pattern->next_tag( $visit_everything ) ||
+			! $transform->next_tag( $visit_everything ) ||
+			! $this->declarative_match( $pattern_html )
+		) {
+			return false;
+		}
+
+		$this->set_bookmark( 'match_start' );
+
+		$same_thing = function ( WP_HTML_Tag_Processor $pattern, WP_HTML_Tag_Processor $test ) {
+			if ( $pattern->is_funky_comment() ) {
+				return true;
+			}
+
+			if ( ! (
+				$pattern->get_tag() === $test->get_tag() &&
+				$pattern->is_tag_closer() === $test->is_tag_closer() &&
+				$pattern->is_funky_comment() === $test->is_funky_comment()
+			) ) {
+				return false;
+			}
+
+			$attribute_constraints = $pattern->get_attribute_names_with_prefix( '' );
+			if ( null === $attribute_constraints ) {
+				return true;
+			}
+
+			foreach ( $attribute_constraints as $name ) {
+				if ( $pattern->get_attribute( $name ) !== $test->get_attribute( $name ) ) {
+					return false;
+				}
+			}
+
+			return true;
+		};
+
+		// NOTE(review): caps the walk at ten iterations; presumably a guard against runaway loops, confirm the intended limit.
+		$budget = 10;
+		while ( $budget-- ) {
+			if ( $same_thing( $pattern, $transform ) ) {
+				if ( ! $transform->next_tag( $visit_everything ) ) {
+					goto drop_patterns;
+				}
+				$this->next_tag( $visit_everything );
+				$pattern->next_tag( $visit_everything );
+				continue;
+			}
+
+			$this->set_bookmark( 'here' );
+			$this->lexical_updates[] = new WP_HTML_Text_Replacement(
+				$this->bookmarks['here']->start,
+				$this->bookmarks['here']->end + 1,
+				''
+			);
+			$this->get_updated_html();
+			$pattern->next_tag( $visit_everything );
+			$this->next_tag( $visit_everything );
+		}
+
+		drop_patterns:
+		while ( $pattern->next_tag( $visit_everything ) ) {
+			$this->set_bookmark( 'here' );
+			$this->lexical_updates[] = new WP_HTML_Text_Replacement(
+				$this->bookmarks['here']->start,
+				$this->bookmarks['here']->end + 1,
+				''
+			);
+			$this->next_tag( $visit_everything );
+		}
+	}
 
 	/**
 	 * Checks whether a given tag and its attributes match the search criteria.
@@ -2205,6 +2534,10 @@ private function parse_query( $query ) {
 	 * @return boolean Whether the given tag and its attribute match the search criteria.
 	 */
 	private function matches() {
+		if ( null !== $this->funky_comment_content && $this->stop_on_funky_comments ) {
+			return true;
+		}
+
 		if ( $this->is_closing_tag && ! $this->stop_on_tag_closers ) {
 			return false;
 		}
diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php
index 6b0c263c5dad6..9ad7274e0e722 100644
--- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php
+++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php
@@ -6,6 +6,24 @@
  * @subpackage HTML-API
  */
 
+if ( ! class_exists( 'WP_UnitTestCase' ) ) {
+	class WP_UnitTestCase extends PHPUnit\Framework\TestCase {}
+
+	// Resolve the HTML API sources relative to this test file instead of a developer-specific absolute path.
+	$html_api_dir = __DIR__ . '/../../../../src/wp-includes/html-api';
+
+	require_once $html_api_dir . '/class-wp-html-attribute-token.php';
+	require_once $html_api_dir . '/class-wp-html-span.php';
+	require_once $html_api_dir . '/class-wp-html-text-replacement.php';
+	require_once $html_api_dir . '/class-wp-html-tag-processor.php';
+
+	function esc_attr( $s ) { return str_replace( [ '<', '>', '"' ], [ '&lt;', '&gt;', '&quot;' ], $s ); }
+	function __( $s ) { return $s; }
+	function _doing_it_wrong( ...$args ) {
+		var_dump( $args );
+	}
+}
+
 /**
  * @group html-api
  *
@@ -1714,6 +1732,47 @@ public function data_next_tag_ignores_script_tag_contents() {
 		);
 	}
 
+	/**
+	 * Invalid tag names are comments on tag closers.
+	 *
+	 * See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name
+	 *
+	 * @ticket 58007
+	 *
+	 * @dataProvider data_next_tag_ignores_invalid_first_character_of_tag_name_comments
+	 *
+	 * @param string $html_with_markers HTML containing an invalid tag closer whose element before and
+	 *                                  element after contain the "start" and "end" CSS classes.
+	 */
+	public function test_next_tag_ignores_invalid_first_character_of_tag_name_comments( $html_with_markers ) {
+		$p = new WP_HTML_Tag_Processor( $html_with_markers );
+		$p->next_tag( array( 'class_name' => 'start' ) );
+		$p->next_tag();
+
+		$this->assertSame( 'end', $p->get_attribute( 'class' ) );
+	}
+
+	/**
+	 * Data provider.
+	 *
+	 * @return array[]
+	 */
+	public function data_next_tag_ignores_invalid_first_character_of_tag_name_comments() {
+		return array(
+			'Invalid tag openers as normal text' => array(
+				'
This is really cool!
Just a thought

