From 38c29c89c14f6fa35e2526ed9c6d833628e566d0 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sat, 6 Jul 2024 01:57:45 -0700 Subject: [PATCH 01/11] HTML API: Simplify breadcrumb accounting. Since the HTML Processor started visiting all nodes in a document, both real and virtual, the breadcrumb accounting became a bit complicated and it's not entirely clear that it is fully reliable. In this patch the breadcrumbs are rebuilt separately from the stack of open elements in order to eliminate the problem of the stateful stack interactions and the post-hoc event queue. Breadcrumbs are greatly simplified as a result, and more verifiably correct, in this construction. --- .../html-api/class-wp-html-processor.php | 126 +++++++----------- .../html-api/wpHtmlProcessorSemanticRules.php | 11 +- 2 files changed, 61 insertions(+), 76 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 588d2fbe7d7c9..8c75bd3f06b21 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -211,6 +211,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ private $element_queue = array(); + /** + * Stores the current breadcrumbs. + * + * @since 6.7.0 + * + * @var string[] + */ + private $breadcrumbs = array(); + /** * Current stack event, if set, representing a matched token. * @@ -310,8 +319,8 @@ public static function create_fragment( $html, $context = '', $encoding = false ); - $processor->state->stack_of_open_elements->push( $context_node ); $processor->context_node = $context_node; + $processor->breadcrumbs = array( 'HTML', $context_node->node_name ); return $processor; } @@ -523,44 +532,46 @@ public function next_token() { return false; } - if ( 'done' !== $this->has_seen_context_node && 0 === count( $this->element_queue ) && ! 
$this->step() ) { - while ( 'context-node' !== $this->state->stack_of_open_elements->current_node()->bookmark_name && $this->state->stack_of_open_elements->pop() ) { - continue; - } - $this->has_seen_context_node = 'done'; - return $this->next_token(); + /* + * Prime the events if there are none. + * + * @todo In some cases, probably related to the adoption agency + * algorithm, this call to step() doesn't create any new + * events. Calling it again creates them. Figure out why + * this is and if it's inherent or if it's a bug. Looping + * until there are events or until there are no more + * tokens works in the meantime and isn't obviously wrong. + */ + while ( empty( $this->element_queue ) && $this->step() ) { + continue; } + // Process the next event on the queue. $this->current_element = array_shift( $this->element_queue ); - while ( isset( $this->context_node ) && ! $this->has_seen_context_node ) { - if ( isset( $this->current_element ) ) { - if ( $this->context_node === $this->current_element->token && WP_HTML_Stack_Event::PUSH === $this->current_element->operation ) { - $this->has_seen_context_node = true; - return $this->next_token(); - } - } - $this->current_element = array_shift( $this->element_queue ); + if ( ! isset( $this->current_element ) ) { + return false; } - if ( ! isset( $this->current_element ) ) { - if ( 'done' === $this->has_seen_context_node ) { - return false; - } else { - return $this->next_token(); - } + $is_pop = WP_HTML_Stack_Event::POP === $this->current_element->operation; + + /* + * The root node only exists in the fragment parser, and closing it + * indicates that the parse is complete. Stop before popping it from + * the breadcrumbs. + */ + if ( 'root-node' === $this->current_element->token->bookmark_name ) { + return ! 
$is_pop && $this->next_token(); } - if ( isset( $this->context_node ) && WP_HTML_Stack_Event::POP === $this->current_element->operation && $this->context_node === $this->current_element->token ) { - $this->element_queue = array(); - $this->current_element = null; - return false; + // Adjust the breadcrumbs for this event. + if ( $is_pop ) { + array_pop( $this->breadcrumbs ); + } else { + $this->breadcrumbs[] = $this->current_element->token->node_name; } // Avoid sending close events for elements which don't expect a closing. - if ( - WP_HTML_Stack_Event::POP === $this->current_element->operation && - ! static::expects_closer( $this->current_element->token ) - ) { + if ( $is_pop && ! static::expects_closer( $this->current_element->token ) ) { return $this->next_token(); } @@ -643,10 +654,11 @@ public function matches_breadcrumbs( $breadcrumbs ) { return false; } - foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { + for ( $i = count( $this->breadcrumbs ) - 1; $i >= 0; $i-- ) { + $node = $this->breadcrumbs[ $i ]; $crumb = strtoupper( current( $breadcrumbs ) ); - if ( '*' !== $crumb && $node->node_name !== $crumb ) { + if ( '*' !== $crumb && $node !== $crumb ) { return false; } @@ -862,46 +874,7 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL. */ public function get_breadcrumbs() { - $breadcrumbs = array(); - - foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) { - $breadcrumbs[] = $stack_item->node_name; - } - - if ( ! 
$this->is_virtual() ) { - return $breadcrumbs; - } - - foreach ( $this->element_queue as $queue_item ) { - if ( $this->current_element->token->bookmark_name === $queue_item->token->bookmark_name ) { - break; - } - - if ( 'context-node' === $queue_item->token->bookmark_name ) { - break; - } - - if ( 'real' === $queue_item->provenance ) { - break; - } - - if ( WP_HTML_Stack_Event::PUSH === $queue_item->operation ) { - $breadcrumbs[] = $queue_item->token->node_name; - } else { - array_pop( $breadcrumbs ); - } - } - - if ( null !== parent::get_token_name() && ! parent::is_tag_closer() ) { - array_pop( $breadcrumbs ); - } - - // Add the virtual node we're at. - if ( WP_HTML_Stack_Event::PUSH === $this->current_element->operation ) { - $breadcrumbs[] = $this->current_element->token->node_name; - } - - return $breadcrumbs; + return $this->breadcrumbs; } /** @@ -930,9 +903,7 @@ public function get_breadcrumbs() { * @return int Nesting-depth of current location in the document. */ public function get_current_depth() { - return $this->is_virtual() - ? count( $this->get_breadcrumbs() ) - : $this->state->stack_of_open_elements->count(); + return count( $this->breadcrumbs ); } /** @@ -2552,7 +2523,6 @@ public function seek( $bookmark_name ) { ? $this->bookmarks[ $this->state->current_token->bookmark_name ]->start : 0; $bookmark_starts_at = $this->bookmarks[ $actual_bookmark_name ]->start; - $bookmark_length = $this->bookmarks[ $actual_bookmark_name ]->length; $direction = $bookmark_starts_at > $processor_started_at ? 'forward' : 'backward'; /* @@ -2610,6 +2580,12 @@ public function seek( $bookmark_name ) { $this->state->frameset_ok = true; $this->element_queue = array(); $this->current_element = null; + + if ( isset( $this->context_node ) ) { + $this->breadcrumbs = array_slice( $this->breadcrumbs, 0, 2 ); + } else { + $this->breadcrumbs = array(); + } } // When moving forwards, reparse the document until reaching the same location as the original bookmark. 
diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php b/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php index 717276935a780..adce614506429 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php @@ -387,7 +387,16 @@ public function test_in_body_any_other_end_tag_with_unclosed_non_special_element $this->assertSame( 'CODE', $processor->get_tag(), "Expected to start test on CODE element but found {$processor->get_tag()} instead." ); $this->assertSame( array( 'HTML', 'BODY', 'DIV', 'SPAN', 'CODE' ), $processor->get_breadcrumbs(), 'Failed to produce expected DOM nesting.' ); - $this->assertTrue( $processor->next_token(), 'Failed to advance past CODE tag to expected SPAN closer.' ); + $this->assertTrue( + $processor->next_tag( + array( + 'tag_name' => 'SPAN', + 'tag_closers' => 'visit', + ) + ), + 'Failed to advance past CODE tag to expected SPAN closer.' + ); + $this->assertSame( 'SPAN', $processor->get_tag() ); $this->assertTrue( $processor->is_tag_closer(), 'Expected to find closing SPAN, but found opener instead.' ); $this->assertSame( array( 'HTML', 'BODY', 'DIV' ), $processor->get_breadcrumbs(), 'Failed to advance past CODE tag to expected DIV opener.' ); From e11f9eeb7f1861b9c5096fab65e59790cbf0950b Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sat, 6 Jul 2024 16:25:15 -0700 Subject: [PATCH 02/11] HTML API: Expand Unsupported class and make it available for debugging. The HTML Processor internally throws an exception when it reaches HTML that it knows it cannot process, but this exception is not made available to calling code. It can be useful to extract more knowledge about why it gave up, especially for debugging purposes. In this patch, more context is added to the WP_HTML_Unsupported_Exception and the last exception is made available to calling code, if it asks. 
--- .../html-api/class-wp-html-processor.php | 65 +++++++++++++++++++ .../class-wp-html-unsupported-exception.php | 18 ++++- 2 files changed, 82 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 8c75bd3f06b21..f3be6c67064b9 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -188,6 +188,17 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ private $last_error = null; + /** + * Stores context for why the parser bailed on unsupported HTML, if it did. + * + * @see self::get_unsupported_exception + * + * @since 6.7.0 + * + * @var WP_HTML_Unsupported_Exception|null + */ + private $unsupported_exception = null; + /** * Releases a bookmark when PHP garbage-collects its wrapping WP_HTML_Token instance. * @@ -384,6 +395,45 @@ function ( WP_HTML_Token $token ) { }; } + /** + * Stops the parser and terminates its execution when encountering unsupported markup. + * + * @throws WP_HTML_Unsupported_Exception Halts execution of the parser. + * + * @since 6.7.0 + * + * @param string $message Explains support is missing in order to parse the current node. 
+ * + * @return mixed + */ + private function bail( string $message ) { + $here = $this->bookmarks[ $this->state->current_token->bookmark_name ]; + $token = substr( $this->html, $here->start, $here->length ); + + $open_elements = array(); + foreach ( $this->state->stack_of_open_elements->stack as $item ) { + $open_elements[] = $item->node_name; + } + + $active_formats = array(); + foreach ( $this->state->active_formatting_elements->walk_down() as $item ) { + $active_formats[] = $item->node_name; + } + + $this->last_error = self::ERROR_UNSUPPORTED; + + $this->unsupported_exception = new WP_HTML_Unsupported_Exception( + $message, + $this->state->current_token->node_name, + $here->start, + $token, + $open_elements, + $active_formats + ); + + throw $this->unsupported_exception; + } + /** * Returns the last error, if any. * @@ -411,6 +461,21 @@ public function get_last_error() { return $this->last_error; } + /** + * Returns context for why the parser aborted due to unsupported HTML, if it did. + * + * This is meant for debugging purposes, not for production use. + * + * @since 6.7.0 + * + * @see self::$unsupported_exception + * + * @return WP_HTML_Unsupported_Exception|null + */ + public function get_unsupported_exception() { + return $this->unsupported_exception; + } + /** * Finds the next tag matching the $query. 
* diff --git a/src/wp-includes/html-api/class-wp-html-unsupported-exception.php b/src/wp-includes/html-api/class-wp-html-unsupported-exception.php index 6e7228670bf8b..1a29714727623 100644 --- a/src/wp-includes/html-api/class-wp-html-unsupported-exception.php +++ b/src/wp-includes/html-api/class-wp-html-unsupported-exception.php @@ -1,4 +1,4 @@ -token_name = $token_name; + $this->token_at = $token_at; + $this->token = $token; + + $this->stack_of_open_elements = $stack_of_open_elements; + $this->active_formatting_elements = $active_formatting_elements; + } } From e97f678c594ec7a5495fde639fea1a05bb6a48dc Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 2 Jul 2024 20:46:25 -0700 Subject: [PATCH 03/11] HTML API: Support more of the adoption agency algorithm. --- .../html-api/class-wp-html-open-elements.php | 17 ++- .../html-api/class-wp-html-processor.php | 135 ++++++++++++++---- 2 files changed, 122 insertions(+), 30 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-open-elements.php b/src/wp-includes/html-api/class-wp-html-open-elements.php index d1585cdea5bf5..66b7561d0e686 100644 --- a/src/wp-includes/html-api/class-wp-html-open-elements.php +++ b/src/wp-includes/html-api/class-wp-html-open-elements.php @@ -470,12 +470,23 @@ public function remove_node( $token ) { * see WP_HTML_Open_Elements::walk_up(). * * @since 6.4.0 + * @since 6.7.0 Accepts $below_this_node to start traversal below a given node, if it exists. + * + * @param ?WP_HTML_Token $below_this_node Start traversing below this node, if provided and if the node exists. */ - public function walk_down() { - $count = count( $this->stack ); + public function walk_down( $below_this_node = null ) { + $has_found_node = null === $below_this_node; + $count = count( $this->stack ); for ( $i = 0; $i < $count; $i++ ) { - yield $this->stack[ $i ]; + $node = $this->stack[ $i ]; + + if ( ! 
$has_found_node ) { + $has_found_node = $node === $below_this_node; + continue; + } + + yield $node; } } diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index f3be6c67064b9..cf352ca82056e 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -3103,7 +3103,7 @@ private function run_adoption_agency_algorithm() { if ( // > If the current node is an HTML element whose tag name is subject - $current_node && $subject === $current_node->node_name && + isset( $current_node ) && $subject === $current_node->node_name && // > the current node is not in the list of active formatting elements ! $this->state->active_formatting_elements->contains_node( $current_node ) ) { @@ -3111,12 +3111,7 @@ private function run_adoption_agency_algorithm() { return; } - $outer_loop_counter = 0; - while ( $budget-- > 0 ) { - if ( $outer_loop_counter++ >= 8 ) { - return; - } - + for ( $outer_loop_counter = 0; $outer_loop_counter < 8; $outer_loop_counter++ ) { /* * > Let formatting element be the last element in the list of active formatting elements that: * > - is between the end of the list and the last marker in the list, @@ -3137,8 +3132,35 @@ private function run_adoption_agency_algorithm() { // > If there is no such element, then return and instead act as described in the "any other end tag" entry above. if ( null === $formatting_element ) { - $this->last_error = self::ERROR_UNSUPPORTED; - throw new WP_HTML_Unsupported_Exception( 'Cannot run adoption agency when "any other end tag" is required.' ); + /* + * > Any other end tag + */ + + /* + * Find the corresponding tag opener in the stack of open elements, if + * it exists before reaching a special element, which provides a kind + * of boundary in the stack. For example, a `</custom-tag>` should not + * close anything beyond its containing `P` or `DIV` element. 
+ */ + foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { + if ( $subject === $node->node_name ) { + break; + } + + if ( self::is_special( $node->node_name ) ) { + // This is a parse error, ignore the token. + return; + } + } + + $this->generate_implied_end_tags( $subject ); + + foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { + $this->state->stack_of_open_elements->pop(); + if ( $node === $item ) { + return; + } + } } // > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return. @@ -3152,22 +3174,16 @@ private function run_adoption_agency_algorithm() { return; } + /* + * > If formatting element is not the current node, this is a parse error. (But do not return.) + */ + /* * > Let furthest block be the topmost node in the stack of open elements that is lower in the stack * > than formatting element, and is an element in the special category. There might not be one. */ - $is_above_formatting_element = true; - $furthest_block = null; - foreach ( $this->state->stack_of_open_elements->walk_down() as $item ) { - if ( $is_above_formatting_element && $formatting_element->bookmark_name !== $item->bookmark_name ) { - continue; - } - - if ( $is_above_formatting_element ) { - $is_above_formatting_element = false; - continue; - } - + $furthest_block = null; + foreach ( $this->state->stack_of_open_elements->walk_down( $formatting_element ) as $item ) { if ( self::is_special( $item->node_name ) ) { $furthest_block = $item; break; @@ -3183,19 +3199,84 @@ private function run_adoption_agency_algorithm() { foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { $this->state->stack_of_open_elements->pop(); - if ( $formatting_element->bookmark_name === $item->bookmark_name ) { + if ( $formatting_element === $item ) { $this->state->active_formatting_elements->remove_node( $formatting_element ); return; } } } - $this->last_error = 
self::ERROR_UNSUPPORTED; - throw new WP_HTML_Unsupported_Exception( 'Cannot extract common ancestor in adoption agency algorithm.' ); - } + /* + * > Let common ancestor be the element immediately above formatting element in the stack of open elements. + */ + $common_ancestor = null; + foreach ( $this->state->stack_of_open_elements->walk_up( $formatting_element ) as $item ) { + $common_ancestor = $item; + break; + } - $this->last_error = self::ERROR_UNSUPPORTED; - throw new WP_HTML_Unsupported_Exception( 'Cannot run adoption agency when looping required.' ); + /* + * Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list. + */ + $formatting_element_index = 0; + foreach ( $this->state->active_formatting_elements->walk_down() as $item ) { + if ( $formatting_element === $item ) { + break; + } + + ++$formatting_element_index; + } + + /* + * > Let node and last node be furthest block. + */ + $node = $furthest_block; + $last_node = $furthest_block; + + $inner_loop_counter = 0; + while ( $budget-- > 0 ) { + ++$inner_loop_counter; + + if ( $this->state->stack_of_open_elements->contains_node( $node ) ) { + foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) { + $node = $item; + break; + } + } else { + $this->bail( 'Cannot adjust node pointer above removed node.' ); + } + + if ( $formatting_element === $node ) { + break; + } + + if ( $inner_loop_counter > 3 && $this->state->active_formatting_elements->contains_node( $node ) ) { + $this->state->active_formatting_elements->remove_node( $node ); + } + + if ( ! 
$this->state->active_formatting_elements->contains_node( $node ) ) { + $this->state->stack_of_open_elements->remove_node( $node ); + continue; + } + + /* + * > Create an element for the token for which the element node was created, + * in the HTML namespace, with common ancestor as the intended parent; + * replace the entry for node in the list of active formatting elements + * with an entry for the new element, replace the entry for node in the + * stack of open elements with an entry for the new element, and let node + * be the new element. + */ + $this->bail( 'Cannot create and reference new element for which no token exists.' ); + } + + /* + * > Insert whatever last node ended up being in the previous step at the appropriate + * > place for inserting a node, but using common ancestor as the override target. + */ + + $this->bail( 'Cannot create and reference new element for which no token exists.' ); + } } /** From 2b2d6fe93b2af0395a46bbfe3b51ad32d780d3b9 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sat, 6 Jul 2024 01:57:45 -0700 Subject: [PATCH 04/11] HTML API: Simplify breadcrumb accounting. Since the HTML Processor started visiting all nodes in a document, both real and virtual, the breadcrumb accounting became a bit complicated and it's not entirely clear that it is fully reliable. In this patch the breadcrumbs are rebuilt separately from the stack of open elements in order to eliminate the problem of the stateful stack interactions and the post-hoc event queue. Breadcrumbs are greatly simplified as a result, and more verifiably correct, in this construction. 
--- .../html-api/class-wp-html-processor.php | 126 +++++++----------- .../html-api/wpHtmlProcessorSemanticRules.php | 11 +- 2 files changed, 61 insertions(+), 76 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 588d2fbe7d7c9..8c75bd3f06b21 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -211,6 +211,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ private $element_queue = array(); + /** + * Stores the current breadcrumbs. + * + * @since 6.7.0 + * + * @var string[] + */ + private $breadcrumbs = array(); + /** * Current stack event, if set, representing a matched token. * @@ -310,8 +319,8 @@ public static function create_fragment( $html, $context = '', $encoding = false ); - $processor->state->stack_of_open_elements->push( $context_node ); $processor->context_node = $context_node; + $processor->breadcrumbs = array( 'HTML', $context_node->node_name ); return $processor; } @@ -523,44 +532,46 @@ public function next_token() { return false; } - if ( 'done' !== $this->has_seen_context_node && 0 === count( $this->element_queue ) && ! $this->step() ) { - while ( 'context-node' !== $this->state->stack_of_open_elements->current_node()->bookmark_name && $this->state->stack_of_open_elements->pop() ) { - continue; - } - $this->has_seen_context_node = 'done'; - return $this->next_token(); + /* + * Prime the events if there are none. + * + * @todo In some cases, probably related to the adoption agency + * algorithm, this call to step() doesn't create any new + * events. Calling it again creates them. Figure out why + * this is and if it's inherent or if it's a bug. Looping + * until there are events or until there are no more + * tokens works in the meantime and isn't obviously wrong. 
+ */ + while ( empty( $this->element_queue ) && $this->step() ) { + continue; } + // Process the next event on the queue. $this->current_element = array_shift( $this->element_queue ); - while ( isset( $this->context_node ) && ! $this->has_seen_context_node ) { - if ( isset( $this->current_element ) ) { - if ( $this->context_node === $this->current_element->token && WP_HTML_Stack_Event::PUSH === $this->current_element->operation ) { - $this->has_seen_context_node = true; - return $this->next_token(); - } - } - $this->current_element = array_shift( $this->element_queue ); + if ( ! isset( $this->current_element ) ) { + return false; } - if ( ! isset( $this->current_element ) ) { - if ( 'done' === $this->has_seen_context_node ) { - return false; - } else { - return $this->next_token(); - } + $is_pop = WP_HTML_Stack_Event::POP === $this->current_element->operation; + + /* + * The root node only exists in the fragment parser, and closing it + * indicates that the parse is complete. Stop before popping it from + * the breadcrumbs. + */ + if ( 'root-node' === $this->current_element->token->bookmark_name ) { + return ! $is_pop && $this->next_token(); } - if ( isset( $this->context_node ) && WP_HTML_Stack_Event::POP === $this->current_element->operation && $this->context_node === $this->current_element->token ) { - $this->element_queue = array(); - $this->current_element = null; - return false; + // Adjust the breadcrumbs for this event. + if ( $is_pop ) { + array_pop( $this->breadcrumbs ); + } else { + $this->breadcrumbs[] = $this->current_element->token->node_name; } // Avoid sending close events for elements which don't expect a closing. - if ( - WP_HTML_Stack_Event::POP === $this->current_element->operation && - ! static::expects_closer( $this->current_element->token ) - ) { + if ( $is_pop && ! 
static::expects_closer( $this->current_element->token ) ) { return $this->next_token(); } @@ -643,10 +654,11 @@ public function matches_breadcrumbs( $breadcrumbs ) { return false; } - foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { + for ( $i = count( $this->breadcrumbs ) - 1; $i >= 0; $i-- ) { + $node = $this->breadcrumbs[ $i ]; $crumb = strtoupper( current( $breadcrumbs ) ); - if ( '*' !== $crumb && $node->node_name !== $crumb ) { + if ( '*' !== $crumb && $node !== $crumb ) { return false; } @@ -862,46 +874,7 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL. */ public function get_breadcrumbs() { - $breadcrumbs = array(); - - foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) { - $breadcrumbs[] = $stack_item->node_name; - } - - if ( ! $this->is_virtual() ) { - return $breadcrumbs; - } - - foreach ( $this->element_queue as $queue_item ) { - if ( $this->current_element->token->bookmark_name === $queue_item->token->bookmark_name ) { - break; - } - - if ( 'context-node' === $queue_item->token->bookmark_name ) { - break; - } - - if ( 'real' === $queue_item->provenance ) { - break; - } - - if ( WP_HTML_Stack_Event::PUSH === $queue_item->operation ) { - $breadcrumbs[] = $queue_item->token->node_name; - } else { - array_pop( $breadcrumbs ); - } - } - - if ( null !== parent::get_token_name() && ! parent::is_tag_closer() ) { - array_pop( $breadcrumbs ); - } - - // Add the virtual node we're at. - if ( WP_HTML_Stack_Event::PUSH === $this->current_element->operation ) { - $breadcrumbs[] = $this->current_element->token->node_name; - } - - return $breadcrumbs; + return $this->breadcrumbs; } /** @@ -930,9 +903,7 @@ public function get_breadcrumbs() { * @return int Nesting-depth of current location in the document. */ public function get_current_depth() { - return $this->is_virtual() - ? 
count( $this->get_breadcrumbs() ) - : $this->state->stack_of_open_elements->count(); + return count( $this->breadcrumbs ); } /** @@ -2552,7 +2523,6 @@ public function seek( $bookmark_name ) { ? $this->bookmarks[ $this->state->current_token->bookmark_name ]->start : 0; $bookmark_starts_at = $this->bookmarks[ $actual_bookmark_name ]->start; - $bookmark_length = $this->bookmarks[ $actual_bookmark_name ]->length; $direction = $bookmark_starts_at > $processor_started_at ? 'forward' : 'backward'; /* @@ -2610,6 +2580,12 @@ public function seek( $bookmark_name ) { $this->state->frameset_ok = true; $this->element_queue = array(); $this->current_element = null; + + if ( isset( $this->context_node ) ) { + $this->breadcrumbs = array_slice( $this->breadcrumbs, 0, 2 ); + } else { + $this->breadcrumbs = array(); + } } // When moving forwards, reparse the document until reaching the same location as the original bookmark. diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php b/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php index 717276935a780..adce614506429 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php @@ -387,7 +387,16 @@ public function test_in_body_any_other_end_tag_with_unclosed_non_special_element $this->assertSame( 'CODE', $processor->get_tag(), "Expected to start test on CODE element but found {$processor->get_tag()} instead." ); $this->assertSame( array( 'HTML', 'BODY', 'DIV', 'SPAN', 'CODE' ), $processor->get_breadcrumbs(), 'Failed to produce expected DOM nesting.' ); - $this->assertTrue( $processor->next_token(), 'Failed to advance past CODE tag to expected SPAN closer.' ); + $this->assertTrue( + $processor->next_tag( + array( + 'tag_name' => 'SPAN', + 'tag_closers' => 'visit', + ) + ), + 'Failed to advance past CODE tag to expected SPAN closer.' 
+ ); + $this->assertSame( 'SPAN', $processor->get_tag() ); $this->assertTrue( $processor->is_tag_closer(), 'Expected to find closing SPAN, but found opener instead.' ); $this->assertSame( array( 'HTML', 'BODY', 'DIV' ), $processor->get_breadcrumbs(), 'Failed to advance past CODE tag to expected DIV opener.' ); From 6962fa29654f9a85d5c3136cd5ca458dceb8c5ad Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sat, 6 Jul 2024 16:25:15 -0700 Subject: [PATCH 05/11] HTML API: Expand Unsupported class and make it available for debugging. The HTML Processor internally throws an exception when it reaches HTML that it knows it cannot process, but this exception is not made available to calling code. It can be useful to extract more knowledge about why it gave up, especially for debugging purposes. In this patch, more context is added to the WP_HTML_Unsupported_Exception and the last exception is made available to calling code, if it asks. --- .../html-api/class-wp-html-processor.php | 65 +++++++++++++++++++ .../class-wp-html-unsupported-exception.php | 18 ++++- 2 files changed, 82 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 8c75bd3f06b21..f3be6c67064b9 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -188,6 +188,17 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ private $last_error = null; + /** + * Stores context for why the parser bailed on unsupported HTML, if it did. + * + * @see self::get_unsupported_exception + * + * @since 6.7.0 + * + * @var WP_HTML_Unsupported_Exception|null + */ + private $unsupported_exception = null; + /** * Releases a bookmark when PHP garbage-collects its wrapping WP_HTML_Token instance. * @@ -384,6 +395,45 @@ function ( WP_HTML_Token $token ) { }; } + /** + * Stops the parser and terminates its execution when encountering unsupported markup. 
+ * + * @throws WP_HTML_Unsupported_Exception Halts execution of the parser. + * + * @since 6.7.0 + * + * @param string $message Explains support is missing in order to parse the current node. + * + * @return mixed + */ + private function bail( string $message ) { + $here = $this->bookmarks[ $this->state->current_token->bookmark_name ]; + $token = substr( $this->html, $here->start, $here->length ); + + $open_elements = array(); + foreach ( $this->state->stack_of_open_elements->stack as $item ) { + $open_elements[] = $item->node_name; + } + + $active_formats = array(); + foreach ( $this->state->active_formatting_elements->walk_down() as $item ) { + $active_formats[] = $item->node_name; + } + + $this->last_error = self::ERROR_UNSUPPORTED; + + $this->unsupported_exception = new WP_HTML_Unsupported_Exception( + $message, + $this->state->current_token->node_name, + $here->start, + $token, + $open_elements, + $active_formats + ); + + throw $this->unsupported_exception; + } + /** * Returns the last error, if any. * @@ -411,6 +461,21 @@ public function get_last_error() { return $this->last_error; } + /** + * Returns context for why the parser aborted due to unsupported HTML, if it did. + * + * This is meant for debugging purposes, not for production use. + * + * @since 6.7.0 + * + * @see self::$unsupported_exception + * + * @return WP_HTML_Unsupported_Exception|null + */ + public function get_unsupported_exception() { + return $this->unsupported_exception; + } + /** * Finds the next tag matching the $query. 
* diff --git a/src/wp-includes/html-api/class-wp-html-unsupported-exception.php b/src/wp-includes/html-api/class-wp-html-unsupported-exception.php index 6e7228670bf8b..1a29714727623 100644 --- a/src/wp-includes/html-api/class-wp-html-unsupported-exception.php +++ b/src/wp-includes/html-api/class-wp-html-unsupported-exception.php @@ -1,4 +1,4 @@ -token_name = $token_name; + $this->token_at = $token_at; + $this->token = $token; + + $this->stack_of_open_elements = $stack_of_open_elements; + $this->active_formatting_elements = $active_formatting_elements; + } } From ab1096fdc53c48e503c274914020e1c11e4529b9 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sat, 6 Jul 2024 16:10:15 -0700 Subject: [PATCH 06/11] HTML API: Implement "reconstruct the active formatting elements" algorithm. As part of work to add more spec support to the HTML API, this patch fills out the active format reconstruction algorithm so that more HTML can be supported in situations requiring that reconstruction, for example, when a formatting element such as an A tag or a CODE tag is implicitly closed. See Core-61576 --- ...ass-wp-html-active-formatting-elements.php | 32 +++++++++ .../html-api/class-wp-html-processor.php | 45 +++++++++++-- .../tests/html-api/wpHtmlProcessor.php | 15 +++-- .../html-api/wpHtmlProcessorBreadcrumbs.php | 66 +++++++++++-------- 4 files changed, 120 insertions(+), 38 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php b/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php index 9f7fee9076243..e45e55e09dd28 100644 --- a/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php +++ b/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php @@ -43,6 +43,22 @@ class WP_HTML_Active_Formatting_Elements { */ private $stack = array(); + /** + * Returns the node at the given index in the list of active formatting elements. 
+ * + * Do not use this method; it is meant to be used only by the HTML Processor. + * + * @since 6.7.0 + * + * @access private + * + * @param int $index Number of nodes from the top node to return. + * @return WP_HTML_Token|null Node at the given index in the stack, if one exists, otherwise null. + */ + public function at( $index ) { + return $this->stack[ $index ]; + } + /** * Reports if a specific node is in the stack of active formatting elements. * @@ -86,6 +102,22 @@ public function current_node() { return $current_node ? $current_node : null; } + /** + * Inserts a "marker" at the end of the list of active formatting elements. + * + * > The markers are inserted when entering applet, object, marquee, + * > template, td, th, and caption elements, and are used to prevent + * > formatting from "leaking" into applet, object, marquee, template, + * > td, th, and caption elements. + * + * @see https://html.spec.whatwg.org/#concept-parser-marker + * + * @since 6.7.0 + */ + public function insert_marker() { + $this->push( new WP_HTML_Token( null, 'marker', false ) ); + } + /** * Pushes a node onto the stack of active formatting elements. * diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index f3be6c67064b9..aa285727b6b4c 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -2864,7 +2864,7 @@ private function generate_implied_end_tags_thoroughly() { * > in the current body, cell, or caption (whichever is youngest) that haven't * > been explicitly closed. * - * @since 6.4.0 + * @since 6.7.0 * * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. * @@ -2873,15 +2873,19 @@ private function generate_implied_end_tags_thoroughly() { * @return bool Whether any formatting elements needed to be reconstructed. 
*/ private function reconstruct_active_formatting_elements() { + $count = $this->state->active_formatting_elements->count(); + /* * > If there are no entries in the list of active formatting elements, then there is nothing * > to reconstruct; stop this algorithm. */ - if ( 0 === $this->state->active_formatting_elements->count() ) { + if ( 0 === $count ) { return false; } - $last_entry = $this->state->active_formatting_elements->current_node(); + // Start at the last node in the list of active formatting elements. + $currently_at = $count - 1; + $last_entry = $this->state->active_formatting_elements->at( $currently_at ); if ( /* @@ -2900,8 +2904,39 @@ private function reconstruct_active_formatting_elements() { return false; } - $this->last_error = self::ERROR_UNSUPPORTED; - throw new WP_HTML_Unsupported_Exception( 'Cannot reconstruct active formatting elements when advancing and rewinding is required.' ); + $entry = $last_entry; + + while ( $currently_at >= 0 ) { + if ( 0 === $currently_at ) { + goto create; + } + $entry = $this->state->active_formatting_elements->at( --$currently_at ); + + /* + * > If entry is neither a marker nor an element that is also in the stack of open elements, + * > go to the step labeled rewind. + */ + if ( 'marker' === $entry->node_name || $this->state->stack_of_open_elements->contains_node( $entry ) ) { + break; + } + } + + advance: + $entry = $this->state->active_formatting_elements->at( ++$currently_at ); + + create: + $this->insert_html_element( $entry ); + + /* + * > Replace the entry for entry in the list with an entry for new element. + * This doesn't need to happen here since no DOM is being created. 
+ */ + + if ( $count - 1 !== $currently_at ) { + goto advance; + } + + return true; } /** diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index b842703a7a135..8294aa5b198b3 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -112,18 +112,23 @@ public function test_clear_to_navigate_after_seeking() { } /** - * Ensures that support is added for reconstructing active formatting elements - * before the HTML Processor handles situations with unclosed formats requiring it. + * Ensures that support is added for reconstructing active formatting elements. * * @ticket 58517 * * @covers WP_HTML_Processor::reconstruct_active_formatting_elements */ - public function test_fails_to_reconstruct_formatting_elements() { - $processor = WP_HTML_Processor::create_fragment( '

One

Two

Three

Four' ); + public function test_reconstructs_formatting_elements() { + $processor = WP_HTML_Processor::create_fragment( '

One

Two

Three

Four' ); $this->assertTrue( $processor->next_tag( 'EM' ), 'Could not find first EM.' ); - $this->assertFalse( $processor->next_tag( 'EM' ), 'Should have aborted before finding second EM as it required reconstructing the first EM.' ); + $this->assertSame( array( 'HTML', 'BODY', 'P', 'EM' ), $processor->get_breadcrumbs(), 'Found incorrect breadcrumbs for first EM.' ); + $this->assertTrue( $processor->next_tag( 'SPAN' ), 'Could not find test span.' ); + $this->assertSame( + array( 'HTML', 'BODY', 'P', 'EM', 'EM', 'SPAN' ), + $processor->get_breadcrumbs(), + 'Found incorrect breadcrumbs for test SPAN; should have created two EMs.' + ); } /** diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php b/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php index 403f40a1da032..5ec846df16fec 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php @@ -219,45 +219,55 @@ public static function data_unsupported_elements() { } /** - * @ticket 58517 - * - * @dataProvider data_unsupported_markup + * Ensures that formats inside unclosed A elements are reconstructed. * - * @param string $html HTML containing unsupported markup. + * @ticket 61576 */ - public function test_fails_when_encountering_unsupported_markup( $html, $description ) { - $processor = WP_HTML_Processor::create_fragment( $html ); - - while ( $processor->next_token() && null === $processor->get_attribute( 'supported' ) ) { - continue; - } + public function test_reconstructs_formatting_from_unclosed_a_elements() { + $processor = WP_HTML_Processor::create_fragment( 'Click Here' ); - $this->assertNull( - $processor->get_last_error(), - 'Bailed on unsupported input before finding supported checkpoint: check test code.' + $processor->next_tag( 'STRONG' ); + $this->assertSame( + array( 'HTML', 'BODY', 'A', 'STRONG' ), + $processor->get_breadcrumbs(), + 'Failed to construct starting breadcrumbs properly.' 
); - $this->assertTrue( $processor->get_attribute( 'supported' ), 'Did not find required supported element.' ); - $processor->next_token(); - $this->assertNotNull( $processor->get_last_error(), "Didn't properly reject unsupported markup: {$description}" ); + $processor->next_tag( 'BIG' ); + $this->assertSame( + array( 'HTML', 'BODY', 'STRONG', 'A', 'BIG' ), + $processor->get_breadcrumbs(), + 'Failed to reconstruct the active formatting elements after an unclosed A element.' + ); } /** - * Data provider. + * Ensures that unclosed A elements are reconstructed. * - * @return array[] + * @ticket 61576 */ - public static function data_unsupported_markup() { - return array( - 'A with formatting following unclosed A' => array( - 'Click Here', - 'Unclosed formatting requires complicated reconstruction.', - ), + public function test_reconstructs_unclosed_a_elements() { + $processor = WP_HTML_Processor::create_fragment( '

' ); - 'A after unclosed A inside DIV' => array( - '
', - 'A is a formatting element, which requires more complicated reconstruction.', - ), + $processor->next_tag( 'DIV' ); + $this->assertSame( + array( 'HTML', 'BODY', 'DIV' ), + $processor->get_breadcrumbs(), + 'Failed to construct breadcrumbs properly - the DIV should have closed the A element.' + ); + + // When the DIV re-opens, it reconstructs an unclosed A, then the A in the text is a second A. + $processor->next_tag( 'A' ); + $this->assertSame( + array( 'HTML', 'BODY', 'DIV', 'A' ), + 'Failed to create proper breadcrumbs for recreated A element.' + ); + + // This is the one that's second in the raw text. + $processor->next_tag( 'A' ); + $this->assertSame( + array( 'HTML', 'BODY', 'DIV', 'A' ), + 'Failed to create proper breadcrumbs for explicit A element - this A should have closed the reconstructed A.' ); } From 22856d320784a0c6b6ca751f08acb42113f2b26e Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 8 Aug 2024 20:10:57 -0700 Subject: [PATCH 07/11] More iteration --- .../html-api/class-wp-html-processor.php | 166 ++++++++++++++---- 1 file changed, 127 insertions(+), 39 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 4736c958ed280..10cc72125d2a9 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -2360,7 +2360,11 @@ private function step_in_body(): bool { break; case 'A': - $this->run_adoption_agency_algorithm(); + switch ( $this->run_adoption_agency_algorithm() ) { + case 'any-other-end-tag': + goto in_body_any_other_end_tag; + break; + } $this->state->active_formatting_elements->remove_node( $item ); $this->state->stack_of_open_elements->remove_node( $item ); break; @@ -2401,7 +2405,11 @@ private function step_in_body(): bool { if ( $this->state->stack_of_open_elements->has_element_in_scope( 'NOBR' ) ) { // Parse error. 
- $this->run_adoption_agency_algorithm(); + switch ( $this->run_adoption_agency_algorithm() ) { + case 'any-other-end-tag': + goto in_body_any_other_end_tag; + break; + } $this->reconstruct_active_formatting_elements(); } @@ -2426,7 +2434,11 @@ private function step_in_body(): bool { case '-STRONG': case '-TT': case '-U': - $this->run_adoption_agency_algorithm(); + switch ( $this->run_adoption_agency_algorithm() ) { + case 'any-other-end-tag': + goto in_body_any_other_end_tag; + break; + } return true; /* @@ -2762,6 +2774,7 @@ private function step_in_body(): bool { /* * > Any other end tag */ + in_body_any_other_end_tag: /* * Find the corresponding tag opener in the stack of open elements, if @@ -5311,28 +5324,34 @@ public function reset_insertion_mode(): void { * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. * * @see https://html.spec.whatwg.org/#adoption-agency-algorithm + * + * @return string|null Indicates if the calling code should follow up with any actions, + * such as `any-other-end-tag`, otherwise `NULL`. */ - private function run_adoption_agency_algorithm(): void { + private function run_adoption_agency_algorithm(): ?string { $budget = 1000; $subject = $this->get_tag(); $current_node = $this->state->stack_of_open_elements->current_node(); + /* + * > 2. If the current node is an HTML element whose tag name is subject, + * > and the current node is not in the list of active formatting elements, + * > then pop the current node off the stack of open elements and return. + */ if ( - // > If the current node is an HTML element whose tag name is subject - isset( $current_node ) && $subject === $current_node->node_name && - // > the current node is not in the list of active formatting elements + $this->state->stack_of_open_elements->current_node_is( $subject ) && ! 
$this->state->active_formatting_elements->contains_node( $current_node ) ) { $this->state->stack_of_open_elements->pop(); - return; + return null; } - for ( $outer_loop_counter = 0; $outer_loop_counter < 8; $outer_loop_counter++ ) { + for ( $outer_loop_counter = 0; $outer_loop_counter < 8; ++$outer_loop_counter ) { /* - * > Let formatting element be the last element in the list of active formatting elements that: - * > - is between the end of the list and the last marker in the list, - * > if any, or the start of the list otherwise, - * > - and has the tag name subject. + * > 3. Let formatting element be the last element in the list of active formatting elements that: + * > - is between the end of the list and the last marker in the list, + * > if any, or the start of the list otherwise, + * > - and has the tag name subject. */ $formatting_element = null; foreach ( $this->state->active_formatting_elements->walk_up() as $item ) { @@ -5346,30 +5365,38 @@ private function run_adoption_agency_algorithm(): void { } } - // > If there is no such element, then return and instead act as described in the "any other end tag" entry above. + /* + * > If there is no such element, then return and instead act as + * > described in the "any other end tag" entry above. + */ if ( null === $formatting_element ) { - $this->last_error = self::ERROR_UNSUPPORTED; - $this->bail( 'Cannot run adoption agency when "any other end tag" is required.' ); + return 'any-other-end-tag'; } - // > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return. + /* + * > 4. If formatting element is not in the stack of open elements, then + * > this is a parse error; remove the element from the list, and return. + */ if ( ! 
$this->state->stack_of_open_elements->contains_node( $formatting_element ) ) { $this->state->active_formatting_elements->remove_node( $formatting_element ); - return; + return null; } - // > If formatting element is in the stack of open elements, but the element is not in scope, then this is a parse error; return. + /* + * > 5. If formatting element is in the stack of open elements, but the element + * > is not in scope, then this is a parse error; return. + */ if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $formatting_element->node_name ) ) { - return; + return null; } /* - * > If formatting element is not the current node, this is a parse error. (But do not return.) + * > 6. If formatting element is not the current node, this is a parse error. (But do not return.) */ /* - * > Let furthest block be the topmost node in the stack of open elements that is lower in the stack - * > than formatting element, and is an element in the special category. There might not be one. + * > 7. Let furthest block be the topmost node in the stack of open elements that is lower in the stack + * > than formatting element, and is an element in the special category. There might not be one. */ $furthest_block = null; foreach ( $this->state->stack_of_open_elements->walk_down( $formatting_element ) as $item ) { @@ -5380,9 +5407,9 @@ private function run_adoption_agency_algorithm(): void { } /* - * > If there is no furthest block, then the UA must first pop all the nodes from the bottom of the - * > stack of open elements, from the current node up to and including formatting element, then - * > remove formatting element from the list of active formatting elements, and finally return. + * > 8. 
If there is no furthest block, then the UA must first pop all the nodes from the bottom of + * > the stack of open elements, from the current node up to and including formatting element, + * > then remove formatting element from the list of active formatting elements, and finally return. */ if ( null === $furthest_block ) { foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { @@ -5390,13 +5417,14 @@ private function run_adoption_agency_algorithm(): void { if ( $formatting_element === $item ) { $this->state->active_formatting_elements->remove_node( $formatting_element ); - return; + return null; } } } /* - * > Let common ancestor be the element immediately above formatting element in the stack of open elements. + * > 9. Let common ancestor be the element immediately above + * > formatting element in the stack of open elements. */ $common_ancestor = null; foreach ( $this->state->stack_of_open_elements->walk_up( $formatting_element ) as $item ) { @@ -5405,7 +5433,8 @@ private function run_adoption_agency_algorithm(): void { } /* - * Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list. + * > 10. Let a bookmark note the position of formatting element in the list of active + * > formatting elements relative to the elements on either side of it in the list. */ $formatting_element_index = 0; foreach ( $this->state->active_formatting_elements->walk_down() as $item ) { @@ -5417,15 +5446,24 @@ private function run_adoption_agency_algorithm(): void { } /* - * > Let node and last node be furthest block. + * > 11. Let node and last node be furthest block. */ $node = $furthest_block; $last_node = $furthest_block; $inner_loop_counter = 0; while ( $budget-- > 0 ) { + /* + * > 1. Increment innerLoopCounter by 1. + */ ++$inner_loop_counter; + /* + * > 2. 
Let node be the element immediately above node in the stack of open elements, + * > or if node is no longer in the stack of open elements (e.g. because it got + * > removed by this algorithm), the element that was immediately above node in + * > the stack of open elements before node was removed. + */ if ( $this->state->stack_of_open_elements->contains_node( $node ) ) { foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) { $node = $item; @@ -5435,36 +5473,86 @@ private function run_adoption_agency_algorithm(): void { $this->bail( 'Cannot adjust node pointer above removed node.' ); } + /* + * > 3. If node is formattingElement, then break. + */ if ( $formatting_element === $node ) { break; } + /* + * > 4. If innerLoopCounter is greater than 3 and node is in the list of active formatting + * > elements, then remove node from the list of active formatting elements. + */ if ( $inner_loop_counter > 3 && $this->state->active_formatting_elements->contains_node( $node ) ) { $this->state->active_formatting_elements->remove_node( $node ); } + /* + * > 5. If node is not in the list of active formatting elements, then remove node from + * > the stack of open elements and continue. + */ if ( ! $this->state->active_formatting_elements->contains_node( $node ) ) { $this->state->stack_of_open_elements->remove_node( $node ); continue; } /* - * > Create an element for the token for which the element node was created, - * in the HTML namespace, with common ancestor as the intended parent; - * replace the entry for node in the list of active formatting elements - * with an entry for the new element, replace the entry for node in the - * stack of open elements with an entry for the new element, and let node - * be the new element. + * > 6. 
Create an element for the token for which the element node was created, + * > in the HTML namespace, with common ancestor as the intended parent; + * > replace the entry for node in the list of active formatting elements + * > with an entry for the new element, replace the entry for node in the + * > stack of open elements with an entry for the new element, and let node + * > be the new element. */ $this->bail( 'Cannot create and reference new element for which no token exists.' ); + + /* + * > 7. If last node is furthestBlock, then move the aforementioned bookmark to + * > be immediately after the new node in the list of active formatting elements. + */ + + /* + * > 8. Append lastNode to node. + */ + + /* + * > 9. Set lastNode to node. + */ + $last_node = $node; } /* - * > Insert whatever last node ended up being in the previous step at the appropriate - * > palce for inserting a node, but using common ancestor as the override target. + * > 14. Insert whatever last node ended up being in the previous step at the appropriate + * > place for inserting a node, but using common ancestor as the override target. */ - $this->bail( 'Cannot create and reference new element for which no token exists.' ); + + /* + * > 15. Create an element for the token for which formattingElement was created, + * > in the HTML namespace, with furthestBlock as the intended parent. + */ + + /* + * > 16. Take all of the child nodes of furthestBlock and append them to the element + * > created in the last step. + */ + + /* + * > 17. Append that new element to furthestBlock. + */ + + /* + * > 18. Remove formattingElement from the list of active formatting elements, + * > and insert the new element into the list of active formatting elements + * > at the position of the aforementioned bookmark. + */ + + /* + * > 19. 
Remove formattingElement from the stack of open elements, and insert the + * > new element into the stack of open elements immediately below the position + * > of furthestBlock in that stack. + */ } } From 398ce3797952c3b5f2f79eed36d78e722241dc65 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 9 Aug 2024 00:12:47 -0700 Subject: [PATCH 08/11] Bail when needing to ignore token during adoption. --- .../html-api/class-wp-html-processor.php | 48 ++++++++++++------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 10cc72125d2a9..6b08f20d089c5 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -2360,10 +2360,8 @@ private function step_in_body(): bool { break; case 'A': - switch ( $this->run_adoption_agency_algorithm() ) { - case 'any-other-end-tag': - goto in_body_any_other_end_tag; - break; + if ( 'ignore' === $this->run_adoption_agency_algorithm() ) { + $this->bail( 'Cannot ignore token after running adoption agency algorithm.' ); } $this->state->active_formatting_elements->remove_node( $item ); $this->state->stack_of_open_elements->remove_node( $item ); @@ -2405,10 +2403,8 @@ private function step_in_body(): bool { if ( $this->state->stack_of_open_elements->has_element_in_scope( 'NOBR' ) ) { // Parse error. - switch ( $this->run_adoption_agency_algorithm() ) { - case 'any-other-end-tag': - goto in_body_any_other_end_tag; - break; + if ( 'ignore' === $this->run_adoption_agency_algorithm() ) { + $this->bail( 'Cannot ignore token after running adoption agency algorithm.' 
); } $this->reconstruct_active_formatting_elements(); } @@ -2434,10 +2430,8 @@ private function step_in_body(): bool { case '-STRONG': case '-TT': case '-U': - switch ( $this->run_adoption_agency_algorithm() ) { - case 'any-other-end-tag': - goto in_body_any_other_end_tag; - break; + if ( 'ignore' === $this->run_adoption_agency_algorithm() ) { + $this->bail( 'Cannot ignore token after running adoption agency algorithm.' ); } return true; @@ -2774,7 +2768,6 @@ private function step_in_body(): bool { /* * > Any other end tag */ - in_body_any_other_end_tag: /* * Find the corresponding tag opener in the stack of open elements, if @@ -5324,9 +5317,6 @@ public function reset_insertion_mode(): void { * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. * * @see https://html.spec.whatwg.org/#adoption-agency-algorithm - * - * @return string|null Indicates if the calling code should follow up with any actions, - * such as `any-other-end-tag`, otherwise `NULL`. */ private function run_adoption_agency_algorithm(): ?string { $budget = 1000; @@ -5339,7 +5329,8 @@ private function run_adoption_agency_algorithm(): ?string { * > then pop the current node off the stack of open elements and return. */ if ( - $this->state->stack_of_open_elements->current_node_is( $subject ) && + 'html' === $current_node->namespace && + $subject === $current_node->node_name && ! $this->state->active_formatting_elements->contains_node( $current_node ) ) { $this->state->stack_of_open_elements->pop(); @@ -5370,7 +5361,28 @@ private function run_adoption_agency_algorithm(): ?string { * > described in the "any other end tag" entry above. */ if ( null === $formatting_element ) { - return 'any-other-end-tag'; + /* + * These steps are copied here from above. This may remove the node + * or ignore it, meaning the following code must respect that. 
+ */ + foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { + if ( 'html' === $node->namespace && $subject === $node->node_name ) { + break; + } + + if ( self::is_special( $node ) ) { + return 'ignore'; + } + } + + $this->generate_implied_end_tags( $subject ); + foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { + $this->state->stack_of_open_elements->pop(); + + if ( $node === $item ) { + return null; + } + } } /* From 374cba3ae0920ba05426acb5c5ea7d3b4396da2b Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 9 Aug 2024 00:33:04 -0700 Subject: [PATCH 09/11] Continue iterating --- ...ass-wp-html-active-formatting-elements.php | 6 +- .../html-api/class-wp-html-processor.php | 80 ++++++++++++------- 2 files changed, 53 insertions(+), 33 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php b/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php index b5547c4ff98f3..2428f9a88e8f1 100644 --- a/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php +++ b/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php @@ -44,7 +44,7 @@ class WP_HTML_Active_Formatting_Elements { private $stack = array(); /** - * Returns the node at the given index in the list of active formatting elements. + * Returns the node at the given 1-offset index in the list of active formatting elements. * * Do not use this method; it is meant to be used only by the HTML Processor. * @@ -55,8 +55,8 @@ class WP_HTML_Active_Formatting_Elements { * @param int $index Number of nodes from the top node to return. * @return WP_HTML_Token|null Node at the given index in the stack, if one exists, otherwise null. 
*/ - public function at( $index ) { - return $this->stack[ $index ]; + public function at( $nth ) { + return $this->stack[ $nth - 1 ]; } /** diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 24e3d01340c0b..cadd86b8aa240 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -5093,63 +5093,83 @@ private function get_adjusted_current_node(): ?WP_HTML_Token { private function reconstruct_active_formatting_elements(): bool { $count = $this->state->active_formatting_elements->count(); /* - * > If there are no entries in the list of active formatting elements, then there is nothing - * > to reconstruct; stop this algorithm. + * > 1. If there are no entries in the list of active formatting elements, + * > then there is nothing to reconstruct; stop this algorithm. */ if ( 0 === $count ) { return false; } - // Start at the last node in the list of active formatting elements. - $currently_at = $count - 1; + $currently_at = $count; $last_entry = $this->state->active_formatting_elements->at( $currently_at ); + /* + * > 2. If the last (most recently added) entry in the list of active formatting + * > elements is a marker, or if it is an element that is in the stack of open + * > elements, then there is nothing to reconstruct; stop this algorithm. + */ if ( - - /* - * > If the last (most recently added) entry in the list of active formatting elements is a marker; - * > stop this algorithm. - */ 'marker' === $last_entry->node_name || - - /* - * > If the last (most recently added) entry in the list of active formatting elements is an - * > element that is in the stack of open elements, then there is nothing to reconstruct; - * > stop this algorithm. - */ $this->state->stack_of_open_elements->contains_node( $last_entry ) ) { return false; } + /* + * > 3. 
Let entry be the last (most recently added) element + * > in the list of active formatting elements. + */ $entry = $last_entry; - while ( $currently_at >= 0 ) { - if ( 0 === $currently_at ) { - goto create; - } - $entry = $this->state->active_formatting_elements->at( --$currently_at ); + /* + * > 4. Rewind: If there are no entries before entry in the list of active + * > formatting elements, then jump to the step labeled create. + */ + rewind: + if ( 1 === $currently_at ) { + goto create; + } - /* - * > If entry is neither a marker nor an element that is also in the stack of open elements, - * > go to the step labeled rewind. - */ - if ( 'marker' === $entry->node_name || $this->state->stack_of_open_elements->contains_node( $entry ) ) { - break; - } + /* + * > 5. Let entry be the entry one earlier than entry + * > in the list of active formatting elements. + */ + $entry = $this->state->active_formatting_elements->at( --$currently_at ); + + /* + * > 6. If entry is neither a marker nor an element that is also in + * > the stack of open elements, go to the step labeled rewind. + */ + if ( + 'marker' !== $entry->node_name && + ! $this->state->stack_of_open_elements->contains_node( $entry ) + ) { + goto rewind; } + /* + * > 7. Advance: Let entry be the element one later than entry + * > in the list of active formatting elements. + */ advance: $entry = $this->state->active_formatting_elements->at( ++$currently_at ); + /* + * > 8. Create: Insert an HTML element for the token for which the + * > element entry was created, to obtain new element. + */ create: $this->insert_html_element( $entry ); /* - * > Replace the entry for entry in the list with an entry for new element. - * This doesn't need to happen here since no DOM is being created. + * > 9. Replace the entry for entry in the list with an entry for new element. + * This doesn't need to happen here since no DOM is being created. */ - if ( $count - 1 !== $currently_at ) { + /* + * > 10. 
If the entry for new element in the list of active formatting elements + * > is not the last entry in the list, return to the step labeled advance. + */ + if ( $count !== $currently_at ) { goto advance; } From d7dac5e0b3f96358864b165f2de1f9a582e39812 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 2 Sep 2024 11:36:54 -0500 Subject: [PATCH 10/11] Goto instead of ignore --- .../html-api/class-wp-html-processor.php | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 6b08f20d089c5..7c8f7b6cfe17a 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -2357,15 +2357,15 @@ private function step_in_body(): bool { foreach ( $this->state->active_formatting_elements->walk_up() as $item ) { switch ( $item->node_name ) { case 'marker': - break; + break 2; case 'A': - if ( 'ignore' === $this->run_adoption_agency_algorithm() ) { - $this->bail( 'Cannot ignore token after running adoption agency algorithm.' ); + if ( 'any-other-end-tag' === $this->run_adoption_agency_algorithm() ) { + goto in_body_any_other_end_tag; } $this->state->active_formatting_elements->remove_node( $item ); $this->state->stack_of_open_elements->remove_node( $item ); - break; + break 2; } } @@ -2403,8 +2403,8 @@ private function step_in_body(): bool { if ( $this->state->stack_of_open_elements->has_element_in_scope( 'NOBR' ) ) { // Parse error. - if ( 'ignore' === $this->run_adoption_agency_algorithm() ) { - $this->bail( 'Cannot ignore token after running adoption agency algorithm.' 
); + if ( 'any-other-end-tag' === $this->run_adoption_agency_algorithm() ) { + goto in_body_any_other_end_tag; } $this->reconstruct_active_formatting_elements(); } @@ -2424,14 +2424,15 @@ private function step_in_body(): bool { case '-EM': case '-FONT': case '-I': + case '-NOBR': case '-S': case '-SMALL': case '-STRIKE': case '-STRONG': case '-TT': case '-U': - if ( 'ignore' === $this->run_adoption_agency_algorithm() ) { - $this->bail( 'Cannot ignore token after running adoption agency algorithm.' ); + if ( 'any-other-end-tag' === $this->run_adoption_agency_algorithm() ) { + goto in_body_any_other_end_tag; } return true; @@ -2768,6 +2769,7 @@ private function step_in_body(): bool { /* * > Any other end tag */ + in_body_any_other_end_tag: /* * Find the corresponding tag opener in the stack of open elements, if @@ -5361,6 +5363,7 @@ private function run_adoption_agency_algorithm(): ?string { * > described in the "any other end tag" entry above. */ if ( null === $formatting_element ) { + return 'any-other-end-tag'; /* * These steps are copied here from above. This may remove the node * or ignore it, meaning the following code must respect that. From cabe36ea0d362f734fa8e4f181455b415977ed55 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 2 Sep 2024 11:37:44 -0500 Subject: [PATCH 11/11] Remove dead code --- .../html-api/class-wp-html-processor.php | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 7c8f7b6cfe17a..feeaec5058549 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -5364,28 +5364,6 @@ private function run_adoption_agency_algorithm(): ?string { */ if ( null === $formatting_element ) { return 'any-other-end-tag'; - /* - * These steps are copied here from above. This may remove the node - * or ignore it, meaning the following code must respect that. 
- */ - foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { - if ( 'html' === $node->namespace && $subject === $node->node_name ) { - break; - } - - if ( self::is_special( $node ) ) { - return 'ignore'; - } - } - - $this->generate_implied_end_tags( $subject ); - foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { - $this->state->stack_of_open_elements->pop(); - - if ( $node === $item ) { - return null; - } - } } /*