Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 101 additions & 46 deletions src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,10 @@ public function get_last_error() {
public function next_tag( $query = null ) {
if ( null === $query ) {
while ( $this->step() ) {
if ( '#tag' !== $this->get_token_type() ) {
continue;
}

if ( ! $this->is_tag_closer() ) {
return true;
}
Expand All @@ -384,6 +388,10 @@ public function next_tag( $query = null ) {

if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) {
while ( $this->step() ) {
if ( '#tag' !== $this->get_token_type() ) {
continue;
}

if ( ! $this->is_tag_closer() ) {
return true;
}
Expand All @@ -405,6 +413,10 @@ public function next_tag( $query = null ) {
$match_offset = isset( $query['match_offset'] ) ? (int) $query['match_offset'] : 1;

while ( $match_offset > 0 && $this->step() ) {
if ( '#tag' !== $this->get_token_type() ) {
continue;
}

if ( $this->matches_breadcrumbs( $breadcrumbs ) && 0 === --$match_offset ) {
return true;
}
Expand All @@ -428,13 +440,7 @@ public function next_tag( $query = null ) {
* @return bool
*/
public function next_token() {
$found_a_token = parent::next_token();

if ( '#tag' === $this->get_token_type() ) {
$this->step( self::PROCESS_CURRENT_NODE );
}

return $found_a_token;
return $this->step();
}

/**
Expand Down Expand Up @@ -463,10 +469,6 @@ public function next_token() {
* @return bool Whether the currently-matched tag is found at the given nested structure.
*/
public function matches_breadcrumbs( $breadcrumbs ) {
if ( ! $this->get_tag() ) {
return false;
}

// Everything matches when there are zero constraints.
if ( 0 === count( $breadcrumbs ) ) {
return true;
Expand Down Expand Up @@ -529,25 +531,35 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
* is provided in the opening tag, otherwise it expects a tag closer.
*/
$top_node = $this->state->stack_of_open_elements->current_node();
if ( $top_node && self::is_void( $top_node->node_name ) ) {
if (
$top_node && (
// Void elements.
self::is_void( $top_node->node_name ) ||
// Comments, text nodes, and other atomic tokens.
'#' === $top_node->node_name[0] ||
// Doctype declarations.
'html' === $top_node->node_name
)
) {
$this->state->stack_of_open_elements->pop();
}
}

if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
while ( parent::next_token() && '#tag' !== $this->get_token_type() ) {
continue;
}
parent::next_token();
}

// Finish stepping when there are no more tokens in the document.
if ( null === $this->get_tag() ) {
if (
WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state ||
WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state
) {
return false;
}

$this->state->current_token = new WP_HTML_Token(
$this->bookmark_tag(),
$this->get_tag(),
$this->bookmark_token(),
$this->get_token_name(),
$this->has_self_closing_flag(),
$this->release_internal_bookmark_on_destruct
);
Expand Down Expand Up @@ -591,10 +603,6 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
* @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL.
*/
public function get_breadcrumbs() {
if ( ! $this->get_tag() ) {
return null;
}

$breadcrumbs = array();
foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) {
$breadcrumbs[] = $stack_item->node_name;
Expand All @@ -619,11 +627,61 @@ public function get_breadcrumbs() {
* @return bool Whether an element was found.
*/
private function step_in_body() {
$tag_name = $this->get_tag();
$op_sigil = $this->is_tag_closer() ? '-' : '+';
$op = "{$op_sigil}{$tag_name}";
$token_name = $this->get_token_name();
$token_type = $this->get_token_type();
$op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
$op = "{$op_sigil}{$token_name}";

switch ( $op ) {
case '#comment':
case '#funky-comment':
case '#presumptuous-tag':
$this->insert_html_element( $this->state->current_token );
return true;

case '#text':
$this->reconstruct_active_formatting_elements();

$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];

/*
* > A character token that is U+0000 NULL
*
* Any successive sequence of NULL bytes is ignored and won't
* trigger active format reconstruction. Therefore, if the text
* only comprises NULL bytes then the token should be ignored
* here, but if there are any other characters in the stream
* the active formats should be reconstructed.
*/
if (
1 <= $current_token->length &&
"\x00" === $this->html[ $current_token->start ] &&
strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
) {
// Parse error: ignore the token.
return $this->step();
}

/*
* Whitespace-only text does not affect the frameset-ok flag.
* It is probably inter-element whitespace, but it may also
* contain character references which decode only to whitespace.
*/
$text = $this->get_modifiable_text();
if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
$this->state->frameset_ok = false;
}

$this->insert_html_element( $this->state->current_token );
return true;

case 'html':
/*
* > A DOCTYPE token
* > Parse error. Ignore the token.
*/
return $this->step();

/*
* > A start tag whose tag name is "button"
*/
Expand Down Expand Up @@ -711,17 +769,17 @@ private function step_in_body() {
case '-SECTION':
case '-SUMMARY':
case '-UL':
if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name ) ) {
if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) {
// @todo Report parse error.
// Ignore the token.
return $this->step();
}

$this->generate_implied_end_tags();
if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) {
if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) {
// @todo Record parse error: this error doesn't impact parsing.
}
$this->state->stack_of_open_elements->pop_until( $tag_name );
$this->state->stack_of_open_elements->pop_until( $token_name );
return true;

/*
Expand Down Expand Up @@ -783,7 +841,7 @@ private function step_in_body() {

$this->generate_implied_end_tags();

if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) {
if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) {
// @todo Record parse error: this error doesn't impact parsing.
}

Expand All @@ -799,7 +857,7 @@ private function step_in_body() {
case '+LI':
$this->state->frameset_ok = false;
$node = $this->state->stack_of_open_elements->current_node();
$is_li = 'LI' === $tag_name;
$is_li = 'LI' === $token_name;

in_body_list_loop:
/*
Expand Down Expand Up @@ -862,7 +920,7 @@ private function step_in_body() {
* then this is a parse error; ignore the token.
*/
(
'LI' === $tag_name &&
'LI' === $token_name &&
! $this->state->stack_of_open_elements->has_element_in_list_item_scope( 'LI' )
) ||
/*
Expand All @@ -872,8 +930,8 @@ private function step_in_body() {
* parse error; ignore the token.
*/
(
'LI' !== $tag_name &&
! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name )
'LI' !== $token_name &&
! $this->state->stack_of_open_elements->has_element_in_scope( $token_name )
)
) {
/*
Expand All @@ -884,13 +942,13 @@ private function step_in_body() {
return $this->step();
}

$this->generate_implied_end_tags( $tag_name );
$this->generate_implied_end_tags( $token_name );

if ( $tag_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
if ( $token_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
// @todo Indicate a parse error once it's possible. This error does not impact the logic here.
}

$this->state->stack_of_open_elements->pop_until( $tag_name );
$this->state->stack_of_open_elements->pop_until( $token_name );
return true;

/*
Expand Down Expand Up @@ -1043,7 +1101,7 @@ private function step_in_body() {
*
* @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
*/
switch ( $tag_name ) {
switch ( $token_name ) {
case 'APPLET':
case 'BASE':
case 'BASEFONT':
Expand Down Expand Up @@ -1091,7 +1149,7 @@ private function step_in_body() {
case 'TR':
case 'XMP':
$this->last_error = self::ERROR_UNSUPPORTED;
throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." );
throw new WP_HTML_Unsupported_Exception( "Cannot process {$token_name} element." );
}

if ( ! $this->is_tag_closer() ) {
Expand All @@ -1113,7 +1171,7 @@ private function step_in_body() {
* close anything beyond its containing `P` or `DIV` element.
*/
foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) {
if ( $tag_name === $node->node_name ) {
if ( $token_name === $node->node_name ) {
break;
}

Expand All @@ -1123,7 +1181,7 @@ private function step_in_body() {
}
}

$this->generate_implied_end_tags( $tag_name );
$this->generate_implied_end_tags( $token_name );
if ( $node !== $this->state->stack_of_open_elements->current_node() ) {
// @todo Record parse error: this error doesn't impact parsing.
}
Expand All @@ -1142,19 +1200,16 @@ private function step_in_body() {
*/

/**
* Creates a new bookmark for the currently-matched tag and returns the generated name.
* Creates a new bookmark for the currently-matched token and returns the generated name.
*
* @since 6.4.0
* @since 6.5.0 Renamed from bookmark_tag() to bookmark_token().
*
* @throws Exception When unable to allocate requested bookmark.
*
* @return string|false Name of created bookmark, or false if unable to create.
*/
private function bookmark_tag() {
if ( ! $this->get_tag() ) {
return false;
}

private function bookmark_token() {
if ( ! parent::set_bookmark( ++$this->bookmark_counter ) ) {
$this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS;
throw new Exception( 'could not allocate bookmark' );
Expand Down
3 changes: 3 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ public function test_in_body_skips_unexpected_button_closer() {
$this->assertSame( 'DIV', $processor->get_tag(), 'Did not stop at initial DIV tag.' );
$this->assertFalse( $processor->is_tag_closer(), 'Did not find that initial DIV tag is an opener.' );

$processor->step();
$this->assertSame( '#text', $processor->get_token_type(), 'Should have found the text node.' );

/*
* When encountering the BUTTON closing tag, there is no BUTTON in the stack of open elements.
* It should be ignored as there's no BUTTON to close.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
<?php
/**
* Unit tests for the HTML API ensuring proper handling of behaviors related to
* active format reconstruction.
*
* @package WordPress
* @subpackage HTML-API
*
* @since 6.5.0
*
* @group html-api
*
* @coversDefaultClass WP_HTML_Processor
*/
class Tests_HtmlApi_WpHtmlSupportRequiredActiveFormatReconstruction extends WP_UnitTestCase {
	/**
	 * Ensures that active formats are properly reconstructed when visiting text nodes,
	 * verifying that the proper breadcrumbs are maintained when scanning through HTML.
	 *
	 * The fixture `<p><b>One<p><source>Two<source>` is scanned for its two SOURCE
	 * tags. The breadcrumbs at the first SOURCE are expected to contain no B
	 * element. At the second SOURCE, which follows the "Two" text node, the
	 * breadcrumbs are expected to contain B again, unless the processor aborted
	 * with an "unsupported" error before reaching it.
	 *
	 * @ticket 60455
	 */
	public function test_reconstructs_active_formats_on_text_nodes() {
		$processor = WP_HTML_Processor::create_fragment( '<p><b>One<p><source>Two<source>' );

		// The SOURCE element doesn't trigger reconstruction, and this test asserts that.
		$this->assertTrue(
			$processor->next_tag( 'SOURCE' ),
			'Should have found the first SOURCE element.'
		);

		// No B in the path: nothing before the first SOURCE has reconstructed it.
		$this->assertSame(
			array( 'HTML', 'BODY', 'P', 'SOURCE' ),
			$processor->get_breadcrumbs(),
			'Should have closed formatting element at first P element.'
		);

		/*
		 * There are two ways this test could fail. One is to appropriately find the
		 * second text node but fail to reconstruct the implicitly-closed B element.
		 * The other way is to fail to abort when encountering the second text node
		 * because that kind of active format reconstruction isn't supported.
		 *
		 * At the time of writing this test, the HTML Processor bails whenever it
		 * needs to reconstruct active formats, unless there are no active formats.
		 * To ensure that this test properly works once that support is expanded,
		 * it's written to verify both circumstances. Once support is added, this
		 * can be simplified to only contain the first clause of the conditional.
		 *
		 * The use of the SOURCE element is important here because most elements
		 * will also trigger reconstruction, which would conflate the test results
		 * with the text node triggering reconstruction. The SOURCE element won't
		 * do this, making it neutral. Therefore, the implicitly-closed B element
		 * will only be reconstructed by the text node.
		 */

		if ( $processor->next_tag( 'SOURCE' ) ) {
			// Reconstruction is supported: B must reappear between P and SOURCE.
			$this->assertSame(
				array( 'HTML', 'BODY', 'P', 'B', 'SOURCE' ),
				$processor->get_breadcrumbs(),
				'Should have reconstructed the implicitly-closed B element.'
			);
		} else {
			// Reconstruction is unsupported: the only acceptable failure is an explicit bail.
			$this->assertSame(
				WP_HTML_Processor::ERROR_UNSUPPORTED,
				$processor->get_last_error(),
				'Should have aborted for incomplete active format reconstruction when encountering the second text node.'
			);
		}
	}
}