Skip to content
Closed
58 changes: 57 additions & 1 deletion src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,61 @@ function ( WP_HTML_Token $token ): void {
};
}

/**
 * Creates a fragment processor with the current node as its context element.
 *
 * No processor is created, and null is returned, when:
 *
 *  - the current token is not a tag, or is a tag closer;
 *  - the current token is an HTML element whose contents are "self-contained"
 *    (IFRAME, NOEMBED, NOFRAMES, SCRIPT, STYLE, TEXTAREA, TITLE, XMP);
 *  - the underlying fragment processor could not be created.
 *
 * @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm
 *
 * @param string $html Input HTML fragment to process.
 * @return static|null The created processor if successful, otherwise null.
 */
public function spawn_fragment_parser( string $html ): ?self {
	// Only an opening tag can serve as the context element of a fragment.
	if ( '#tag' !== $this->get_token_type() || $this->is_tag_closer() ) {
		return null;
	}

	// @todo Decide whether virtual tokens should be rejected as context nodes as well.

	/*
	 * Prevent creating fragments at "self-contained" nodes.
	 *
	 * @see https://github.com/WordPress/wordpress-develop/pull/7141
	 * @see https://github.com/WordPress/wordpress-develop/pull/7198
	 */
	if (
		'html' === $this->get_namespace() &&
		in_array( $this->get_tag(), array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true )
	) {
		return null;
	}

	// The factory is nullable; never dereference a failed creation.
	$fragment_processor = self::create_fragment( $html );
	if ( null === $fragment_processor ) {
		return null;
	}

	$fragment_processor->compat_mode = $this->compat_mode;

	/*
	 * The context node is a detached copy of the current token: it gets its own
	 * bookmark name and drops the destroy callback carried by the original.
	 */
	$context_node                = clone $this->state->current_token;
	$context_node->bookmark_name = 'context-node';
	$context_node->on_destroy    = null;

	$fragment_processor->context_node = $context_node;
	$fragment_processor->breadcrumbs  = array();

	if ( 'TEMPLATE' === $context_node->node_name ) {
		$fragment_processor->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE;
	}

	$fragment_processor->reset_insertion_mode_appropriately();

	// @todo Set the parser's form element pointer.

	$fragment_processor->state->encoding_confidence = 'irrelevant';

	return $fragment_processor;
}

/**
* Stops the parser and terminates its execution when encountering unsupported markup.
*
Expand Down Expand Up @@ -4501,7 +4556,7 @@ private function step_in_foreign_content(): bool {

$this->state->stack_of_open_elements->pop();
}
return $this->step( self::REPROCESS_CURRENT_NODE );
goto in_foreign_content_process_in_current_insertion_mode;
}

/*
Expand Down Expand Up @@ -4577,6 +4632,7 @@ private function step_in_foreign_content(): bool {
goto in_foreign_content_end_tag_loop;
}

in_foreign_content_process_in_current_insertion_mode:
switch ( $this->state->insertion_mode ) {
case WP_HTML_Processor_State::INSERTION_MODE_INITIAL:
return $this->step_initial();
Expand Down
103 changes: 83 additions & 20 deletions tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
* @group html-api-html5lib-tests
*/
class Tests_HtmlApi_Html5lib extends WP_UnitTestCase {
const TREE_INDENT = ' ';

/**
* Skip specific tests that may not be supported or have known issues.
*/
Expand Down Expand Up @@ -139,10 +141,6 @@ public function data_external_html5lib_tests() {
* @return bool True if the test case should be skipped. False otherwise.
*/
private static function should_skip_test( ?string $test_context_element, string $test_name ): bool {
if ( null !== $test_context_element && 'body' !== $test_context_element ) {
return true;
}

if ( array_key_exists( $test_name, self::SKIP_TESTS ) ) {
return true;
}
Expand All @@ -158,21 +156,86 @@ private static function should_skip_test( ?string $test_context_element, string
* @return string|null Tree structure of parsed HTML, if supported, else null.
*/
private static function build_tree_representation( ?string $fragment_context, string $html ) {
$processor = $fragment_context
? WP_HTML_Processor::create_fragment( $html, "<{$fragment_context}>" )
: WP_HTML_Processor::create_full_parser( $html );
if ( null === $processor ) {
throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() );
$processor = null;
if ( $fragment_context ) {
if ( 'body' === $fragment_context ) {
$processor = WP_HTML_Processor::create_fragment( $html );
} else {

/*
* If the string of characters starts with "svg ", the context
* element is in the SVG namespace and the substring after
* "svg " is the local name. If the string of characters starts
* with "math ", the context element is in the MathML namespace
* and the substring after "math " is the local name.
* Otherwise, the context element is in the HTML namespace and
* the string is the local name.
*/
if ( str_starts_with( $fragment_context, 'svg ' ) ) {
$tag_name = substr( $fragment_context, 4 );
if ( 'svg' === $tag_name ) {
$parent_processor = WP_HTML_Processor::create_full_parser( '<!DOCTYPE html><svg>' );
} else {
$parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><svg><{$tag_name}>" );
}
$parent_processor->next_tag( $tag_name );
} elseif ( str_starts_with( $fragment_context, 'math ' ) ) {
$tag_name = substr( $fragment_context, 5 );
if ( 'math' === $tag_name ) {
$parent_processor = WP_HTML_Processor::create_full_parser( '<!DOCTYPE html><math>' );
} else {
$parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><math><{$tag_name}>" );
}
$parent_processor->next_tag( $tag_name );
} else {
if ( in_array(
$fragment_context,
array(
'caption',
'col',
'colgroup',
'tbody',
'td',
'tfoot',
'th',
'thead',
'tr',
),
true
) ) {
$parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><table><{$fragment_context}>" );
$parent_processor->next_tag();
} else {
$parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><{$fragment_context}>" );
}
$parent_processor->next_tag( $fragment_context );
}
if ( null !== $parent_processor->get_unsupported_exception() ) {
throw $parent_processor->get_unsupported_exception();
}
if ( null !== $parent_processor->get_last_error() ) {
throw new Exception( $parent_processor->get_last_error() );
}
$processor = $parent_processor->spawn_fragment_parser( $html );
}

if ( null === $processor ) {
throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() );
}
} else {
$processor = WP_HTML_Processor::create_full_parser( $html );
if ( null === $processor ) {
throw new Exception( 'Could not create a full parser.' );
}
}

/*
* The fragment parser will start in 2 levels deep at: html > body > [position]
* and requires adjustment to initial parameters.
* The full parser will not.
*/
$output = $fragment_context ? "<html>\n <head>\n <body>\n" : '';
$indent_level = $fragment_context ? 2 : 0;
$indent = ' ';
$output = '';
$indent_level = 0;
$was_text = null;
$text_node = '';

Expand Down Expand Up @@ -225,7 +288,7 @@ private static function build_tree_representation( ?string $fragment_context, st
++$indent_level;
}

$output .= str_repeat( $indent, $tag_indent ) . "<{$tag_name}>\n";
$output .= str_repeat( self::TREE_INDENT, $tag_indent ) . "<{$tag_name}>\n";

$attribute_names = $processor->get_attribute_names_with_prefix( '' );
if ( $attribute_names ) {
Expand Down Expand Up @@ -278,18 +341,18 @@ static function ( $a, $b ) {
if ( true === $val ) {
$val = '';
}
$output .= str_repeat( $indent, $tag_indent + 1 ) . "{$display_name}=\"{$val}\"\n";
$output .= str_repeat( self::TREE_INDENT, $tag_indent + 1 ) . "{$display_name}=\"{$val}\"\n";
}
}

// Self-contained tags contain their inner contents as modifiable text.
$modifiable_text = $processor->get_modifiable_text();
if ( '' !== $modifiable_text ) {
$output .= str_repeat( $indent, $tag_indent + 1 ) . "\"{$modifiable_text}\"\n";
$output .= str_repeat( self::TREE_INDENT, $tag_indent + 1 ) . "\"{$modifiable_text}\"\n";
}

if ( 'html' === $namespace && 'TEMPLATE' === $token_name ) {
$output .= str_repeat( $indent, $indent_level ) . "content\n";
$output .= str_repeat( self::TREE_INDENT, $indent_level ) . "content\n";
++$indent_level;
}

Expand All @@ -303,14 +366,14 @@ static function ( $a, $b ) {
}
$was_text = true;
if ( '' === $text_node ) {
$text_node .= str_repeat( $indent, $indent_level ) . '"';
$text_node .= str_repeat( self::TREE_INDENT, $indent_level ) . '"';
}
$text_node .= $text_content;
break;

case '#funky-comment':
// Comments must be "<" then "!-- " then the data then " -->".
$output .= str_repeat( $indent, $indent_level ) . "<!-- {$processor->get_modifiable_text()} -->\n";
$output .= str_repeat( self::TREE_INDENT, $indent_level ) . "<!-- {$processor->get_modifiable_text()} -->\n";
break;

case '#comment':
Expand All @@ -333,7 +396,7 @@ static function ( $a, $b ) {
throw new Error( "Unhandled comment type for tree construction: {$processor->get_comment_type()}" );
}
// Comments must be "<" then "!-- " then the data then " -->".
$output .= str_repeat( $indent, $indent_level ) . "<!-- {$comment_text_content} -->\n";
$output .= str_repeat( self::TREE_INDENT, $indent_level ) . "<!-- {$comment_text_content} -->\n";
break;

default:
Expand Down Expand Up @@ -449,7 +512,7 @@ public static function parse_html5_dat_testfile( $filename ) {
* context element as context.
*/
case 'document-fragment':
$test_context_element = explode( ' ', $line )[0];
$test_context_element = trim( $line );
break;

/*
Expand Down