From c621467041307f073457779aa21a6737196befcc Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 12 Sep 2024 17:53:03 +0200 Subject: [PATCH 01/11] Add spawn_fragment_parser method --- .../html-api/class-wp-html-processor.php | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index c50032829f63f..ed68efc52979c 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -422,6 +422,55 @@ function ( WP_HTML_Token $token ): void { }; } + /** + * Creates a fragment processor with the current node as its context element. + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm + * + * @param string $html Input HTML fragment to process. + * @return static|null The created processor if successful, otherwise null. + */ + private function spawn_fragment_parser( string $html ): ?self { + if ( $this->get_token_type() !== '#tag' ) { + return null; + } + + /* + * Prevent creating fragments at "self-contained" nodes. + * + * @see https://github.com/WordPress/wordpress-develop/pull/7141 + * @see https://github.com/WordPress/wordpress-develop/pull/7198 + */ + if ( + 'html' === $this->get_namespace() && + in_array( $this->get_tag(), array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) + ) { + return null; + } + + $fragment_processor = self::create_fragment( $html ); + $fragment_processor->compat_mode = $this->compat_mode; + + // @todo The context element probably needs a namespace{ + $context_element = array( $this->get_tag(), array() ); + foreach ( $this->get_attribute_names_with_prefix( '' ) as $name => $value ) { + $context_element[1][ $name ] = $value; + } + $fragment_processor->state->context_node = $context_element; + + if ( 'TEMPLATE' === $context_element[0] ) { + $fragment_processor->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE; + } + + $fragment_processor->reset_insertion_mode_appropriately(); + + // @todo Set the parser's form element pointer. + + $fragment_processor->state->encoding_confidence = 'irrelevant'; + + return $fragment_processor; + } + /** * Stops the parser and terminates its execution when encountering unsupported markup. * From a6a854a6996ab451d371b7d238ca8906088c6722 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 12 Sep 2024 18:13:05 +0200 Subject: [PATCH 02/11] Fix the processor context_node --- src/wp-includes/html-api/class-wp-html-processor.php | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index ed68efc52979c..96e975618a454 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -451,11 +451,18 @@ private function spawn_fragment_parser( string $html ): ?self { $fragment_processor = self::create_fragment( $html ); $fragment_processor->compat_mode = $this->compat_mode; - // @todo The context element probably needs a namespace{ $context_element = array( $this->get_tag(), array() ); foreach ( $this->get_attribute_names_with_prefix( '' ) as $name => $value ) { $context_element[1][ $name ] = $value; } + + $fragment_processor->context_node = new WP_HTML_Token( + 'context-node', + $context_element[0], + $this->has_self_closing_flag() + ); + $fragment_processor->context_node->namespace = $this->get_namespace(); + $fragment_processor->state->context_node = $context_element; if ( 'TEMPLATE' === $context_element[0] ) { From 5d765ee275f50126300921af942e70b57c61da13 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Sep 2024 12:36:57 +0200 Subject: [PATCH 03/11] Make it public --- src/wp-includes/html-api/class-wp-html-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 96e975618a454..b97670da3c8ad 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -430,7 +430,7 @@ function ( WP_HTML_Token $token ): void { * @param string $html Input HTML fragment to process. * @return static|null The created processor if successful, otherwise null. */ - private function spawn_fragment_parser( string $html ): ?self { + public function spawn_fragment_parser( string $html ): ?self { if ( $this->get_token_type() !== '#tag' ) { return null; } From da05e8c12a323bb324b6e471b4bec15ea45fa577 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Sep 2024 12:38:49 +0200 Subject: [PATCH 04/11] Fix spawn_fragment_parser method --- .../html-api/class-wp-html-processor.php | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index b97670da3c8ad..a409bc5a418cd 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -435,6 +435,8 @@ public function spawn_fragment_parser( string $html ): ?self { return null; } + $namespace = $this->get_namespace(); + /* * Prevent creating fragments at "self-contained" nodes. * @@ -442,7 +444,7 @@ public function spawn_fragment_parser( string $html ): ?self { * @see https://github.com/WordPress/wordpress-develop/pull/7198 */ if ( - 'html' === $this->get_namespace() && + 'html' === $namespace && in_array( $this->get_tag(), array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) { return null; @@ -451,19 +453,17 @@ public function spawn_fragment_parser( string $html ): ?self { $fragment_processor = self::create_fragment( $html ); $fragment_processor->compat_mode = $this->compat_mode; - $context_element = array( $this->get_tag(), array() ); + + $fragment_processor->context_node = clone $this->state->current_token; + $fragment_processor->context_node->bookmark_name = 'context-node'; + $fragment_processor->context_node->on_destroy = null; + + $context_element = array( $fragment_processor->context_node->node_name, array() ); foreach ( $this->get_attribute_names_with_prefix( '' ) as $name => $value ) { $context_element[1][ $name ] = $value; } - $fragment_processor->context_node = new WP_HTML_Token( - 'context-node', - $context_element[0], - $this->has_self_closing_flag() - ); - $fragment_processor->context_node->namespace = $this->get_namespace(); - - $fragment_processor->state->context_node = $context_element; + $fragment_processor->breadcrumbs = array(); if ( 'TEMPLATE' === $context_element[0] ) { $fragment_processor->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE; From 411f14f8f1254c4f7596b999d71337f45c0d31a5 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Sep 2024 12:39:28 +0200 Subject: [PATCH 05/11] Process non-body context tests --- tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index 54d60f8c78a66..f8218f48720d3 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -139,10 +139,6 @@ public function data_external_html5lib_tests() { * @return bool True if the test case should be skipped. False otherwise. */ private static function should_skip_test( ?string $test_context_element, string $test_name ): bool { - if ( null !== $test_context_element && 'body' !== $test_context_element ) { - return true; - } - if ( array_key_exists( $test_name, self::SKIP_TESTS ) ) { return true; } From 1701828e23b02b73bb6040bf950aaa4f5f99f171 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Sep 2024 12:39:59 +0200 Subject: [PATCH 06/11] Fix document-fragment test processing --- tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index f8218f48720d3..312da8451b7b8 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -445,7 +445,7 @@ public static function parse_html5_dat_testfile( $filename ) { * context element as context. */ case 'document-fragment': - $test_context_element = explode( ' ', $line )[0]; + $test_context_element = trim( $line ); break; /* From 2c4a86e6155ad4d42c28a12f923d616fd3ddaab7 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Sep 2024 12:42:30 +0200 Subject: [PATCH 07/11] Handle all the different document context in html5lib tests --- .../html-api/wpHtmlProcessorHtml5lib.php | 76 +++++++++++++++++-- 1 file changed, 71 insertions(+), 5 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index 312da8451b7b8..c916ab10b6faf 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -154,11 +154,77 @@ private static function should_skip_test( ?string $test_context_element, string * @return string|null Tree structure of parsed HTML, if supported, else null. */ private static function build_tree_representation( ?string $fragment_context, string $html ) { - $processor = $fragment_context - ? WP_HTML_Processor::create_fragment( $html, "<{$fragment_context}>" ) - : WP_HTML_Processor::create_full_parser( $html ); - if ( null === $processor ) { - throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() ); + $processor = null; + if ( $fragment_context ) { + if ( 'body' === $fragment_context ) { + $processor = WP_HTML_Processor::create_fragment( $html ); + } else { + + /* + * If the string of characters starts with "svg ", the context + * element is in the SVG namespace and the substring after + * "svg " is the local name. If the string of characters starts + * with "math ", the context element is in the MathML namespace + * and the substring after "math " is the local name. + * Otherwise, the context element is in the HTML namespace and + * the string is the local name. + */ + if ( str_starts_with( $fragment_context, 'svg ' ) ) { + $tag_name = substr( $fragment_context, 4 ); + if ( 'svg' === $tag_name ) { + $parent_processor = WP_HTML_Processor::create_full_parser( '' ); + } else { + $parent_processor = WP_HTML_Processor::create_full_parser( "<{$tag_name}>" ); + } + $parent_processor->next_tag( $tag_name ); + } elseif ( str_starts_with( $fragment_context, 'math ' ) ) { + $tag_name = substr( $fragment_context, 5 ); + if ( 'math' === $tag_name ) { + $parent_processor = WP_HTML_Processor::create_full_parser( '' ); + } else { + $parent_processor = WP_HTML_Processor::create_full_parser( "<{$tag_name}>" ); + } + $parent_processor->next_tag( $tag_name ); + } else { + if ( in_array( + $fragment_context, + array( + 'caption', + 'col', + 'colgroup', + 'tbody', + 'td', + 'tfoot', + 'th', + 'thead', + 'tr', + ), + true + ) ) { + $parent_processor = WP_HTML_Processor::create_full_parser( "<{$fragment_context}>" ); + $parent_processor->next_tag(); + } else { + $parent_processor = WP_HTML_Processor::create_full_parser( "<{$fragment_context}>" ); + } + $parent_processor->next_tag( $fragment_context ); + } + if ( null !== $parent_processor->get_unsupported_exception() ) { + throw $parent_processor->get_unsupported_exception(); + } + if ( null !== $parent_processor->get_last_error() ) { + throw new Exception( $parent_processor->get_last_error() ); + } + $processor = $parent_processor->spawn_fragment_parser( $html ); + } + + if ( null === $processor ) { + throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() ); + } + } else { + $processor = WP_HTML_Processor::create_full_parser( $html ); + if ( null === $processor ) { + throw new Exception( 'Could not create a full parser.' ); + } } /* From 886f919fec05db01216816b4d30ea242b7880b17 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Sep 2024 12:43:42 +0200 Subject: [PATCH 08/11] Remove outdated fragment preambles from tree output When fragments were used instead of full processors, some fake tree structure had to be prepended to the output. This is no longer necessary and breaks the fragment cases. --- tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index c916ab10b6faf..ad6080461f2b8 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -232,8 +232,8 @@ private static function build_tree_representation( ?string $fragment_context, st * and requires adjustment to initial parameters. * The full parser will not. */ - $output = $fragment_context ? "\n \n \n" : ''; - $indent_level = $fragment_context ? 2 : 0; + $output = ''; + $indent_level = 0; $indent = ' '; $was_text = null; $text_node = ''; From 430faf1ea1e6362ba1b2a369aaf1c6d190258b95 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Sep 2024 12:45:05 +0200 Subject: [PATCH 09/11] Move static indent string to a class constant --- .../tests/html-api/wpHtmlProcessorHtml5lib.php | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index ad6080461f2b8..f6b518b96c940 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -21,6 +21,8 @@ * @group html-api-html5lib-tests */ class Tests_HtmlApi_Html5lib extends WP_UnitTestCase { + const TREE_INDENT = ' '; + /** * Skip specific tests that may not be supported or have known issues. */ @@ -234,7 +236,6 @@ private static function build_tree_representation( ?string $fragment_context, st */ $output = ''; $indent_level = 0; - $indent = ' '; $was_text = null; $text_node = ''; @@ -287,7 +288,7 @@ private static function build_tree_representation( ?string $fragment_context, st ++$indent_level; } - $output .= str_repeat( $indent, $tag_indent ) . "<{$tag_name}>\n"; + $output .= str_repeat( self::TREE_INDENT, $tag_indent ) . "<{$tag_name}>\n"; $attribute_names = $processor->get_attribute_names_with_prefix( '' ); if ( $attribute_names ) { @@ -340,18 +341,18 @@ static function ( $a, $b ) { if ( true === $val ) { $val = ''; } - $output .= str_repeat( $indent, $tag_indent + 1 ) . "{$display_name}=\"{$val}\"\n"; + $output .= str_repeat( self::TREE_INDENT, $tag_indent + 1 ) . "{$display_name}=\"{$val}\"\n"; } } // Self-contained tags contain their inner contents as modifiable text. $modifiable_text = $processor->get_modifiable_text(); if ( '' !== $modifiable_text ) { - $output .= str_repeat( $indent, $tag_indent + 1 ) . "\"{$modifiable_text}\"\n"; + $output .= str_repeat( self::TREE_INDENT, $tag_indent + 1 ) . "\"{$modifiable_text}\"\n"; } if ( 'html' === $namespace && 'TEMPLATE' === $token_name ) { - $output .= str_repeat( $indent, $indent_level ) . "content\n"; + $output .= str_repeat( self::TREE_INDENT, $indent_level ) . "content\n"; ++$indent_level; } @@ -365,14 +366,14 @@ static function ( $a, $b ) { } $was_text = true; if ( '' === $text_node ) { - $text_node .= str_repeat( $indent, $indent_level ) . '"'; + $text_node .= str_repeat( self::TREE_INDENT, $indent_level ) . '"'; } $text_node .= $text_content; break; case '#funky-comment': // Comments must be "<" then "!-- " then the data then " -->". - $output .= str_repeat( $indent, $indent_level ) . "\n"; + $output .= str_repeat( self::TREE_INDENT, $indent_level ) . "\n"; break; case '#comment': @@ -395,7 +396,7 @@ static function ( $a, $b ) { throw new Error( "Unhandled comment type for tree construction: {$processor->get_comment_type()}" ); } // Comments must be "<" then "!-- " then the data then " -->". - $output .= str_repeat( $indent, $indent_level ) . "\n"; + $output .= str_repeat( self::TREE_INDENT, $indent_level ) . "\n"; break; default: From 9ffc53ab760cdc9fe0aa3760c5e96e6a0a95f905 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Sep 2024 12:45:27 +0200 Subject: [PATCH 10/11] PICKME: Ensure that nodes are correctly processed in HTML content This condition: > Reprocess the token according to the rules given in the section > corresponding to the current insertion mode in HTML content. Was resulting in an infinite loop in fragment cases. In full documents, after popping nodes the context is always moved so that foreign content parsing is not used. This is not guaranteed in a fragment and could cause an infinite loop. --- src/wp-includes/html-api/class-wp-html-processor.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index a409bc5a418cd..eec4cfa976685 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -4557,7 +4557,7 @@ private function step_in_foreign_content(): bool { $this->state->stack_of_open_elements->pop(); } - return $this->step( self::REPROCESS_CURRENT_NODE ); + goto in_foreign_content_process_in_current_insertion_mode; } /* @@ -4633,6 +4633,7 @@ private function step_in_foreign_content(): bool { goto in_foreign_content_end_tag_loop; } + in_foreign_content_process_in_current_insertion_mode: switch ( $this->state->insertion_mode ) { case WP_HTML_Processor_State::INSERTION_MODE_INITIAL: return $this->step_initial(); From 96a0ae77ea56fd3a80366307e555c270e2d31d77 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Sep 2024 13:45:28 +0200 Subject: [PATCH 11/11] lints --- src/wp-includes/html-api/class-wp-html-processor.php | 1 - 1 file changed, 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index eec4cfa976685..5bf69bab3ebbe 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -453,7 +453,6 @@ public function spawn_fragment_parser( string $html ): ?self { $fragment_processor = self::create_fragment( $html ); $fragment_processor->compat_mode = $this->compat_mode; - $fragment_processor->context_node = clone $this->state->current_token; $fragment_processor->context_node->bookmark_name = 'context-node'; $fragment_processor->context_node->on_destroy = null;