From d8ac3610365caf54b86ccb75cf9b3390c0e49c2f Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 12 Sep 2024 17:53:03 +0200 Subject: [PATCH 01/25] Add spawn_fragment_parser method --- .../html-api/class-wp-html-processor.php | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index ed6ac0299b3c3..ad20c36e6205b 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -424,6 +424,55 @@ function ( WP_HTML_Token $token ): void { }; } + /** + * Creates a fragment processor with the current node as its context element. + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm + * + * @param string $html Input HTML fragment to process. + * @return static|null The created processor if successful, otherwise null. + */ + private function spawn_fragment_parser( string $html ): ?self { + if ( $this->get_token_type() !== '#tag' ) { + return null; + } + + /* + * Prevent creating fragments at "self-contained" nodes. + * + * @see https://github.com/WordPress/wordpress-develop/pull/7141 + * @see https://github.com/WordPress/wordpress-develop/pull/7198 + */ + if ( + 'html' === $this->get_namespace() && + in_array( $this->get_tag(), array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) + ) { + return null; + } + + $fragment_processor = self::create_fragment( $html ); + $fragment_processor->compat_mode = $this->compat_mode; + + // @todo The context element probably needs a namespace{ + $context_element = array( $this->get_tag(), array() ); + foreach ( $this->get_attribute_names_with_prefix( '' ) as $name => $value ) { + $context_element[1][ $name ] = $value; + } + $fragment_processor->state->context_node = $context_element; + + if ( 'TEMPLATE' === $context_element[0] ) { + $fragment_processor->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE; + } + + $fragment_processor->reset_insertion_mode_appropriately(); + + // @todo Set the parser's form element pointer. + + $fragment_processor->state->encoding_confidence = 'irrelevant'; + + return $fragment_processor; + } + /** * Stops the parser and terminates its execution when encountering unsupported markup. * From ad8f8db5589d3d88061dd714e8cf17a994fc9d55 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 12 Sep 2024 18:13:05 +0200 Subject: [PATCH 02/25] Fix the processor context_node --- src/wp-includes/html-api/class-wp-html-processor.php | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index ad20c36e6205b..07a7b31450cd4 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -453,11 +453,18 @@ private function spawn_fragment_parser( string $html ): ?self { $fragment_processor = self::create_fragment( $html ); $fragment_processor->compat_mode = $this->compat_mode; - // @todo The context element probably needs a namespace{ $context_element = array( $this->get_tag(), array() ); foreach ( $this->get_attribute_names_with_prefix( '' ) as $name => $value ) { $context_element[1][ $name ] = $value; } + + $fragment_processor->context_node = new WP_HTML_Token( + 'context-node', + $context_element[0], + $this->has_self_closing_flag() + ); + $fragment_processor->context_node->namespace = $this->get_namespace(); + $fragment_processor->state->context_node = $context_element; if ( 'TEMPLATE' === $context_element[0] ) { From e2efee4d844eef86cd8146427f3b55466f6e3abb Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Sep 2024 12:36:57 +0200 Subject: [PATCH 03/25] Make it public --- src/wp-includes/html-api/class-wp-html-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 07a7b31450cd4..073e23285d40b 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -432,7 +432,7 @@ function ( WP_HTML_Token $token ): void { * @param string $html Input HTML fragment to process. * @return static|null The created processor if successful, otherwise null. */ - private function spawn_fragment_parser( string $html ): ?self { + public function spawn_fragment_parser( string $html ): ?self { if ( $this->get_token_type() !== '#tag' ) { return null; } From 4f5249c82ad100c38ccca30ccc8ebe9999fd91d6 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Sep 2024 12:38:49 +0200 Subject: [PATCH 04/25] Fix spawn_fragment_parser method --- .../html-api/class-wp-html-processor.php | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 073e23285d40b..24a02a26819a7 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -437,6 +437,8 @@ public function spawn_fragment_parser( string $html ): ?self { return null; } + $namespace = $this->get_namespace(); + /* * Prevent creating fragments at "self-contained" nodes. * @@ -444,7 +446,7 @@ public function spawn_fragment_parser( string $html ): ?self { * @see https://github.com/WordPress/wordpress-develop/pull/7198 */ if ( - 'html' === $this->get_namespace() && + 'html' === $namespace && in_array( $this->get_tag(), array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) { return null; @@ -453,19 +455,17 @@ public function spawn_fragment_parser( string $html ): ?self { $fragment_processor = self::create_fragment( $html ); $fragment_processor->compat_mode = $this->compat_mode; - $context_element = array( $this->get_tag(), array() ); + + $fragment_processor->context_node = clone $this->state->current_token; + $fragment_processor->context_node->bookmark_name = 'context-node'; + $fragment_processor->context_node->on_destroy = null; + + $context_element = array( $fragment_processor->context_node->node_name, array() ); foreach ( $this->get_attribute_names_with_prefix( '' ) as $name => $value ) { $context_element[1][ $name ] = $value; } - $fragment_processor->context_node = new WP_HTML_Token( - 'context-node', - $context_element[0], - $this->has_self_closing_flag() - ); - $fragment_processor->context_node->namespace = $this->get_namespace(); - - $fragment_processor->state->context_node = $context_element; + $fragment_processor->breadcrumbs = array(); if ( 'TEMPLATE' === $context_element[0] ) { $fragment_processor->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE; From eaed8634dee6e084d63d597969a72d077c5c382f Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Sep 2024 12:39:28 +0200 Subject: [PATCH 05/25] Process non-body context tests --- tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index 808fa39d17f26..7f607ad63ebfc 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -138,10 +138,6 @@ public function data_external_html5lib_tests() { * @return bool True if the test case should be skipped. False otherwise. */ private static function should_skip_test( ?string $test_context_element, string $test_name ): bool { - if ( null !== $test_context_element && 'body' !== $test_context_element ) { - return true; - } - if ( array_key_exists( $test_name, self::SKIP_TESTS ) ) { return true; } From 25b18fa88d860b83ff7b126a12f37b205bfc13c3 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Sep 2024 12:42:30 +0200 Subject: [PATCH 06/25] Handle all the different document context in html5lib tests --- .../html-api/wpHtmlProcessorHtml5lib.php | 76 +++++++++++++++++-- 1 file changed, 71 insertions(+), 5 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index 7f607ad63ebfc..041132ed50c20 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -153,11 +153,77 @@ private static function should_skip_test( ?string $test_context_element, string * @return string|null Tree structure of parsed HTML, if supported, else null. */ private static function build_tree_representation( ?string $fragment_context, string $html ) { - $processor = $fragment_context - ? WP_HTML_Processor::create_fragment( $html, "<{$fragment_context}>" ) - : WP_HTML_Processor::create_full_parser( $html ); - if ( null === $processor ) { - throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() ); + $processor = null; + if ( $fragment_context ) { + if ( 'body' === $fragment_context ) { + $processor = WP_HTML_Processor::create_fragment( $html ); + } else { + + /* + * If the string of characters starts with "svg ", the context + * element is in the SVG namespace and the substring after + * "svg " is the local name. If the string of characters starts + * with "math ", the context element is in the MathML namespace + * and the substring after "math " is the local name. + * Otherwise, the context element is in the HTML namespace and + * the string is the local name. + */ + if ( str_starts_with( $fragment_context, 'svg ' ) ) { + $tag_name = substr( $fragment_context, 4 ); + if ( 'svg' === $tag_name ) { + $parent_processor = WP_HTML_Processor::create_full_parser( '' ); + } else { + $parent_processor = WP_HTML_Processor::create_full_parser( "<{$tag_name}>" ); + } + $parent_processor->next_tag( $tag_name ); + } elseif ( str_starts_with( $fragment_context, 'math ' ) ) { + $tag_name = substr( $fragment_context, 5 ); + if ( 'math' === $tag_name ) { + $parent_processor = WP_HTML_Processor::create_full_parser( '' ); + } else { + $parent_processor = WP_HTML_Processor::create_full_parser( "<{$tag_name}>" ); + } + $parent_processor->next_tag( $tag_name ); + } else { + if ( in_array( + $fragment_context, + array( + 'caption', + 'col', + 'colgroup', + 'tbody', + 'td', + 'tfoot', + 'th', + 'thead', + 'tr', + ), + true + ) ) { + $parent_processor = WP_HTML_Processor::create_full_parser( "<{$fragment_context}>" ); + $parent_processor->next_tag(); + } else { + $parent_processor = WP_HTML_Processor::create_full_parser( "<{$fragment_context}>" ); + } + $parent_processor->next_tag( $fragment_context ); + } + if ( null !== $parent_processor->get_unsupported_exception() ) { + throw $parent_processor->get_unsupported_exception(); + } + if ( null !== $parent_processor->get_last_error() ) { + throw new Exception( $parent_processor->get_last_error() ); + } + $processor = $parent_processor->spawn_fragment_parser( $html ); + } + + if ( null === $processor ) { + throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() ); + } + } else { + $processor = WP_HTML_Processor::create_full_parser( $html ); + if ( null === $processor ) { + throw new Exception( 'Could not create a full parser.' ); + } } /* From 9ac142f67a2a91a43a2daa669d467b5acf8efcfc Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Sep 2024 13:45:28 +0200 Subject: [PATCH 07/25] lints --- src/wp-includes/html-api/class-wp-html-processor.php | 1 - 1 file changed, 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 24a02a26819a7..bce9949d286b0 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -455,7 +455,6 @@ public function spawn_fragment_parser( string $html ): ?self { $fragment_processor = self::create_fragment( $html ); $fragment_processor->compat_mode = $this->compat_mode; - $fragment_processor->context_node = clone $this->state->current_token; $fragment_processor->context_node->bookmark_name = 'context-node'; $fragment_processor->context_node->on_destroy = null; From 3f35886e4abe09c3a51e63fc8c88a680418f05b6 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 6 Nov 2024 15:49:25 +0100 Subject: [PATCH 08/25] Make spawned fragment parse have HTML > [context-node-tag] in breadcrumbs --- src/wp-includes/html-api/class-wp-html-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index b34ea958833f1..8daa92eabacb4 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -464,7 +464,7 @@ public function spawn_fragment_parser( string $html ): ?self { $context_element[1][ $name ] = $value; } - $fragment_processor->breadcrumbs = array(); + $fragment_processor->breadcrumbs = array( 'HTML', $fragment_processor->context_node->node_name ); if ( 'TEMPLATE' === $context_element[0] ) { $fragment_processor->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE; From ba9e218a32a0e401e1b5e2473f5f090be86d4e73 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 6 Nov 2024 15:51:22 +0100 Subject: [PATCH 09/25] Fallback to context node when checking namespace --- src/wp-includes/html-api/class-wp-html-processor.php | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 8daa92eabacb4..ba02ae5296dc2 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -4968,16 +4968,20 @@ private function bookmark_token() { */ /** - * Indicates the namespace of the current token, or "html" if there is none. + * Indicates the namespace of the current token, the context node, or "html". * * @return string One of "html", "math", or "svg". */ public function get_namespace(): string { - if ( ! isset( $this->current_element ) ) { - return parent::get_namespace(); + if ( isset( $this->current_element ) ) { + return $this->current_element->token->namespace; + } + + if ( isset( $this->context_node ) ) { + return $this->context_node->namespace; } - return $this->current_element->token->namespace; + return parent::get_namespace(); } /** From fe48fa517b2b90cd10f435d5400e43613aa9ce26 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 6 Nov 2024 20:15:20 +0100 Subject: [PATCH 10/25] Add tests --- .../tests/html-api/wpHtmlProcessor.php | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index 7e568286ccdf9..db6af678ea9a9 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -1058,4 +1058,59 @@ public function test_ensure_next_token_method_extensibility( $html, $expected_to $this->assertEquals( $expected_token_counts, $processor->token_seen_count, 'Snapshot: ' . var_export( $processor->token_seen_count, true ) ); $this->assertEquals( $expected_xpaths, $actual_xpaths, 'Snapshot: ' . var_export( $actual_xpaths, true ) ); } + + /** + * @ticket TBD + */ + public function test_spawn_fragment_parser_in_foreign_content() { + $processor = WP_HTML_Processor::create_full_parser( '' ); + $this->assertTrue( $processor->next_tag( 'SVG' ) ); + + $fragment = $processor->spawn_fragment_parser( "\0preceded-by-nul-byte
" ); + + $this->assertSame( 'svg', $fragment->get_namespace() ); + $this->assertTrue( $fragment->next_token() ); + + /* + * In HTML parsing, a nul byte would be ignored. + * In SVG it should be replaced with a replacement character. + */ + $this->assertSame( '#text', $fragment->get_token_type() ); + $this->assertSame( "\u{FFFD}", $fragment->get_modifiable_text() ); + + $this->assertTrue( $fragment->next_tag( 'RECT' ) ); + $this->assertSame( 'svg', $fragment->get_namespace() ); + + $this->assertTrue( $fragment->next_tag( 'CIRCLE' ) ); + $this->assertSame( array( 'HTML', 'SVG', 'CIRCLE' ), $fragment->get_breadcrumbs() ); + $this->assertTrue( $fragment->next_tag( 'foreignObject' ) ); + $this->assertSame( 'svg', $fragment->get_namespace() ); + } + + /** + * @ticket TBD + */ + public function test_spawn_fragment_parser_in_foreign_content_integration_point() { + $processor = WP_HTML_Processor::create_full_parser( '' ); + $this->assertTrue( $processor->next_tag( 'foreignObject' ) ); + + $fragment = $processor->spawn_fragment_parser( "\0not-preceded-by-nul-byte" ); + + $this->assertSame( 'svg', $fragment->get_namespace() ); + $this->assertTrue( $fragment->next_token() ); + + // In HTML parsing, the nul byte is ignored and the text is reached. + $this->assertSame( '#text', $fragment->get_token_type() ); + $this->assertSame( 'not-preceded-by-nul-byte', $fragment->get_modifiable_text() ); + + /* + * svg:foreignObject is an HTML integration point, so the processor should be in the HTML namespace. + * RECT is an HTML element here, meaning it may have the self-closing flag but does not self-close. + */ + $this->assertTrue( $fragment->next_tag( 'RECT' ) ); + $this->assertSame( array( 'HTML', 'FOREIGNOBJECT', 'RECT' ), $fragment->get_breadcrumbs() ); + $this->assertSame( 'html', $fragment->get_namespace() ); + $this->assertTrue( $fragment->has_self_closing_flag() ); + $this->assertTrue( $fragment->expects_closer() ); + } } From fa4c5cb59aeeef9fb0736c745f8e0ff18f0ad0bf Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 6 Nov 2024 20:15:36 +0100 Subject: [PATCH 11/25] Set the form element pointer on the fragment parser --- .../html-api/class-wp-html-processor.php | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index ba02ae5296dc2..df0dd13ffe76d 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -472,7 +472,18 @@ public function spawn_fragment_parser( string $html ): ?self { $fragment_processor->reset_insertion_mode_appropriately(); - // @todo Set the parser's form element pointer. + /* + * > Set the parser's form element pointer to the nearest node to the context element that + * > is a form element (going straight up the ancestor chain, and including the element + * > itself, if it is a form element), if any. (If there is no such form element, the + * > form element pointer keeps its initial value, null.) + */ + foreach ( $this->state->stack_of_open_elements->walk_up() as $element ) { + if ( 'FORM' === $element->node_name && 'html' === $element->namespace ) { + $fragment_processor->state->form_element = $element; + break; + } + } $fragment_processor->state->encoding_confidence = 'irrelevant'; From 943bbdde4a158ef360e040a4e2956dfdf349082a Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 6 Nov 2024 20:43:30 +0100 Subject: [PATCH 12/25] Revert "Fallback to context node when checking namespace" This reverts commit ba9e218a32a0e401e1b5e2473f5f090be86d4e73. --- src/wp-includes/html-api/class-wp-html-processor.php | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 0574783a54d77..cd2802e959e2e 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -4984,20 +4984,16 @@ private function bookmark_token() { */ /** - * Indicates the namespace of the current token, the context node, or "html". + * Indicates the namespace of the current token, or "html" if there is none. * * @return string One of "html", "math", or "svg". */ public function get_namespace(): string { - if ( isset( $this->current_element ) ) { - return $this->current_element->token->namespace; - } - - if ( isset( $this->context_node ) ) { - return $this->context_node->namespace; + if ( ! isset( $this->current_element ) ) { + return parent::get_namespace(); } - return parent::get_namespace(); + return $this->current_element->token->namespace; } /** From e3a0a8685800729e5cc1e49b3f5f1e7d5306c89f Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 6 Nov 2024 21:01:21 +0100 Subject: [PATCH 13/25] Fix initial namespace on integration nodes --- src/wp-includes/html-api/class-wp-html-processor.php | 9 +++++++-- tests/phpunit/tests/html-api/wpHtmlProcessor.php | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index cd2802e959e2e..ed926c45cfa76 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -437,7 +437,7 @@ public function spawn_fragment_parser( string $html ): ?self { return null; } - $namespace = $this->get_namespace(); + $namespace = $this->current_element->token->namespace; /* * Prevent creating fragments at "self-contained" nodes. @@ -452,7 +452,12 @@ public function spawn_fragment_parser( string $html ): ?self { return null; } - $fragment_processor = self::create_fragment( $html ); + $fragment_processor = self::create_fragment( $html ); + + $fragment_processor->change_parsing_namespace( + $this->current_element->token->integration_node_type ? 'html' : $namespace + ); + $fragment_processor->compat_mode = $this->compat_mode; $fragment_processor->context_node = clone $this->state->current_token; diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index 9a89d3f528958..851a10bdf3b39 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -1053,9 +1053,14 @@ public function test_spawn_fragment_parser_in_foreign_content_integration_point( $processor = WP_HTML_Processor::create_full_parser( '' ); $this->assertTrue( $processor->next_tag( 'foreignObject' ) ); - $fragment = $processor->spawn_fragment_parser( "\0not-preceded-by-nul-byte" ); + $fragment = $processor->spawn_fragment_parser( "\0not-preceded-by-nul-byte" ); + + // Nothing has been processed, the html namespace should be used for parsing as an integration point. + $this->assertSame( 'html', $fragment->get_namespace() ); + + // HTML parsing transforms IMAGE into IMG. + $this->assertTrue( $fragment->next_tag( 'IMG' ) ); - $this->assertSame( 'svg', $fragment->get_namespace() ); $this->assertTrue( $fragment->next_token() ); // In HTML parsing, the nul byte is ignored and the text is reached. From 27a978146df84499c9a9cd31aa57620c751df01f Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 7 Nov 2024 17:28:40 +0100 Subject: [PATCH 14/25] Rename method, use static constructor, add comments --- .../html-api/class-wp-html-processor.php | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index ed926c45cfa76..9efcdd90aa15c 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -425,14 +425,23 @@ function ( WP_HTML_Token $token ): void { } /** - * Creates a fragment processor with the current node as its context element. + * Creates a fragment processor at the current node. + * + * HTML Fragment parsing always happens with a context node. HTML Fragment Processors can be + * instantiated with a `BODY` context node via `WP_HTML_Processor::create_fragment()`. + * + * The context node may impact how a fragment of HTML is parsed. For example, when parsing + * `AB`: + * + * With a BODY context node results in the following tree: + * * * @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm * * @param string $html Input HTML fragment to process. * @return static|null The created processor if successful, otherwise null. */ - public function spawn_fragment_parser( string $html ): ?self { + public function create_fragment_at_current_node( string $html ) { if ( $this->get_token_type() !== '#tag' ) { return null; } @@ -452,7 +461,7 @@ public function spawn_fragment_parser( string $html ): ?self { return null; } - $fragment_processor = self::create_fragment( $html ); + $fragment_processor = static::create_fragment( $html ); $fragment_processor->change_parsing_namespace( $this->current_element->token->integration_node_type ? 'html' : $namespace From 07895389209e08913c6261f63f7bbe8422812754 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 8 Nov 2024 19:56:42 +0100 Subject: [PATCH 15/25] Update method name in tests --- tests/phpunit/tests/html-api/wpHtmlProcessor.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index 851a10bdf3b39..864093b66c0a8 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -1021,11 +1021,11 @@ public function test_ensure_next_token_method_extensibility( $html, $expected_to /** * @ticket TBD */ - public function test_spawn_fragment_parser_in_foreign_content() { + public function test_create_fragment_at_current_node_in_foreign_content() { $processor = WP_HTML_Processor::create_full_parser( '' ); $this->assertTrue( $processor->next_tag( 'SVG' ) ); - $fragment = $processor->spawn_fragment_parser( "\0preceded-by-nul-byte
" ); + $fragment = $processor->create_fragment_at_current_node( "\0preceded-by-nul-byte
" ); $this->assertSame( 'svg', $fragment->get_namespace() ); $this->assertTrue( $fragment->next_token() ); @@ -1049,11 +1049,11 @@ public function test_spawn_fragment_parser_in_foreign_content() { /** * @ticket TBD */ - public function test_spawn_fragment_parser_in_foreign_content_integration_point() { + public function test_create_fragment_at_current_node_in_foreign_content_integration_point() { $processor = WP_HTML_Processor::create_full_parser( '' ); $this->assertTrue( $processor->next_tag( 'foreignObject' ) ); - $fragment = $processor->spawn_fragment_parser( "\0not-preceded-by-nul-byte" ); + $fragment = $processor->create_fragment_at_current_node( "\0not-preceded-by-nul-byte" ); // Nothing has been processed, the html namespace should be used for parsing as an integration point. $this->assertSame( 'html', $fragment->get_namespace() ); From 5e8b82ed6025f325a5188d63788e30e30927d177 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 8 Nov 2024 19:58:54 +0100 Subject: [PATCH 16/25] Add ticket to tests --- tests/phpunit/tests/html-api/wpHtmlProcessor.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index 864093b66c0a8..68dbdf2817d33 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -1019,7 +1019,7 @@ public function test_ensure_next_token_method_extensibility( $html, $expected_to } /** - * @ticket TBD + * @ticket 62357 */ public function test_create_fragment_at_current_node_in_foreign_content() { $processor = WP_HTML_Processor::create_full_parser( '' ); @@ -1047,7 +1047,7 @@ public function test_create_fragment_at_current_node_in_foreign_content() { } /** - * @ticket TBD + * @ticket 62357 */ public function test_create_fragment_at_current_node_in_foreign_content_integration_point() { $processor = WP_HTML_Processor::create_full_parser( '' ); From 37f9ff4d943a769337668ae4bc1394bf11fe2846 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 12 Nov 2024 11:59:55 +0100 Subject: [PATCH 17/25] Update method name in html5lib tests --- tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index 041132ed50c20..aaef30dd09b5b 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -213,7 +213,7 @@ private static function build_tree_representation( ?string $fragment_context, st if ( null !== $parent_processor->get_last_error() ) { throw new Exception( $parent_processor->get_last_error() ); } - $processor = $parent_processor->spawn_fragment_parser( $html ); + $processor = $parent_processor->create_fragment_at_current_node( $html ); } if ( null === $processor ) { From 80ae6f267ec6797db16855e82d445b93749c297f Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 12 Nov 2024 12:36:09 +0100 Subject: [PATCH 18/25] Handle null return from create_fragment null should not be returned in this case, but it is part of the signature and should be covered here. --- src/wp-includes/html-api/class-wp-html-processor.php | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 090e4491bf0f7..87c413250daa7 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -462,6 +462,9 @@ public function create_fragment_at_current_node( string $html ) { } $fragment_processor = static::create_fragment( $html ); + if ( null === $fragment_processor ) { + return null; + } $fragment_processor->change_parsing_namespace( $this->current_element->token->integration_node_type ? 'html' : $namespace From 98664028b150e29a957efe7439b07c61fae88143 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 12 Nov 2024 12:43:04 +0100 Subject: [PATCH 19/25] Use a cloned copy of the FORM element from the parent processor --- src/wp-includes/html-api/class-wp-html-processor.php | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 87c413250daa7..4738c0cc90932 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -497,7 +497,9 @@ public function create_fragment_at_current_node( string $html ) { */ foreach ( $this->state->stack_of_open_elements->walk_up() as $element ) { if ( 'FORM' === $element->node_name && 'html' === $element->namespace ) { - $fragment_processor->state->form_element = $element; + $fragment_processor->state->form_element = clone $element; + $fragment_processor->state->form_element->bookmark_name = null; + $fragment_processor->state->form_element->on_destroy = null; break; } } From bcebeba51ee3be953b9c6e05275996d3432730b0 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 12 Nov 2024 15:50:54 +0100 Subject: [PATCH 20/25] Remove stale comment --- tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index aaef30dd09b5b..7abe63a859954 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -226,11 +226,6 @@ private static function build_tree_representation( ?string $fragment_context, st } } - /* - * The fragment parser will start in 2 levels deep at: html > body > [position] - * and requires adjustment to initial parameters. - * The full parser will not. - */ $output = ''; $indent_level = 0; $was_text = null; From 9e11f195d3173adb2a119bdd6f81fef813426486 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 12 Nov 2024 18:18:05 +0100 Subject: [PATCH 21/25] Improve method documentation with examples --- .../html-api/class-wp-html-processor.php | 28 +++++++++++++++++-- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 4738c0cc90932..0e492ff03b8e7 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -428,13 +428,35 @@ function ( WP_HTML_Token $token ): void { * Creates a fragment processor at the current node. * * HTML Fragment parsing always happens with a context node. HTML Fragment Processors can be - * instantiated with a `BODY` context node via `WP_HTML_Processor::create_fragment()`. + * instantiated with a `BODY` context node via `WP_HTML_Processor::create_fragment( $html )`. * - * The context node may impact how a fragment of HTML is parsed. For example, when parsing - * `AB`: + * The context node may impact how a fragment of HTML is parsed. For example, consider the HTML + * fragment `
`. * * With a BODY context node results in the following tree: * + * └─#text Inside TD? + * + * Notice that the ``. * - * With a BODY context node results in the following tree: + * A BODY context node will produce the following tree: * * └─#text Inside TD? * @@ -460,8 +460,8 @@ function ( WP_HTML_Token $token ): void { * * @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm * - * @param string $html Input HTML fragment to process. - * @return static|null The created processor if successful, otherwise null. + * @param string $html Input HTML fragment to process. + * @return static|null The created processor if successful, otherwise null. */ public function create_fragment_at_current_node( string $html ) { if ( $this->get_token_type() !== '#tag' ) { From 94408901e85d185a62eb10affde89d92e142a17d Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 20 Nov 2024 17:58:46 +0100 Subject: [PATCH 23/25] Improve comment, add PLAINTEXT --- src/wp-includes/html-api/class-wp-html-processor.php | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 6319a22cf3215..2558128b27fac 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -471,14 +471,12 @@ public function create_fragment_at_current_node( string $html ) { $namespace = $this->current_element->token->namespace; /* - * Prevent creating fragments at "self-contained" nodes. - * - * @see https://github.com/WordPress/wordpress-develop/pull/7141 - * @see https://github.com/WordPress/wordpress-develop/pull/7198 + * Prevent creating fragments at nodes that require a special tokenizer state. + * This is unsupported by the HTML Processor. */ if ( 'html' === $namespace && - in_array( $this->get_tag(), array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) + in_array( $this->get_tag(), array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true ) ) { return null; } From 25ae695b062a8611a4410605979a862c12becb35 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 20 Nov 2024 18:27:15 +0100 Subject: [PATCH 24/25] Pull in relevant fixes/improvements from #7777 --- .../html-api/class-wp-html-processor.php | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 2558128b27fac..b5a87c557e18d 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -476,7 +476,7 @@ public function create_fragment_at_current_node( string $html ) { */ if ( 'html' === $namespace && - in_array( $this->get_tag(), array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true ) + in_array( $this->current_element->token->node_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true ) ) { return null; } @@ -486,24 +486,20 @@ public function create_fragment_at_current_node( string $html ) { return null; } - $fragment_processor->change_parsing_namespace( - $this->current_element->token->integration_node_type ? 'html' : $namespace - ); - $fragment_processor->compat_mode = $this->compat_mode; $fragment_processor->context_node = clone $this->state->current_token; $fragment_processor->context_node->bookmark_name = 'context-node'; $fragment_processor->context_node->on_destroy = null; - $context_element = array( $fragment_processor->context_node->node_name, array() ); + $fragment_processor->state->context_node = array( $fragment_processor->context_node->node_name, array() ); foreach ( $this->get_attribute_names_with_prefix( '' ) as $name => $value ) { - $context_element[1][ $name ] = $value; + $fragment_processor->state->context_node[1][ $name ] = $value; } $fragment_processor->breadcrumbs = array( 'HTML', $fragment_processor->context_node->node_name ); - if ( 'TEMPLATE' === $context_element[0] ) { + if ( 'TEMPLATE' === $fragment_processor->context_node->node_name ) { $fragment_processor->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE; } @@ -526,6 +522,15 @@ public function create_fragment_at_current_node( string $html ) { $fragment_processor->state->encoding_confidence = 'irrelevant'; + /* + * Update the parsing namespace near the end of the process. + * This is important so that any push/pop from the stack of open + * elements does not change the parsing namespace. + */ + $fragment_processor->change_parsing_namespace( + $this->current_element->token->integration_node_type ? 'html' : $namespace + ); + return $fragment_processor; } From 0662156cb19b55f86c1b233b54622b8744165e5e Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 21 Nov 2024 11:48:53 +0100 Subject: [PATCH 25/25] Fix context node attributes --- src/wp-includes/html-api/class-wp-html-processor.php | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 8f56d46acb238..39196499fa5af 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -493,8 +493,12 @@ public function create_fragment_at_current_node( string $html ) { $fragment_processor->context_node->on_destroy = null; $fragment_processor->state->context_node = array( $fragment_processor->context_node->node_name, array() ); - foreach ( $this->get_attribute_names_with_prefix( '' ) as $name => $value ) { - $fragment_processor->state->context_node[1][ $name ] = $value; + + $attribute_names = $this->get_attribute_names_with_prefix( '' ); + if ( null !== $attribute_names ) { + foreach ( $attribute_names as $name ) { + $fragment_processor->state->context_node[1][ $name ] = $this->get_attribute( $name ); + } } $fragment_processor->breadcrumbs = array( 'HTML', $fragment_processor->context_node->node_name );
Inside TD?` tags are completely ignored. + * + * Compare that with an SVG context node that produces the following tree: + * + * ├─svg:td + * └─#text Inside TD? + * + * Here, a `td` node in the `svg` namespace is created, and its self-closing flag is respected. + * This is a peculiarity of parsing HTML in foreign content like SVG. + * + * Finally, consider the tree produced with a TABLE context node: + * + * └─TBODY + * └─TR + * └─TD + * └─#text Inside TD? + * + * These examples demonstrate how important the context node may be when processing an HTML + * fragment. Special care must be taken when processing fragments that are expected to appear + * in specific contexts. SVG and TABLE are good examples, but there are others. * * @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm * From a02e23835e5b8165c1222b16ee7279454635cd26 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 20 Nov 2024 17:19:18 +0100 Subject: [PATCH 22/25] Improve docblock language and formatting Co-authored-by: Bernie Reiter <96308+ockham@users.noreply.github.com> --- src/wp-includes/html-api/class-wp-html-processor.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 47d35f1009e27..6319a22cf3215 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -433,7 +433,7 @@ function ( WP_HTML_Token $token ): void { * The context node may impact how a fragment of HTML is parsed. For example, consider the HTML * fragment `Inside TD?