diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index cfcd6bdb94e94..39196499fa5af 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -424,6 +424,132 @@ function ( WP_HTML_Token $token ): void { }; } + /** + * Creates a fragment processor at the current node. + * + * HTML Fragment parsing always happens with a context node. HTML Fragment Processors can be + * instantiated with a `BODY` context node via `WP_HTML_Processor::create_fragment( $html )`. + * + * The context node may impact how a fragment of HTML is parsed. For example, consider the HTML + * fragment `<td />Inside TD?</td>`. + * + * A BODY context node will produce the following tree: + * + * └─#text Inside TD? + * + * Notice that the `<td>` tags are completely ignored. + * + * Compare that with an SVG context node that produces the following tree: + * + * ├─svg:td + * └─#text Inside TD? + * + * Here, a `td` node in the `svg` namespace is created, and its self-closing flag is respected. + * This is a peculiarity of parsing HTML in foreign content like SVG. + * + * Finally, consider the tree produced with a TABLE context node: + * + * └─TBODY + * └─TR + * └─TD + * └─#text Inside TD? + * + * These examples demonstrate how important the context node may be when processing an HTML + * fragment. Special care must be taken when processing fragments that are expected to appear + * in specific contexts. SVG and TABLE are good examples, but there are others. + * + * No processor is created, and null is returned, when the processor is not paused on an opening + * tag, when the current node is an HTML element whose contents require a special tokenizer state + * (e.g. SCRIPT or TEXTAREA), or when the underlying BODY fragment processor cannot be created. + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm + * + * @param string $html Input HTML fragment to process. + * @return static|null The created processor if successful, otherwise null. 
+ */ + public function create_fragment_at_current_node( string $html ) { + /* + * Only an element can act as the context node, so non-tag tokens and + * closing tags are rejected. + */ + if ( $this->get_token_type() !== '#tag' || $this->is_tag_closer() ) { + return null; + } + + $namespace = $this->current_element->token->namespace; + + /* + * Prevent creating fragments at nodes that require a special tokenizer state. + * This is unsupported by the HTML Processor. + */ + if ( + 'html' === $namespace && + in_array( $this->current_element->token->node_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true ) + ) { + return null; + } + + $fragment_processor = static::create_fragment( $html ); + if ( null === $fragment_processor ) { + return null; + } + + $fragment_processor->compat_mode = $this->compat_mode; + + /* + * Clone the same token which the checks above inspected so that the context + * node is always the element that was validated. + */ + $fragment_processor->context_node = clone $this->current_element->token; + $fragment_processor->context_node->bookmark_name = 'context-node'; + $fragment_processor->context_node->on_destroy = null; + + $fragment_processor->state->context_node = array( $fragment_processor->context_node->node_name, array() ); + + $attribute_names = $this->get_attribute_names_with_prefix( '' ); + if ( null !== $attribute_names ) { + foreach ( $attribute_names as $name ) { + $fragment_processor->state->context_node[1][ $name ] = $this->get_attribute( $name ); + } + } + + $fragment_processor->breadcrumbs = array( 'HTML', $fragment_processor->context_node->node_name ); + + if ( 'TEMPLATE' === $fragment_processor->context_node->node_name ) { + $fragment_processor->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE; + } + + $fragment_processor->reset_insertion_mode_appropriately(); + + /* + * > Set the parser's form element pointer to the nearest node to the context element that + * > is a form element (going straight up the ancestor chain, and including the element + * > itself, if it is a form element), if any. (If there is no such form element, the + * > form element pointer keeps its initial value, null.) 
+ */ + foreach ( $this->state->stack_of_open_elements->walk_up() as $element ) { + if ( 'FORM' === $element->node_name && 'html' === $element->namespace ) { + $fragment_processor->state->form_element = clone $element; + $fragment_processor->state->form_element->bookmark_name = null; + $fragment_processor->state->form_element->on_destroy = null; + break; + } + } + + $fragment_processor->state->encoding_confidence = 'irrelevant'; + + /* + * Update the parsing namespace near the end of the process. + * This is important so that any push/pop from the stack of open + * elements does not change the parsing namespace. + */ + $fragment_processor->change_parsing_namespace( + $this->current_element->token->integration_node_type ? 'html' : $namespace + ); + + return $fragment_processor; + } + /** * Stops the parser and terminates its execution when encountering unsupported markup. * diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index 1ca60e691f03e..a19af13c78925 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -1043,6 +1043,66 @@ public function test_ensure_next_token_method_extensibility( $html, $expected_to $this->assertEquals( $expected_token_counts, $processor->token_seen_count, 'Snapshot: ' . var_export( $processor->token_seen_count, true ) ); } + /** + * @ticket 62357 + */ + public function test_create_fragment_at_current_node_in_foreign_content() { + $processor = WP_HTML_Processor::create_full_parser( '<svg>' ); + $this->assertTrue( $processor->next_tag( 'SVG' ) ); + + $fragment = $processor->create_fragment_at_current_node( "\0preceded-by-nul-byte<rect /><circle></circle><foreignObject>
" ); + + $this->assertSame( 'svg', $fragment->get_namespace() ); + $this->assertTrue( $fragment->next_token() ); + + /* + * In HTML parsing, a nul byte would be ignored. + * In SVG it should be replaced with a replacement character. + */ + $this->assertSame( '#text', $fragment->get_token_type() ); + $this->assertSame( "\u{FFFD}", $fragment->get_modifiable_text() ); + + $this->assertTrue( $fragment->next_tag( 'RECT' ) ); + $this->assertSame( 'svg', $fragment->get_namespace() ); + + $this->assertTrue( $fragment->next_tag( 'CIRCLE' ) ); + $this->assertSame( array( 'HTML', 'SVG', 'CIRCLE' ), $fragment->get_breadcrumbs() ); + $this->assertTrue( $fragment->next_tag( 'foreignObject' ) ); + $this->assertSame( 'svg', $fragment->get_namespace() ); + } + + /** + * @ticket 62357 + */ + public function test_create_fragment_at_current_node_in_foreign_content_integration_point() { + $processor = WP_HTML_Processor::create_full_parser( '' ); + $this->assertTrue( $processor->next_tag( 'foreignObject' ) ); + + $fragment = $processor->create_fragment_at_current_node( "\0not-preceded-by-nul-byte" ); + + // Nothing has been processed, the html namespace should be used for parsing as an integration point. + $this->assertSame( 'html', $fragment->get_namespace() ); + + // HTML parsing transforms IMAGE into IMG. + $this->assertTrue( $fragment->next_tag( 'IMG' ) ); + + $this->assertTrue( $fragment->next_token() ); + + // In HTML parsing, the nul byte is ignored and the text is reached. + $this->assertSame( '#text', $fragment->get_token_type() ); + $this->assertSame( 'not-preceded-by-nul-byte', $fragment->get_modifiable_text() ); + + /* + * svg:foreignObject is an HTML integration point, so the processor should be in the HTML namespace. + * RECT is an HTML element here, meaning it may have the self-closing flag but does not self-close. 
+ */ + $this->assertTrue( $fragment->next_tag( 'RECT' ) ); + $this->assertSame( array( 'HTML', 'FOREIGNOBJECT', 'RECT' ), $fragment->get_breadcrumbs() ); + $this->assertSame( 'html', $fragment->get_namespace() ); + $this->assertTrue( $fragment->has_self_closing_flag() ); + $this->assertTrue( $fragment->expects_closer() ); + } + /** * Ensure that lowercased tag_name query matches tags case-insensitively. * diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index 808fa39d17f26..7abe63a859954 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -138,10 +138,6 @@ public function data_external_html5lib_tests() { * @return bool True if the test case should be skipped. False otherwise. */ private static function should_skip_test( ?string $test_context_element, string $test_name ): bool { - if ( null !== $test_context_element && 'body' !== $test_context_element ) { - return true; - } - if ( array_key_exists( $test_name, self::SKIP_TESTS ) ) { return true; } @@ -157,18 +153,79 @@ private static function should_skip_test( ?string $test_context_element, string * @return string|null Tree structure of parsed HTML, if supported, else null. */ private static function build_tree_representation( ?string $fragment_context, string $html ) { - $processor = $fragment_context - ? 
WP_HTML_Processor::create_fragment( $html, "<{$fragment_context}>" ) - : WP_HTML_Processor::create_full_parser( $html ); - if ( null === $processor ) { - throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() ); + $processor = null; + if ( $fragment_context ) { + if ( 'body' === $fragment_context ) { + $processor = WP_HTML_Processor::create_fragment( $html ); + } else { + + /* + * If the string of characters starts with "svg ", the context + * element is in the SVG namespace and the substring after + * "svg " is the local name. If the string of characters starts + * with "math ", the context element is in the MathML namespace + * and the substring after "math " is the local name. + * Otherwise, the context element is in the HTML namespace and + * the string is the local name. + */ + if ( str_starts_with( $fragment_context, 'svg ' ) ) { + $tag_name = substr( $fragment_context, 4 ); + if ( 'svg' === $tag_name ) { + $parent_processor = WP_HTML_Processor::create_full_parser( '' ); + } else { + $parent_processor = WP_HTML_Processor::create_full_parser( "<{$tag_name}>" ); + } + $parent_processor->next_tag( $tag_name ); + } elseif ( str_starts_with( $fragment_context, 'math ' ) ) { + $tag_name = substr( $fragment_context, 5 ); + if ( 'math' === $tag_name ) { + $parent_processor = WP_HTML_Processor::create_full_parser( '' ); + } else { + $parent_processor = WP_HTML_Processor::create_full_parser( "<{$tag_name}>" ); + } + $parent_processor->next_tag( $tag_name ); + } else { + if ( in_array( + $fragment_context, + array( + 'caption', + 'col', + 'colgroup', + 'tbody', + 'td', + 'tfoot', + 'th', + 'thead', + 'tr', + ), + true + ) ) { + $parent_processor = WP_HTML_Processor::create_full_parser( "<{$fragment_context}>" ); + $parent_processor->next_tag(); + } else { + $parent_processor = WP_HTML_Processor::create_full_parser( "<{$fragment_context}>" ); + } + 
$parent_processor->next_tag( $fragment_context ); + } + if ( null !== $parent_processor->get_unsupported_exception() ) { + throw $parent_processor->get_unsupported_exception(); + } + if ( null !== $parent_processor->get_last_error() ) { + throw new Exception( $parent_processor->get_last_error() ); + } + $processor = $parent_processor->create_fragment_at_current_node( $html ); + } + + if ( null === $processor ) { + throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() ); + } + } else { + $processor = WP_HTML_Processor::create_full_parser( $html ); + if ( null === $processor ) { + throw new Exception( 'Could not create a full parser.' ); + } } - /* - * The fragment parser will start in 2 levels deep at: html > body > [position] - * and requires adjustment to initial parameters. - * The full parser will not. - */ $output = ''; $indent_level = 0; $was_text = null;