diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php
index 48f7d7fe8c781..1be795c5c7de2 100644
--- a/src/wp-includes/html-api/class-wp-html-processor.php
+++ b/src/wp-includes/html-api/class-wp-html-processor.php
@@ -279,51 +279,62 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* form is provided because a context element may have attributes that
* impact the parse, such as with a SCRIPT tag and its `type` attribute.
*
- * ## Current HTML Support
+ * Example:
+ *
+ * // Usually, snippets of HTML ought to be processed in the default `<body>` context:
+ * $processor = WP_HTML_Processor::create_fragment( '' );
+ *
+ * // This fragment with TD tags should be processed in a TR context:
+ * $processor = WP_HTML_Processor::create_fragment(
+ * '<td>1<td>2<td>3',
+ * '<table><tbody><tr>'
+ * );
*
- * - The only supported context is `<body>`, which is the default value.
- * - The only supported document encoding is `UTF-8`, which is the default value.
+ * In order to create a fragment processor at the correct location, the
+ * provided fragment will be processed as part of a full HTML document.
+ * The processor will search for the last opener tag in the document and
+ * create a fragment processor at that location. The document will be
+ * forced into "no-quirks" mode by including the HTML5 doctype.
+ *
+ * For advanced usage and precise control over the context element, use
+ * `WP_HTML_Processor::create_full_processor()` and
+ * `WP_HTML_Processor::create_fragment_at_current_node()`.
+ *
+ * UTF-8 is the only allowed encoding. If working with a document that
+ * isn't UTF-8, first convert the document to UTF-8, then pass in the
+ * converted HTML.
*
* @since 6.4.0
* @since 6.6.0 Returns `static` instead of `self` so it can create subclass instances.
+ * @since 6.8.0 Can create fragments with any context element.
*
* @param string $html Input HTML fragment to process.
- * @param string $context Context element for the fragment, must be default of `<body>`.
+ * @param string $context Context element for the fragment. Defaults to `<body>`.
* @param string $encoding Text encoding of the document; must be default of 'UTF-8'.
* @return static|null The created processor if successful, otherwise null.
*/
public static function create_fragment( $html, $context = '', $encoding = 'UTF-8' ) {
- if ( '' !== $context || 'UTF-8' !== $encoding ) {
+ $context_processor = static::create_full_parser( "{$context}", $encoding );
+ if ( null === $context_processor ) {
return null;
}
- $processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE );
- $processor->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
- $processor->state->encoding = $encoding;
- $processor->state->encoding_confidence = 'certain';
-
- // @todo Create "fake" bookmarks for non-existent but implied nodes.
- $processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 );
- $processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 );
-
- $root_node = new WP_HTML_Token(
- 'root-node',
- 'HTML',
- false
- );
-
- $processor->state->stack_of_open_elements->push( $root_node );
-
- $context_node = new WP_HTML_Token(
- 'context-node',
- 'BODY',
- false
- );
+ while ( $context_processor->next_tag() ) {
+ $context_processor->set_bookmark( 'final_node' );
+ }
- $processor->context_node = $context_node;
- $processor->breadcrumbs = array( 'HTML', $context_node->node_name );
+ if (
+ ! $context_processor->has_bookmark( 'final_node' ) ||
+ ! $context_processor->seek( 'final_node' )
+ ) {
+ _doing_it_wrong( __METHOD__, __( 'No valid context element was detected.' ), '6.8.0' );
+ return null;
+ }
- return $processor;
+ return $context_processor->create_fragment_at_current_node( $html );
}
/**
@@ -333,9 +344,9 @@ public static function create_fragment( $html, $context = '', $encoding =
* entire HTML document from start to finish. Consider a fragment parser with
* a context node of `<body>`.
*
- * Since UTF-8 is the only currently-accepted charset, if working with a
- * document that isn't UTF-8, it's important to convert the document before
- * creating the processor: pass in the converted HTML.
+ * UTF-8 is the only allowed encoding. If working with a document that
+ * isn't UTF-8, first convert the document to UTF-8, then pass in the
+ * converted HTML.
*
* @param string $html Input HTML document to process.
* @param string|null $known_definite_encoding Optional. If provided, specifies the charset used
@@ -459,35 +470,72 @@ function ( WP_HTML_Token $token ): void {
*
* @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm
*
+ * @since 6.8.0
+ *
* @param string $html Input HTML fragment to process.
* @return static|null The created processor if successful, otherwise null.
*/
public function create_fragment_at_current_node( string $html ) {
if ( $this->get_token_type() !== '#tag' || $this->is_tag_closer() ) {
+ _doing_it_wrong(
+ __METHOD__,
+ __( 'The context element must be a start tag.' ),
+ '6.8.0'
+ );
return null;
}
+ $tag_name = $this->current_element->token->node_name;
$namespace = $this->current_element->token->namespace;
+ if ( 'html' === $namespace && self::is_void( $tag_name ) ) {
+ _doing_it_wrong(
+ __METHOD__,
+ sprintf(
+ // translators: %s: A tag name like INPUT or BR.
+ __( 'The context element cannot be a void element, found "%s".' ),
+ $tag_name
+ ),
+ '6.8.0'
+ );
+ return null;
+ }
+
/*
* Prevent creating fragments at nodes that require a special tokenizer state.
* This is unsupported by the HTML Processor.
*/
if (
'html' === $namespace &&
- in_array( $this->current_element->token->node_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true )
+ in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true )
) {
+ _doing_it_wrong(
+ __METHOD__,
+ sprintf(
+ // translators: %s: A tag name like IFRAME or TEXTAREA.
+ __( 'The context element "%s" is not supported.' ),
+ $tag_name
+ ),
+ '6.8.0'
+ );
return null;
}
- $fragment_processor = static::create_fragment( $html );
- if ( null === $fragment_processor ) {
- return null;
- }
+ $fragment_processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE );
$fragment_processor->compat_mode = $this->compat_mode;
- $fragment_processor->context_node = clone $this->state->current_token;
+ // @todo Create "fake" bookmarks for non-existent but implied nodes.
+ $fragment_processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 );
+ $root_node = new WP_HTML_Token(
+ 'root-node',
+ 'HTML',
+ false
+ );
+ $fragment_processor->state->stack_of_open_elements->push( $root_node );
+
+ $fragment_processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 );
+ $fragment_processor->context_node = clone $this->current_element->token;
$fragment_processor->context_node->bookmark_name = 'context-node';
$fragment_processor->context_node->on_destroy = null;
diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php
index f80260cbc1aa6..1ca60e691f03e 100644
--- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php
+++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php
@@ -1043,83 +1043,6 @@ public function test_ensure_next_token_method_extensibility( $html, $expected_to
$this->assertEquals( $expected_token_counts, $processor->token_seen_count, 'Snapshot: ' . var_export( $processor->token_seen_count, true ) );
}
- /**
- * @ticket 62357
- */
- public function test_create_fragment_at_current_node_in_foreign_content() {
- $processor = WP_HTML_Processor::create_full_parser( '' );
- $this->assertTrue( $processor->next_tag( 'SVG' ) );
-
- $fragment = $processor->create_fragment_at_current_node( "\0preceded-by-nul-byte
" );
-
- $this->assertSame( 'svg', $fragment->get_namespace() );
- $this->assertTrue( $fragment->next_token() );
-
- /*
- * In HTML parsing, a nul byte would be ignored.
- * In SVG it should be replaced with a replacement character.
- */
- $this->assertSame( '#text', $fragment->get_token_type() );
- $this->assertSame( "\u{FFFD}", $fragment->get_modifiable_text() );
-
- $this->assertTrue( $fragment->next_tag( 'RECT' ) );
- $this->assertSame( 'svg', $fragment->get_namespace() );
-
- $this->assertTrue( $fragment->next_tag( 'CIRCLE' ) );
- $this->assertSame( array( 'HTML', 'SVG', 'CIRCLE' ), $fragment->get_breadcrumbs() );
- $this->assertTrue( $fragment->next_tag( 'foreignObject' ) );
- $this->assertSame( 'svg', $fragment->get_namespace() );
- }
-
- /**
- * @ticket 62357
- */
- public function test_create_fragment_at_current_node_in_foreign_content_integration_point() {
- $processor = WP_HTML_Processor::create_full_parser( '' );
- $this->assertTrue( $processor->next_tag( 'foreignObject' ) );
-
- $fragment = $processor->create_fragment_at_current_node( "\0not-preceded-by-nul-byte " );
-
- // Nothing has been processed, the html namespace should be used for parsing as an integration point.
- $this->assertSame( 'html', $fragment->get_namespace() );
-
- // HTML parsing transforms IMAGE into IMG.
- $this->assertTrue( $fragment->next_tag( 'IMG' ) );
-
- $this->assertTrue( $fragment->next_token() );
-
- // In HTML parsing, the nul byte is ignored and the text is reached.
- $this->assertSame( '#text', $fragment->get_token_type() );
- $this->assertSame( 'not-preceded-by-nul-byte', $fragment->get_modifiable_text() );
-
- /*
- * svg:foreignObject is an HTML integration point, so the processor should be in the HTML namespace.
- * RECT is an HTML element here, meaning it may have the self-closing flag but does not self-close.
- */
- $this->assertTrue( $fragment->next_tag( 'RECT' ) );
- $this->assertSame( array( 'HTML', 'FOREIGNOBJECT', 'RECT' ), $fragment->get_breadcrumbs() );
- $this->assertSame( 'html', $fragment->get_namespace() );
- $this->assertTrue( $fragment->has_self_closing_flag() );
- $this->assertTrue( $fragment->expects_closer() );
- }
-
- /**
- * @ticket 62357
- */
- public function test_prevent_fragment_creation_on_closers() {
- $processor = WP_HTML_Processor::create_full_parser( '
' );
- $processor->next_tag( 'P' );
- $processor->next_tag(
- array(
- 'tag_name' => 'P',
- 'tag_closers' => 'visit',
- )
- );
- $this->assertSame( 'P', $processor->get_tag() );
- $this->assertTrue( $processor->is_tag_closer() );
- $this->assertNull( $processor->create_fragment_at_current_node( 'fragment HTML ' ) );
- }
-
/**
* Ensure that lowercased tag_name query matches tags case-insensitively.
*
diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorFragmentParsing.php b/tests/phpunit/tests/html-api/wpHtmlProcessorFragmentParsing.php
new file mode 100644
index 0000000000000..4913fa07eb412
--- /dev/null
+++ b/tests/phpunit/tests/html-api/wpHtmlProcessorFragmentParsing.php
@@ -0,0 +1,178 @@
+' );
+ $this->assertTrue( $processor->next_tag( 'SVG' ) );
+
+ $fragment = $processor->create_fragment_at_current_node( "\0preceded-by-nul-byte
" );
+
+ $this->assertSame( 'svg', $fragment->get_namespace() );
+ $this->assertTrue( $fragment->next_token() );
+
+ /*
+ * In HTML parsing, a nul byte would be ignored.
+ * In SVG it should be replaced with a replacement character.
+ */
+ $this->assertSame( '#text', $fragment->get_token_type() );
+ $this->assertSame( "\u{FFFD}", $fragment->get_modifiable_text() );
+
+ $this->assertTrue( $fragment->next_tag( 'RECT' ) );
+ $this->assertSame( 'svg', $fragment->get_namespace() );
+
+ $this->assertTrue( $fragment->next_tag( 'CIRCLE' ) );
+ $this->assertSame( array( 'HTML', 'SVG', 'CIRCLE' ), $fragment->get_breadcrumbs() );
+ $this->assertTrue( $fragment->next_tag( 'foreignObject' ) );
+ $this->assertSame( 'svg', $fragment->get_namespace() );
+ }
+
+ /**
+ * @ticket 62357
+ */
+ public function test_create_fragment_at_current_node_in_foreign_content_integration_point() {
+ $processor = WP_HTML_Processor::create_full_parser( '' );
+ $this->assertTrue( $processor->next_tag( 'foreignObject' ) );
+
+ $fragment = $processor->create_fragment_at_current_node( "\0not-preceded-by-nul-byte " );
+
+ // Nothing has been processed, the html namespace should be used for parsing as an integration point.
+ $this->assertSame( 'html', $fragment->get_namespace() );
+
+ // HTML parsing transforms IMAGE into IMG.
+ $this->assertTrue( $fragment->next_tag( 'IMG' ) );
+
+ $this->assertTrue( $fragment->next_token() );
+
+ // In HTML parsing, the nul byte is ignored and the text is reached.
+ $this->assertSame( '#text', $fragment->get_token_type() );
+ $this->assertSame( 'not-preceded-by-nul-byte', $fragment->get_modifiable_text() );
+
+ /*
+ * svg:foreignObject is an HTML integration point, so the processor should be in the HTML namespace.
+ * RECT is an HTML element here, meaning it may have the self-closing flag but does not self-close.
+ */
+ $this->assertTrue( $fragment->next_tag( 'RECT' ) );
+ $this->assertSame( array( 'HTML', 'FOREIGNOBJECT', 'RECT' ), $fragment->get_breadcrumbs() );
+ $this->assertSame( 'html', $fragment->get_namespace() );
+ $this->assertTrue( $fragment->has_self_closing_flag() );
+ $this->assertTrue( $fragment->expects_closer() );
+ }
+
+ /**
+ * @expectedIncorrectUsage WP_HTML_Processor::create_fragment_at_current_node
+ * @ticket 62357
+ */
+ public function test_prevent_fragment_creation_on_closers() {
+ $processor = WP_HTML_Processor::create_full_parser( '
' );
+ $processor->next_tag( 'P' );
+ $processor->next_tag(
+ array(
+ 'tag_name' => 'P',
+ 'tag_closers' => 'visit',
+ )
+ );
+ $this->assertSame( 'P', $processor->get_tag() );
+ $this->assertTrue( $processor->is_tag_closer() );
+ $this->assertNull( $processor->create_fragment_at_current_node( 'fragment HTML ' ) );
+ }
+
+ /**
+ * Verifies that the fragment parser doesn't allow invalid context nodes.
+ *
+ * This includes void elements and self-contained elements because they can
+ * contain no inner HTML. Operations on self-contained elements should occur
+ * through methods such as {@see WP_HTML_Tag_Processor::set_modifiable_text}.
+ *
+ * @ticket 62584
+ *
+ * @dataProvider data_invalid_fragment_contexts
+ *
+ * @param string $context Invalid context node for fragment parser.
+ */
+ public function test_rejects_invalid_fragment_contexts( string $context, string $doing_it_wrong_method_name ) {
+ $this->setExpectedIncorrectUsage( "WP_HTML_Processor::{$doing_it_wrong_method_name}" );
+ $this->assertNull(
+ WP_HTML_Processor::create_fragment( 'just a test', $context ),
+ "Should not have been able to create a fragment parser with context node {$context}"
+ );
+ }
+
+ /**
+ * Data provider.
+ *
+ * @return array[]
+ */
+ public static function data_invalid_fragment_contexts() {
+ return array(
+ /*
+ * Invalid contexts.
+ */
+ /*
+ * The text node is confused with a virtual body open tag.
+ * This should fail to set a bookmark in `create_fragment`
+ * but currently does not; it slips through and fails in
+ * `create_fragment_at_current_node`.
+ */
+ 'Invalid text' => array( 'just some text', 'create_fragment_at_current_node' ),
+ 'Invalid comment' => array( '', 'create_fragment' ),
+ 'Invalid closing' => array( '', 'create_fragment' ),
+ 'Invalid DOCTYPE' => array( '', 'create_fragment' ),
+ /*
+ * PLAINTEXT should appear in the unsupported elements, but at the
+ * moment it's completely unsupported by the processor so
+ * the context element cannot be found.
+ */
+ 'Unsupported PLAINTEXT' => array( '', 'create_fragment' ),
+
+ /*
+ * Void elements: these cannot contain inner HTML.
+ */
+ 'AREA' => array( ' ', 'create_fragment_at_current_node' ),
+ 'BASE' => array( ' ', 'create_fragment_at_current_node' ),
+ 'BASEFONT' => array( ' ', 'create_fragment_at_current_node' ),
+ 'BGSOUND' => array( '', 'create_fragment_at_current_node' ),
+ 'BR' => array( ' ', 'create_fragment_at_current_node' ),
+ 'COL' => array( ' ', 'create_fragment_at_current_node' ),
+ 'EMBED' => array( '', 'create_fragment_at_current_node' ),
+ 'FRAME' => array( ' ', 'create_fragment_at_current_node' ),
+ 'HR' => array( ' ', 'create_fragment_at_current_node' ),
+ 'IMG' => array( ' ', 'create_fragment_at_current_node' ),
+ 'INPUT' => array( ' ', 'create_fragment_at_current_node' ),
+ 'KEYGEN' => array( '', 'create_fragment_at_current_node' ),
+ 'LINK' => array( ' ', 'create_fragment_at_current_node' ),
+ 'META' => array( ' ', 'create_fragment_at_current_node' ),
+ 'PARAM' => array( ' ', 'create_fragment_at_current_node' ),
+ 'SOURCE' => array( '', 'create_fragment_at_current_node' ),
+ 'TRACK' => array( '', 'create_fragment_at_current_node' ),
+ 'WBR' => array( '', 'create_fragment_at_current_node' ),
+
+ /*
+ * Unsupported elements. Include a tag closer to ensure the element can be found
+ * and does not pause the parser at an incomplete token.
+ */
+ 'IFRAME' => array( '', 'create_fragment_at_current_node' ),
+ 'NOEMBED' => array( ' ', 'create_fragment_at_current_node' ),
+ 'NOFRAMES' => array( ' ', 'create_fragment_at_current_node' ),
+ 'SCRIPT' => array( '', 'create_fragment_at_current_node' ),
+ 'SCRIPT with type' => array( '', 'create_fragment_at_current_node' ),
+ 'STYLE' => array( '', 'create_fragment_at_current_node' ),
+ 'TEXTAREA' => array( '', 'create_fragment_at_current_node' ),
+ 'TITLE' => array( ' ', 'create_fragment_at_current_node' ),
+ 'XMP' => array( ' ', 'create_fragment_at_current_node' ),
+ );
+ }
+}
diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php
index 7abe63a859954..5e0c3b77f8732 100644
--- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php
+++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php
@@ -153,69 +153,55 @@ private static function should_skip_test( ?string $test_context_element, string
* @return string|null Tree structure of parsed HTML, if supported, else null.
*/
private static function build_tree_representation( ?string $fragment_context, string $html ) {
- $processor = null;
if ( $fragment_context ) {
- if ( 'body' === $fragment_context ) {
- $processor = WP_HTML_Processor::create_fragment( $html );
- } else {
-
- /*
- * If the string of characters starts with "svg ", the context
- * element is in the SVG namespace and the substring after
- * "svg " is the local name. If the string of characters starts
- * with "math ", the context element is in the MathML namespace
- * and the substring after "math " is the local name.
- * Otherwise, the context element is in the HTML namespace and
- * the string is the local name.
- */
- if ( str_starts_with( $fragment_context, 'svg ' ) ) {
- $tag_name = substr( $fragment_context, 4 );
- if ( 'svg' === $tag_name ) {
- $parent_processor = WP_HTML_Processor::create_full_parser( '' );
- } else {
- $parent_processor = WP_HTML_Processor::create_full_parser( "<{$tag_name}>" );
- }
- $parent_processor->next_tag( $tag_name );
- } elseif ( str_starts_with( $fragment_context, 'math ' ) ) {
- $tag_name = substr( $fragment_context, 5 );
- if ( 'math' === $tag_name ) {
- $parent_processor = WP_HTML_Processor::create_full_parser( '' );
- } else {
- $parent_processor = WP_HTML_Processor::create_full_parser( "<{$tag_name}>" );
- }
- $parent_processor->next_tag( $tag_name );
+ /*
+ * If the string of characters starts with "svg ", the context
+ * element is in the SVG namespace and the substring after
+ * "svg " is the local name. If the string of characters starts
+ * with "math ", the context element is in the MathML namespace
+ * and the substring after "math " is the local name.
+ * Otherwise, the context element is in the HTML namespace and
+ * the string is the local name.
+ */
+ if ( str_starts_with( $fragment_context, 'svg ' ) ) {
+ $tag_name = substr( $fragment_context, 4 );
+ if ( 'svg' === $tag_name ) {
+ $fragment_context_html = '';
} else {
- if ( in_array(
- $fragment_context,
- array(
- 'caption',
- 'col',
- 'colgroup',
- 'tbody',
- 'td',
- 'tfoot',
- 'th',
- 'thead',
- 'tr',
- ),
- true
- ) ) {
- $parent_processor = WP_HTML_Processor::create_full_parser( "<{$fragment_context}>" );
- $parent_processor->next_tag();
- } else {
- $parent_processor = WP_HTML_Processor::create_full_parser( "<{$fragment_context}>" );
- }
- $parent_processor->next_tag( $fragment_context );
+ $fragment_context_html = "<{$tag_name}>";
}
- if ( null !== $parent_processor->get_unsupported_exception() ) {
- throw $parent_processor->get_unsupported_exception();
+ } elseif ( str_starts_with( $fragment_context, 'math ' ) ) {
+ $tag_name = substr( $fragment_context, 5 );
+ if ( 'math' === $tag_name ) {
+ $fragment_context_html = '';
+ } else {
+ $fragment_context_html = "<{$tag_name}>";
}
- if ( null !== $parent_processor->get_last_error() ) {
- throw new Exception( $parent_processor->get_last_error() );
+ } else {
+ // Tags that only appear in tables need a special case.
+ if ( in_array(
+ $fragment_context,
+ array(
+ 'caption',
+ 'col',
+ 'colgroup',
+ 'tbody',
+ 'td',
+ 'tfoot',
+ 'th',
+ 'thead',
+ 'tr',
+ ),
+ true
+ ) ) {
+ $fragment_context_html = "<{$fragment_context}>";
+ } else {
+ $fragment_context_html = "<{$fragment_context}>";
}
- $processor = $parent_processor->create_fragment_at_current_node( $html );
}
+ $processor = WP_HTML_Processor::create_fragment( $html, $fragment_context_html );
+
if ( null === $processor ) {
throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() );
}