From d456844b2404b880a542a9b013f0405bade48af7 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 10 Sep 2024 18:08:48 -0700 Subject: [PATCH 1/5] WIP: HTML API: Add `set_inner_html()` to HTML Processor This method depends on other methods, but this change introduces an idea for how to accomplish setting the inner HTML, leaving supporting details unresolved. --- .../html-api/class-wp-html-processor.php | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 0ca7a52a2a329..09c8976c2dc59 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -5024,6 +5024,66 @@ public function get_comment_type(): ?string { return $this->is_virtual() ? null : parent::get_comment_type(); } + /** + * Replaces the inner markup of the currently-matched tag with provided HTML. + * + * This function will normalize the given input and enforce the boundaries + * within the existing HTML where it's called. + * + * @since 6.8.0 + * + * @param string $new_inner_html New HTML to inject as inner HTML for the currently-matched tag. + * @return bool|null Whether the inner markup was modified for the currently-matched tag, or `NULL` + * if called on a node which doesn't allow changing the inner HTML. + */ + public function set_inner_html( string $new_inner_html ): ?bool { + $tag_name = $this->get_tag(); + + if ( + WP_HTML_Tag_Processor::STATE_MATCHED_TAG !== $this->parser_state || + $this->is_tag_closer() || + ( 'html' === $this->get_namespace() && + ( + self::is_void( $tag_name ) || + in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) + ) + ) + ) { + // @todo Support setting inner HTML for SCRIPT, STYLE, TEXTAREA, and TITLE. 
+ return null; + } + + $fragment = $this->spawn_fragment_parser( $new_inner_html ); + $new_markup = $fragment->serialize(); + + $this->set_bookmark( 'start' ); + $depth = $this->get_current_depth(); + while ( $this->get_current_depth() >= $depth && $this->next_token() ) { + continue; + } + + if ( + $this->paused_at_incomplete_token() || + null !== $this->get_last_error() + ) { + return false; + } + + $this->set_bookmark( 'end' ); + $start = $this->bookmarks['_start']; + $end = $this->bookmarks['_end']; + + $this->lexical_updates[] = new WP_HTML_Text_Replacement( + $start->start + $start->length, + $end->start - ( $start->start + $start->length ), + $new_markup + ); + + $this->get_updated_html(); + $this->seek( 'start' ); + return true; + } + /** * Removes a bookmark that is no longer needed. * From 7e97aedebc97554c9be8e8bce88ea6e215a101c8 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 11 Sep 2024 06:08:53 -0700 Subject: [PATCH 2/5] Add normalization and serialization methods. --- .../html-api/class-wp-html-processor.php | 82 +++++++++++++++++++ .../html-api/class-wp-html-tag-processor.php | 2 +- 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 09c8976c2dc59..3914428040153 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -5024,6 +5024,88 @@ public function get_comment_type(): ?string { return $this->is_virtual() ? null : parent::get_comment_type(); } + /** + * Normalize an HTML string by serializing it. + * + * @since 6.7.0 + * + * @param string $html Input HTML to normalize. + * @return string|null Normalized output, or `null` if unable to normalize. + */ + public static function normalize( string $html ): ?string { + return static::create_fragment( $html )->serialize(); + } + + /** + * Generate normalized markup for the HTML in the provided processor. 
+ * + * @since 6.7.0 + * + * @return string|null Normalized HTML markup represented by processor, + * or `null` if unable to generate serialization. + */ + public function serialize(): ?string { + if ( WP_HTML_Tag_Processor::STATE_READY !== $this->parser_state ) { + return null; + } + + $html = ''; + while ( $this->next_token() ) { + $token_type = $this->get_token_type(); + + switch ( $token_type ) { + case '#text': + $html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); + break; + + case '#funky-comment': + case '#comment': + $html .= "<!--{$this->get_modifiable_text()}-->"; + break; + + case '#cdata-section': + $html .= "<![CDATA[{$this->get_modifiable_text()}]]>"; + break; + + case 'html': + $html .= '<!DOCTYPE html>'; + break; + } + + if ( '#tag' !== $token_type ) { + continue; + } + + if ( $this->is_tag_closer() ) { + $html .= "</{$this->get_qualified_tag_name()}>"; + continue; + } + + $attribute_names = $this->get_attribute_names_with_prefix( '' ); + if ( ! isset( $attribute_names ) ) { + $html .= "<{$this->get_qualified_tag_name()}>"; + continue; + } + + $html .= "<{$this->get_qualified_tag_name()}"; + foreach ( $attribute_names as $attribute_name ) { + $html .= " {$this->get_qualified_attribute_name( $attribute_name )}"; + $value = $this->get_attribute( $attribute_name ); + + if ( is_string( $value ) ) { + $html .= '"' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . '"'; + } + } + $html .= '>'; + } + + if ( $this->paused_at_incomplete_token() || null !== $this->get_last_error() ) { + return null; + } + + return $html; + } + /** * Replaces the inner markup of the currently-matched tag with provided HTML. 
* diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 233d47eb8da95..355a7bb001923 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2840,7 +2840,7 @@ public function get_qualified_tag_name(): ?string { } if ( 'html' === $this->get_namespace() ) { - return $tag_name; + return strtolower( $tag_name ); } $lower_tag_name = strtolower( $tag_name ); From 945c817be279ec8928f898703467354a17dd72a1 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 11 Sep 2024 06:14:41 -0700 Subject: [PATCH 3/5] Docs and missing attribute equals sign --- src/wp-includes/html-api/class-wp-html-processor.php | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 3914428040153..a95dbfd815457 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -5027,9 +5027,12 @@ public function get_comment_type(): ?string { /** * Normalize an HTML string by serializing it. * + * This removes any partial syntax at the end of the string. + * * @since 6.7.0 * * @param string $html Input HTML to normalize. + * * @return string|null Normalized output, or `null` if unable to normalize. */ public static function normalize( string $html ): ?string { @@ -5039,6 +5042,8 @@ public static function normalize( string $html ): ?string { /** * Generate normalized markup for the HTML in the provided processor. * + * This removes any partial syntax at the end of the string. + * * @since 6.7.0 * * @return string|null Normalized HTML markup represented by processor, @@ -5093,13 +5098,13 @@ public function serialize(): ?string { $value = $this->get_attribute( $attribute_name ); if ( is_string( $value ) ) { - $html .= '"' . 
htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . '"'; + $html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . '"'; } } $html .= '>'; } - if ( $this->paused_at_incomplete_token() || null !== $this->get_last_error() ) { + if ( null !== $this->get_last_error() ) { return null; } From 93b48e7fa735b9ab3ae7114d4c588744b6fbf665 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 11 Sep 2024 06:23:50 -0700 Subject: [PATCH 4/5] Fix wrong IMAGE > IMG assignment in foreign content. --- .../html-api/class-wp-html-processor.php | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index a95dbfd815457..c50032829f63f 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -4727,17 +4727,13 @@ public function get_tag(): ?string { $tag_name = parent::get_tag(); - switch ( $tag_name ) { - case 'IMAGE': - /* - * > A start tag whose tag name is "image" - * > Change the token's tag name to "img" and reprocess it. (Don't ask.) - */ - return 'IMG'; - - default: - return $tag_name; - } + /* + * > A start tag whose tag name is "image" + * > Change the token's tag name to "img" and reprocess it. (Don't ask.) + */ + return ( 'IMAGE' === $tag_name && 'html' === $this->get_namespace() ) + ? 'IMG' + : $tag_name; } /** @@ -5101,6 +5097,11 @@ public function serialize(): ?string { $html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . 
'"'; } } + + if ( 'html' !== $this->get_namespace() && $this->has_self_closing_flag() ) { + $html .= '/'; + } + $html .= '>'; } From c621467041307f073457779aa21a6737196befcc Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 12 Sep 2024 17:53:03 +0200 Subject: [PATCH 5/5] Add spawn_fragment_parser method --- .../html-api/class-wp-html-processor.php | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index c50032829f63f..ed68efc52979c 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -422,6 +422,55 @@ function ( WP_HTML_Token $token ): void { }; } + /** + * Creates a fragment processor with the current node as its context element. + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm + * + * @param string $html Input HTML fragment to process. + * @return static|null The created processor if successful, otherwise null. + */ + private function spawn_fragment_parser( string $html ): ?self { + if ( $this->get_token_type() !== '#tag' ) { + return null; + } + + /* + * Prevent creating fragments at "self-contained" nodes. 
+ * + * @see https://github.com/WordPress/wordpress-develop/pull/7141 + * @see https://github.com/WordPress/wordpress-develop/pull/7198 + */ + if ( + 'html' === $this->get_namespace() && + in_array( $this->get_tag(), array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) + ) { + return null; + } + + $fragment_processor = self::create_fragment( $html ); + $fragment_processor->compat_mode = $this->compat_mode; + + // @todo The context element probably needs a namespace. + $context_element = array( $this->get_tag(), array() ); + foreach ( $this->get_attribute_names_with_prefix( '' ) as $name => $value ) { + $context_element[1][ $name ] = $value; + } + $fragment_processor->state->context_node = $context_element; + + if ( 'TEMPLATE' === $context_element[0] ) { + $fragment_processor->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE; + } + + $fragment_processor->reset_insertion_mode_appropriately(); + + // @todo Set the parser's form element pointer. + + $fragment_processor->state->encoding_confidence = 'irrelevant'; + + return $fragment_processor; + } + /** * Stops the parser and terminates its execution when encountering unsupported markup. *