diff --git a/src/wp-includes/html-api/class-wp-html-processor-state.php b/src/wp-includes/html-api/class-wp-html-processor-state.php index e0469bea020e5..97f6da95a0012 100644 --- a/src/wp-includes/html-api/class-wp-html-processor-state.php +++ b/src/wp-includes/html-api/class-wp-html-processor-state.php @@ -428,6 +428,38 @@ class WP_HTML_Processor_State { */ public $context_node = null; + /** + * The recognized encoding of the input byte stream. + * + * > The stream of code points that comprises the input to the tokenization + * > stage will be initially seen by the user agent as a stream of bytes + * > (typically coming over the network or from the local file system). + * > The bytes encode the actual characters according to a particular character + * > encoding, which the user agent uses to decode the bytes into characters. + * + * @since 6.7.0 + * + * @var string|null + */ + public $encoding = null; + + /** + * The parser's confidence in the input encoding. + * + * > When the HTML parser is decoding an input byte stream, it uses a character + * > encoding and a confidence. The confidence is either tentative, certain, or + * > irrelevant. The encoding used, and whether the confidence in that encoding + * > is tentative or certain, is used during the parsing to determine whether to + * > change the encoding. If no encoding is necessary, e.g. because the parser is + * > operating on a Unicode stream and doesn't have to use a character encoding + * > at all, then the confidence is irrelevant. + * + * @since 6.7.0 + * + * @var string + */ + public $encoding_confidence = 'tentative'; + /** * HEAD element pointer. 
* diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 9f2662c9e4c48..51802ac558a60 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -256,21 +256,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ private $context_node = null; - /** - * Whether the parser has yet processed the context node, - * if created as a fragment parser. - * - * The context node will be initially pushed onto the stack of open elements, - * but when created as a fragment parser, this context element (and the implicit - * HTML document node above it) should not be exposed as a matched token or node. - * - * This boolean indicates whether the processor should skip over the current - * node in its initial search for the first node created from the input HTML. - * - * @var bool - */ - private $has_seen_context_node = false; - /* * Public Interface Functions */ @@ -312,9 +297,11 @@ public static function create_fragment( $html, $context = '', $encoding = return null; } - $processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE ); - $processor->state->context_node = array( 'BODY', array() ); - $processor->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + $processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE ); + $processor->state->context_node = array( 'BODY', array() ); + $processor->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + $processor->state->encoding = $encoding; + $processor->state->encoding_confidence = 'certain'; // @todo Create "fake" bookmarks for non-existent but implied nodes. $processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 ); @@ -340,6 +327,34 @@ public static function create_fragment( $html, $context = '', $encoding = return $processor; } + /** + * Creates an HTML processor in the full parsing mode. 
+ * + * It's likely that a fragment parser is more appropriate, unless sending an + * entire HTML document from start to finish. Consider a fragment parser with + * a context node of `<body>`. + * + * Since UTF-8 is the only currently-accepted charset, if working with a + * document that isn't UTF-8, it's important to convert the document before + * creating the processor: pass in the converted HTML. + * + * @param string $html Input HTML document to process. + * @param string|null $known_definite_encoding Optional. If provided, specifies the charset used + * in the input byte stream. Currently must be UTF-8. + * @return static|null The created processor if successful, otherwise null. + */ + public static function create_full_parser( $html, $known_definite_encoding = 'UTF-8' ) { + if ( 'UTF-8' !== $known_definite_encoding ) { + return null; + } + + $processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE ); + $processor->state->encoding = $known_definite_encoding; + $processor->state->encoding_confidence = 'certain'; + + return $processor; + } + /** * Constructor. * @@ -993,7 +1008,62 @@ public function get_current_depth(): int { * @return bool Whether an element was found. */ private function step_initial(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_INITIAL . ' state.' ); + $token_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( parent::is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$token_name}"; + + switch ( $op ) { + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, + * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), + * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * + * Parse error: ignore the token. 
+ */ + case '#text': + $text = $this->get_modifiable_text(); + if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { + return $this->step(); + } + goto initial_anything_else; + break; + + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A DOCTYPE token + */ + case 'html': + $contents = $this->get_modifiable_text(); + if ( ' html' !== $contents ) { + /* + * @todo When the HTML Tag Processor fully parses the DOCTYPE declaration, + * this code should examine the contents to set the compatibility mode. + */ + $this->bail( 'Cannot process any DOCTYPE other than a normative HTML5 doctype.' ); + } + + /* + * > Then, switch the insertion mode to "before html". + */ + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML; + return true; + } + + /* + * > Anything else + */ + initial_anything_else: + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML; + return $this->step( self::REPROCESS_CURRENT_NODE ); } /** @@ -1002,7 +1072,7 @@ private function step_initial(): bool { * This internal function performs the 'before html' insertion mode * logic for the generalized WP_HTML_Processor::step() function. * - * @since 6.7.0 Stub implementation. + * @since 6.7.0 * * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. * @@ -1012,7 +1082,86 @@ private function step_initial(): bool { * @return bool Whether an element was found. */ private function step_before_html(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML . ' state.' ); + $token_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $is_closer = parent::is_tag_closer(); + $op_sigil = '#tag' === $token_type ? ( $is_closer ? 
'-' : '+' ) : ''; + $op = "{$op_sigil}{$token_name}"; + + switch ( $op ) { + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. + return $this->step(); + + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, + * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), + * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * + * Parse error: ignore the token. + */ + case '#text': + $text = $this->get_modifiable_text(); + if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { + return $this->step(); + } + goto before_html_anything_else; + break; + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + $this->insert_html_element( $this->state->current_token ); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD; + return true; + + /* + * > An end tag whose tag name is one of: "head", "body", "html", "br" + * + * Closing BR tags are always reported by the Tag Processor as opening tags. + */ + case '-HEAD': + case '-BODY': + case '-HTML': + /* + * > Act as described in the "anything else" entry below. + */ + goto before_html_anything_else; + break; + } + + /* + * > Any other end tag + */ + if ( $is_closer ) { + // Parse error: ignore the token. + return $this->step(); + } + + /* + * > Anything else. + * + * > Create an html element whose node document is the Document object. + * > Append it to the Document object. Put this element in the stack of open elements. + * > Switch the insertion mode to "before head", then reprocess the token. 
+ */ + before_html_anything_else: + $this->insert_virtual_node( 'HTML' ); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD; + return $this->step( self::REPROCESS_CURRENT_NODE ); } /** @@ -1031,7 +1180,86 @@ private function step_before_html(): bool { * @return bool Whether an element was found. */ private function step_before_head(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD . ' state.' ); + $token_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $is_closer = parent::is_tag_closer(); + $op_sigil = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$token_name}"; + + switch ( $op ) { + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, + * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), + * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * + * Parse error: ignore the token. + */ + case '#text': + $text = $this->get_modifiable_text(); + if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { + return $this->step(); + } + goto before_head_anything_else; + break; + + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. + return $this->step(); + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + return $this->step_in_body(); + + /* + * > A start tag whose tag name is "head" + */ + case '+HEAD': + $this->insert_html_element( $this->state->current_token ); + $this->state->head_element = $this->state->current_token; + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD; + return true; + + /* + * > An end tag whose tag name is one of: "head", "body", "html", "br" + * > Act as described in the "anything else" entry below. 
+ * + * Closing BR tags are always reported by the Tag Processor as opening tags. + */ + case '-HEAD': + case '-BODY': + case '-HTML': + goto before_head_anything_else; + break; + } + + if ( $is_closer ) { + // Parse error: ignore the token. + return $this->step(); + } + + /* + * > Anything else + * + * > Insert an HTML element for a "head" start tag token with no attributes. + */ + before_head_anything_else: + $this->state->head_element = $this->insert_virtual_node( 'HEAD' ); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD; + return $this->step( self::REPROCESS_CURRENT_NODE ); } /** @@ -1056,29 +1284,31 @@ private function step_in_head(): bool { $op_sigil = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : ''; $op = "{$op_sigil}{$token_name}"; - /* - * > A character token that is one of U+0009 CHARACTER TABULATION, - * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), - * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE - */ - if ( '#text' === $op ) { - $text = $this->get_modifiable_text(); - if ( '' === $text ) { + switch ( $op ) { + case '#text': /* - * If the text is empty after processing HTML entities and stripping - * U+0000 NULL bytes then ignore the token. + * > A character token that is one of U+0009 CHARACTER TABULATION, + * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), + * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */ - return $this->step(); - } + $text = $this->get_modifiable_text(); + if ( '' === $text ) { + /* + * If the text is empty after processing HTML entities and stripping + * U+0000 NULL bytes then ignore the token. + */ + return $this->step(); + } - if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { - // Insert the character. - $this->insert_html_element( $this->state->current_token ); - return true; - } - } + if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { + // Insert the character. 
+ $this->insert_html_element( $this->state->current_token ); + return true; + } + + goto in_head_anything_else; + break; - switch ( $op ) { /* * > A comment token */ @@ -1124,7 +1354,7 @@ private function step_in_head(): bool { * > tentative, then change the encoding to the resulting encoding. */ $charset = $this->get_attribute( 'charset' ); - if ( is_string( $charset ) ) { + if ( is_string( $charset ) && 'tentative' === $this->state->encoding_confidence ) { $this->bail( 'Cannot yet process META tags with charset to determine encoding.' ); } @@ -1141,7 +1371,8 @@ private function step_in_head(): bool { if ( is_string( $http_equiv ) && is_string( $content ) && - 0 === strcasecmp( $http_equiv, 'Content-Type' ) + 0 === strcasecmp( $http_equiv, 'Content-Type' ) && + 'tentative' === $this->state->encoding_confidence ) { $this->bail( 'Cannot yet process META tags with http-equiv Content-Type to determine encoding.' ); } @@ -1193,10 +1424,11 @@ private function step_in_head(): bool { /* * > An end tag whose tag name is one of: "body", "html", "br" + * + * BR tags are always reported by the Tag Processor as opening tags. */ case '-BODY': case '-HTML': - case '-BR': /* * > Act as described in the "anything else" entry below. */ @@ -1273,7 +1505,92 @@ private function step_in_head(): bool { * @return bool Whether an element was found. */ private function step_in_head_noscript(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD_NOSCRIPT . ' state.' ); + $token_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $is_closer = parent::is_tag_closer(); + $op_sigil = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$token_name}"; + + switch ( $op ) { + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, + * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), + * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * + * Parse error: ignore the token. 
+ */ + case '#text': + $text = $this->get_modifiable_text(); + if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { + return $this->step_in_head(); + } + + goto in_head_noscript_anything_else; + break; + + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. + return $this->step(); + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + return $this->step_in_body(); + + /* + * > An end tag whose tag name is "noscript" + */ + case '-NOSCRIPT': + $this->state->stack_of_open_elements->pop(); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD; + return true; + + /* + * > A comment token + * > + * > A start tag whose tag name is one of: "basefont", "bgsound", + * > "link", "meta", "noframes", "style" + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + case '+BASEFONT': + case '+BGSOUND': + case '+LINK': + case '+META': + case '+NOFRAMES': + case '+STYLE': + return $this->step_in_head(); + + /* + * > An end tag whose tag name is "br" + * + * This should never happen, as the Tag Processor prevents showing a BR closing tag. + */ + } + + /* + * > A start tag whose tag name is one of: "head", "noscript" + * > Any other end tag + */ + if ( '+HEAD' === $op || '+NOSCRIPT' === $op || $is_closer ) { + // Parse error: ignore the token. + return $this->step(); + } + + /* + * > Anything else + * + * Anything here is a parse error. + */ + in_head_noscript_anything_else: + $this->state->stack_of_open_elements->pop(); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD; + return $this->step( self::REPROCESS_CURRENT_NODE ); } /** @@ -1292,7 +1609,133 @@ private function step_in_head_noscript(): bool { * @return bool Whether an element was found. */ private function step_after_head(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_HEAD . ' state.' 
); + $token_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $is_closer = parent::is_tag_closer(); + $op_sigil = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$token_name}"; + + switch ( $op ) { + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, + * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), + * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + */ + case '#text': + $text = $this->get_modifiable_text(); + if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { + // Insert the character. + $this->insert_html_element( $this->state->current_token ); + return true; + } + goto after_head_anything_else; + break; + + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. + return $this->step(); + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + return $this->step_in_body(); + + /* + * > A start tag whose tag name is "body" + */ + case '+BODY': + $this->insert_html_element( $this->state->current_token ); + $this->state->frameset_ok = false; + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + return true; + + /* + * > A start tag whose tag name is "frameset" + */ + case '+FRAMESET': + $this->insert_html_element( $this->state->current_token ); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_FRAMESET; + return true; + + /* + * > A start tag whose tag name is one of: "base", "basefont", "bgsound", + * > "link", "meta", "noframes", "script", "style", "template", "title" + * + * Anything here is a parse error. 
+ */ + case '+BASE': + case '+BASEFONT': + case '+BGSOUND': + case '+LINK': + case '+META': + case '+NOFRAMES': + case '+SCRIPT': + case '+STYLE': + case '+TEMPLATE': + case '+TITLE': + /* + * > Push the node pointed to by the head element pointer onto the stack of open elements. + * > Process the token using the rules for the "in head" insertion mode. + * > Remove the node pointed to by the head element pointer from the stack of open elements. (It might not be the current node at this point.) + */ + $this->bail( 'Cannot process elements after HEAD which reopen the HEAD element.' ); + /* + * Do not leave this break in when adding support; it's here to prevent + * WPCS from getting confused at the switch structure without a return, + * because it doesn't know that `bail()` always throws. + */ + break; + + /* + * > An end tag whose tag name is "template" + */ + case '-TEMPLATE': + return $this->step_in_head(); + + /* + * > An end tag whose tag name is one of: "body", "html", "br" + * + * Closing BR tags are always reported by the Tag Processor as opening tags. + */ + case '-BODY': + case '-HTML': + /* + * > Act as described in the "anything else" entry below. + */ + goto after_head_anything_else; + break; + } + + /* + * > A start tag whose tag name is "head" + * > Any other end tag + */ + if ( '+HEAD' === $op || $is_closer ) { + // Parse error: ignore the token. + return $this->step(); + } + + /* + * > Anything else + * > Insert an HTML element for a "body" start tag token with no attributes. + */ + after_head_anything_else: + $this->insert_virtual_node( 'BODY' ); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + return $this->step( self::REPROCESS_CURRENT_NODE ); } /** @@ -4469,14 +4912,17 @@ private function insert_html_element( WP_HTML_Token $token ): void { * @param string $token_name Name of token to create and insert into the stack of open elements. * @param string|null $bookmark_name Optional. 
Name to give bookmark for created virtual node. * Defaults to auto-creating a bookmark name. + * @return WP_HTML_Token Newly-created virtual token. */ - private function insert_virtual_node( $token_name, $bookmark_name = null ): void { + private function insert_virtual_node( $token_name, $bookmark_name = null ): WP_HTML_Token { $here = $this->bookmarks[ $this->state->current_token->bookmark_name ]; $name = $bookmark_name ?? $this->bookmark_token(); $this->bookmarks[ $name ] = new WP_HTML_Span( $here->start, 0 ); - $this->insert_html_element( new WP_HTML_Token( $name, $token_name, false ) ); + $token = new WP_HTML_Token( $name, $token_name, false ); + $this->insert_html_element( $token ); + return $token; } /* @@ -4633,6 +5079,53 @@ public static function is_void( $tag_name ): bool { ); } + /** + * Gets an encoding from a given string. + * + * This is an algorithm defined in the WHAT-WG specification. + * + * Example: + * + * 'UTF-8' === self::get_encoding( 'utf8' ); + * 'UTF-8' === self::get_encoding( " \tUTF-8 " ); + * null === self::get_encoding( 'UTF-7' ); + * null === self::get_encoding( 'utf8; charset=' ); + * + * @see https://encoding.spec.whatwg.org/#concept-encoding-get + * + * @todo As this parser only supports UTF-8, only the UTF-8 + * encodings are detected. Add more as desired, but the + * parser will bail on non-UTF-8 encodings. + * + * @since 6.7.0 + * + * @param string $label A string which may specify a known encoding. + * @return string|null Known encoding if matched, otherwise null. + */ + protected static function get_encoding( string $label ): ?string { + /* + * > Remove any leading and trailing ASCII whitespace from label. + */ + $label = trim( $label, " \t\f\r\n" ); + + /* + * > If label is an ASCII case-insensitive match for any of the labels listed in the + * > table below, then return the corresponding encoding; otherwise return failure. 
+ */ + switch ( strtolower( $label ) ) { + case 'unicode-1-1-utf-8': + case 'unicode11utf8': + case 'unicode20utf8': + case 'utf-8': + case 'utf8': + case 'x-unicode20utf8': + return 'UTF-8'; + + default: + return null; + } + } + /* * Constants that would pollute the top of the class if they were found there. */ diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php b/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php index 0dbd45cfa0ead..1486769533e96 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php @@ -25,7 +25,7 @@ class Tests_HtmlApi_WpHtmlProcessorBreadcrumbs extends WP_UnitTestCase { public function test_navigates_into_normative_html_for_supported_elements( $html, $tag_name ) { $processor = WP_HTML_Processor::create_fragment( $html ); - $this->assertTrue( $processor->step(), "Failed to step into supported {$tag_name} element." ); + $this->assertTrue( $processor->next_token(), "Failed to step into supported {$tag_name} element." ); $this->assertSame( $tag_name, $processor->get_tag(), "Misread {$tag_name} as a {$processor->get_tag()} element." ); } @@ -90,6 +90,7 @@ public static function data_single_tag_of_supported_elements() { 'IMG', 'INS', 'LI', + 'LINK', 'ISINDEX', // Deprecated. 'KBD', 'KEYGEN', // Deprecated. @@ -108,6 +109,8 @@ public static function data_single_tag_of_supported_elements() { 'NAV', 'NEXTID', // Deprecated. 'NOBR', // Neutralized. + 'NOEMBED', // Neutralized. + 'NOFRAMES', // Neutralized. 'NOSCRIPT', 'OBJECT', 'OL', @@ -122,6 +125,7 @@ public static function data_single_tag_of_supported_elements() { 'RTC', // Neutralized. 
'RUBY', 'SAMP', + 'SCRIPT', 'SEARCH', 'SECTION', 'SLOT', @@ -130,21 +134,29 @@ public static function data_single_tag_of_supported_elements() { 'SPAN', 'STRIKE', 'STRONG', + 'STYLE', 'SUB', 'SUMMARY', 'SUP', 'TABLE', + 'TEXTAREA', 'TIME', + 'TITLE', 'TT', 'U', 'UL', 'VAR', 'VIDEO', + 'XMP', // Deprecated, use PRE instead. ); $data = array(); foreach ( $supported_elements as $tag_name ) { - $data[ $tag_name ] = array( "<{$tag_name}>", $tag_name ); + $closer = in_array( $tag_name, array( 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) + ? "" + : ''; + + $data[ $tag_name ] = array( "<{$tag_name}>{$closer}", $tag_name ); } $data['IMAGE (treated as an IMG)'] = array( '', 'IMG' ); @@ -182,22 +194,9 @@ public function test_fails_when_encountering_unsupported_tag( $html ) { */ public static function data_unsupported_elements() { $unsupported_elements = array( - 'BODY', - 'FRAME', - 'FRAMESET', - 'HEAD', - 'HTML', - 'IFRAME', 'MATH', - 'NOEMBED', // Neutralized. - 'NOFRAMES', // Neutralized. 'PLAINTEXT', // Neutralized. - 'SCRIPT', - 'STYLE', 'SVG', - 'TEXTAREA', - 'TITLE', - 'XMP', // Deprecated, use PRE instead. ); $data = array(); diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index 69329f51321ba..cc9528c3ff083 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -25,35 +25,41 @@ class Tests_HtmlApi_Html5lib extends WP_UnitTestCase { * The HTML Processor only accepts HTML in document . * Do not run tests that look for anything in document . */ - const SKIP_HEAD_TESTS = true; + const SKIP_HEAD_TESTS = false; /** * Skip specific tests that may not be supported or have known issues. 
*/ const SKIP_TESTS = array( - 'adoption01/line0046' => 'Unimplemented: Reconstruction of active formatting elements.', - 'adoption01/line0159' => 'Unimplemented: Reconstruction of active formatting elements.', - 'adoption01/line0318' => 'Unimplemented: Reconstruction of active formatting elements.', - 'template/line0885' => 'Unimplemented: no parsing of attributes on context node.', - 'tests1/line0720' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests15/line0001' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests15/line0022' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests15/line0068' => 'Unimplemented: no support outside of IN BODY yet.', - 'tests2/line0650' => 'Whitespace only test never enters "in body" parsing mode.', - 'tests19/line0965' => 'Unimplemented: no support outside of IN BODY yet.', - 'tests23/line0001' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests23/line0041' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests23/line0069' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests23/line0101' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests26/line0263' => 'Bug: An active formatting element should be created for a trailing text node.', - 'webkit01/line0231' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', - 'webkit02/line0013' => "Asserting behavior with scripting flag enabled, which this parser doesn't support.", - 'webkit01/line0300' => 'Unimplemented: no support outside of IN BODY yet.', - 'webkit01/line0310' => 'Unimplemented: no support outside of IN BODY yet.', - 'webkit01/line0336' => 'Unimplemented: no support outside of IN BODY yet.', - 'webkit01/line0349' => 'Unimplemented: no support outside of IN BODY yet.', - 'webkit01/line0362' => 'Unimplemented: no support outside of IN BODY yet.', - 'webkit01/line0375' => 
'Unimplemented: no support outside of IN BODY yet.', + 'tests1/line0537' => 'Bug: Investigate', + + 'adoption01/line0046' => 'Unimplemented: Reconstruction of active formatting elements.', + 'adoption01/line0159' => 'Unimplemented: Reconstruction of active formatting elements.', + 'adoption01/line0318' => 'Unimplemented: Reconstruction of active formatting elements.', + 'comments01/line0155' => 'Unimplemented: Need to access raw comment text on non-normative comments.', + 'comments01/line0169' => 'Unimplemented: Need to access raw comment text on non-normative comments.', + 'html5test-com/line0129' => 'Unimplemented: Need to access raw comment text on non-normative comments.', + 'noscript01/line0014' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'template/line0885' => 'Unimplemented: no parsing of attributes on context node.', + 'tests1/line0692' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly', + 'tests1/line0720' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests14/line0022' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'tests14/line0055' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'tests15/line0001' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests15/line0022' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests19/line1079' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'tests19/line0965' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.', + 'tests2/line0207' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'tests2/line0686' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'tests2/line0709' => 'Unimplemented: This parser does not add 
missing attributes to existing HTML or BODY tags.', + 'tests23/line0001' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests23/line0041' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests23/line0069' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests23/line0101' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests5/line0013' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.', + 'tests5/line0077' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.', + 'tests5/line0091' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly', + 'webkit01/line0231' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', ); /** @@ -68,14 +74,43 @@ class Tests_HtmlApi_Html5lib extends WP_UnitTestCase { * @param string $html Given test HTML. * @param string $expected_tree Tree structure of parsed HTML. */ - public function test_parse( $fragment_context, $html, $expected_tree ) { + public function test_parse( ?string $fragment_context, string $html, string $expected_tree ) { $processed_tree = self::build_tree_representation( $fragment_context, $html ); if ( null === $processed_tree ) { $this->markTestSkipped( 'Test includes unsupported markup.' ); } + $fragment_detail = $fragment_context ? " in context <{$fragment_context}>" : ''; - $this->assertSame( $expected_tree, $processed_tree, "HTML was not processed correctly:\n{$html}" ); + /* + * The HTML processor does not produce html, head, body tags if the processor does not reach them. + * These should all be produced when reaching the end-of-file. + * For now, append the missing tags when necessary. + * + * @todo remove this section when the processor handles this. 
+ */ + $auto_generated_html_head_body = "\n \n \n\n"; + $auto_generated_head_body = " \n \n\n"; + $auto_generated_body = " \n\n"; + if ( str_ends_with( $expected_tree, $auto_generated_html_head_body ) && ! str_ends_with( $processed_tree, $auto_generated_html_head_body ) ) { + if ( str_ends_with( $processed_tree, "\n \n\n" ) ) { + $processed_tree = substr_replace( $processed_tree, " \n\n", -1 ); + } elseif ( str_ends_with( $processed_tree, "\n\n" ) ) { + $processed_tree = substr_replace( $processed_tree, " \n \n\n", -1 ); + } else { + $processed_tree = substr_replace( $processed_tree, $auto_generated_html_head_body, -1 ); + } + } elseif ( str_ends_with( $expected_tree, $auto_generated_head_body ) && ! str_ends_with( $processed_tree, $auto_generated_head_body ) ) { + if ( str_ends_with( $processed_tree, "\n\n" ) ) { + $processed_tree = substr_replace( $processed_tree, " \n\n", -1 ); + } else { + $processed_tree = substr_replace( $processed_tree, $auto_generated_head_body, -1 ); + } + } elseif ( str_ends_with( $expected_tree, $auto_generated_body ) && ! str_ends_with( $processed_tree, $auto_generated_body ) ) { + $processed_tree = substr_replace( $processed_tree, $auto_generated_body, -1 ); + } + + $this->assertSame( $expected_tree, $processed_tree, "HTML was not processed correctly{$fragment_detail}:\n{$html}" ); } /** @@ -100,7 +135,9 @@ public function data_external_html5lib_tests() { $line = str_pad( strval( $test[0] ), 4, '0', STR_PAD_LEFT ); $test_name = "{$test_suite}/line{$line}"; - if ( self::should_skip_test( $test_name, $test[3] ) ) { + $test_context_element = $test[1]; + + if ( self::should_skip_test( $test_context_element, $test_name, $test[3] ) ) { continue; } @@ -118,7 +155,11 @@ public function data_external_html5lib_tests() { * * @return bool True if the test case should be skipped. False otherwise. 
*/ - private static function should_skip_test( $test_name, $expected_tree ): bool { + private static function should_skip_test( ?string $test_context_element, string $test_name, string $expected_tree ): bool { + if ( null !== $test_context_element && 'body' !== $test_context_element ) { + return true; + } + if ( self::SKIP_HEAD_TESTS ) { $html_start = "\n \n \n"; if ( @@ -146,15 +187,18 @@ private static function should_skip_test( $test_name, $expected_tree ): bool { private static function build_tree_representation( ?string $fragment_context, string $html ) { $processor = $fragment_context ? WP_HTML_Processor::create_fragment( $html, "<{$fragment_context}>" ) - : WP_HTML_Processor::create_fragment( $html ); + : WP_HTML_Processor::create_full_parser( $html ); if ( null === $processor ) { return null; } - $output = "\n \n \n"; - - // Initially, assume we're 2 levels deep at: html > body > [position] - $indent_level = 2; + /* + * The fragment parser will start 2 levels deep at: html > body > [position] + * and requires adjustment to initial parameters. + * The full parser will not. + */ + $output = $fragment_context ? "\n \n \n" : ''; + $indent_level = $fragment_context ? 2 : 0; $indent = ' '; $was_text = null; $text_node = ''; @@ -238,6 +282,11 @@ private static function build_tree_representation( ?string $fragment_context, st $text_node .= $processor->get_modifiable_text(); break; + case '#funky-comment': + // Comments must be "<" then "!-- " then the data then " -->". + $output .= str_repeat( $indent, $indent_level ) . 
"\n"; + break; + case '#comment': switch ( $processor->get_comment_type() ) { case WP_HTML_Processor::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT: @@ -250,6 +299,10 @@ private static function build_tree_representation( ?string $fragment_context, st $comment_text_content = "[CDATA[{$processor->get_modifiable_text()}]]"; break; + case WP_HTML_Processor::COMMENT_AS_PI_NODE_LOOKALIKE: + $comment_text_content = "?{$processor->get_tag()}{$processor->get_modifiable_text()}?"; + break; + default: throw new Error( "Unhandled comment type for tree construction: {$processor->get_comment_type()}" ); } @@ -301,6 +354,7 @@ public static function parse_html5_dat_testfile( $filename ) { $test_html = ''; $test_dom = ''; $test_context_element = null; + $test_script_flag = false; $test_line_number = 0; while ( false !== ( $line = fgets( $handle ) ) ) { @@ -309,8 +363,12 @@ public static function parse_html5_dat_testfile( $filename ) { if ( '#' === $line[0] ) { // Finish section. if ( "#data\n" === $line ) { - // Yield when switching from a previous state. - if ( $state ) { + /* + * Yield when switching from a previous state. + * Do not yield tests with the scripting flag enabled. The scripting flag + * is always disabled in the HTML API. + */ + if ( $state && ! $test_script_flag ) { yield array( $test_line_number, $test_context_element, @@ -325,6 +383,10 @@ public static function parse_html5_dat_testfile( $filename ) { $test_html = ''; $test_dom = ''; $test_context_element = null; + $test_script_flag = false; + } + if ( "#script-on\n" === $line ) { + $test_script_flag = true; } $state = trim( substr( $line, 1 ) ); @@ -376,7 +438,15 @@ public static function parse_html5_dat_testfile( $filename ) { */ case 'document': if ( '|' === $line[0] ) { - $test_dom .= substr( $line, 2 ); + /* + * The next_token() method these tests rely on does not stop + * at doctype nodes. Strip doctypes from output. + * @todo Restore this line if and when the processor + * exposes doctypes. + */ + if ( '|