diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index acef03f6a320a..8ff2461eb7ccb 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1061,14 +1061,25 @@ public function get_current_depth(): int { * echo WP_HTML_Processor::normalize( ' syntax < <> "oddities"' ); * // syntax < <> "oddities" * - * @since 6.7.0 + * @see static::serialize * - * @param string $html Input HTML to normalize. + * @since 6.7.0 * + * @param string $html Input HTML to normalize. + * @param array[]|null $allowed_html Optional. An array of allowed HTML and attributes, + * where each array key is an element name and each value + * is either an array specifying allowed attribute names + * and whether they are required, or `'remove-node'` to + * skip the entire node and its contents. + * For non-HTML elements, prefix the namespace followed by + * a space for the tag name. + * Defaults to allow all tags. + * @param array|null $allowed_protocols Optional. Array of allowable URL protocols. + * Defaults to allowing all URLs. * @return string|null Normalized output, or `null` if unable to normalize. */ - public static function normalize( string $html ): ?string { - return static::create_fragment( $html )->serialize(); + public static function normalize( string $html, array $allowed_html = null, array $allowed_protocols = null ): ?string { + return static::create_fragment( $html )->serialize( $allowed_html, $allowed_protocols ); } /** @@ -1107,10 +1118,20 @@ public static function normalize( string $html ): ?string { * * @since 6.7.0 * + * @param array[]|null $allowed_html Optional. An array of allowed HTML and attributes, + * where each array key is an element name and each value + * is either an array specifying allowed attribute names + * and whether they are required, or `'remove-node'` to + * skip the entire node and its contents. + * For non-HTML elements, prefix the namespace followed by + * a space for the tag name. + * Defaults to allow all tags. + * @param array|null $allowed_protocols Optional. Array of allowable URL protocols. + * Defaults to allowing all URLs. * @return string|null Normalized HTML markup represented by processor, * or `null` if unable to generate serialization. */ - public function serialize(): ?string { + public function serialize( array $allowed_html = null, array $allowed_protocols = null ): ?string { if ( WP_HTML_Tag_Processor::STATE_READY !== $this->parser_state ) { wp_trigger_error( __METHOD__, @@ -1120,13 +1141,55 @@ public function serialize(): ?string { return null; } + $allowed_tags = array(); + $removed_tags = array(); + $required_attributes = array(); + $skip_closers = array(); + if ( is_array( $allowed_html ) ) { + foreach ( $allowed_html as $tag_name => $attributes ) { + if ( 0 === strlen( $tag_name ) ) { + continue; + } + + $tag_name = '#' === $tag_name[0] ? $tag_name : strtoupper( $tag_name ); + + if ( 'remove-node' === $attributes ) { + $removed_tags[] = $tag_name; + continue; + } + + if ( isset( $allowed_tags[ $tag_name ] ) ) { + _doing_it_wrong( + __METHOD__, + 'Only pass a single entry for each allowable tag name.', + '6.7.0' + ); + continue; + } + + $allowed_tags[ $tag_name ] = array(); + $required_attributes[ $tag_name ] = array(); + + foreach ( $attributes as $attribute_name => $specifier ) { + $attribute_name = strtolower( $attribute_name ); + $allowed_tags[ $tag_name ][ $attribute_name ] = $specifier; + + if ( isset( $specifier['required'] ) ) { + $required_attributes[ $tag_name ][] = $attribute_name; + } + } + } + } + $html = ''; while ( $this->next_token() ) { $token_type = $this->get_token_type(); switch ( $token_type ) { case '#text': - $html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); + if ( ! in_array( '#text', $removed_tags, true ) ) { + $html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); + } break; // Unlike the `<>` which is interpreted as plaintext, this is ignored entirely. @@ -1134,10 +1197,16 @@ public function serialize(): ?string { break; case '#funky-comment': - $html .= ""; + if ( ! in_array( '#funky-comment', $removed_tags, true ) ) { + $html .= ""; + } break; case '#comment': + if ( in_array( '#comment', $removed_tags, true ) ) { + break; + } + switch ( $this->get_comment_type() ) { case WP_HTML_Tag_Processor::COMMENT_AS_CDATA_LOOKALIKE: $html .= ""; @@ -1153,11 +1222,19 @@ public function serialize(): ?string { break; case '#cdata-section': - $html .= "get_modifiable_text()}]]>"; + if ( ! in_array( '#cdata-section', $removed_tags, true ) ) { + $html .= "get_modifiable_text()}]]>"; + } break; - case 'html': - $html .= ''; + case '#doctype': + if ( ! in_array( '#doctype', $removed_tags, true ) ) { + if ( WP_HTML_Tag_Processor::NO_QUIRKS_MODE === $this->compat_mode ) { + $html .= ''; + } else { + $html .= ''; + } + } break; } @@ -1166,28 +1243,123 @@ public function serialize(): ?string { } $tag_name = $this->get_tag(); - $in_html = 'html' === $this->get_namespace(); + $namespace = $this->get_namespace(); + $in_html = 'html' === $namespace; $qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name(); + $allowable_name = $in_html ? $tag_name : ( strtoupper( $namespace ) . " ${tag_name}" ); - if ( $this->is_tag_closer() ) { - $html .= ""; + // Remove subtree for tags marked as `remove-node`. + if ( in_array( $allowable_name, $removed_tags, true ) ) { + $this_depth = $this->get_current_depth(); + while ( $this->get_current_depth() >= $this_depth && $this->next_token() ) { + continue; + } continue; } - $attribute_names = $this->get_attribute_names_with_prefix( '' ); - if ( ! isset( $attribute_names ) ) { - $html .= "<{$qualified_name}>"; + // Skip tags which aren't allowed: this covers opening _and_ closing tags. + if ( isset( $allowed_html ) && ! isset( $allowed_tags[ $allowable_name ] ) ) { continue; } - $html .= "<{$qualified_name}"; + // Skip tags lacking a required attribute. + foreach ( $required_attributes[ $allowable_name ] ?? array() as $required_name ) { + if ( null === $this->get_attribute( $required_name ) ) { + $skip_closers[] = array( $allowable_name, $this->get_current_depth() ); + continue 2; + } + } + + if ( $this->is_tag_closer() ) { + list( $skippable_name, $skippable_depth ) = end( $skip_closers ) ?? array( null, null ); + if ( $skippable_name === $allowable_name && $skippable_depth === $this->get_current_depth() ) { + array_pop( $skip_closers ); + } else { + $html .= ""; + } + continue; + } + + if ( isset( $allowed_html ) ) { + $attribute_names = array_keys( $allowed_tags[ $allowable_name ] ); + } else { + $attribute_names = $this->get_attribute_names_with_prefix( '' ) ?? array(); + } + + /* + * Since this iterates the allowed list, if provided, there's no need to + * check if a given attribute is in the allowed list. Without an allowed + * list all the constraints will fail to apply by not being present. + */ + $allowable_attributes = $allowed_tags[ $allowable_name ] ?? array(); + $attribute_string = ''; foreach ( $attribute_names as $attribute_name ) { - $html .= " {$this->get_qualified_attribute_name( $attribute_name )}"; - $value = $this->get_attribute( $attribute_name ); + $value = $this->get_attribute( $attribute_name ); + $specifier = $allowable_attributes[ $attribute_name ] ?? array(); - if ( is_string( $value ) ) { - $html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . '"'; + // Handle the attribute specifiers. + if ( isset( $specifier['valueless'] ) && ( 'y' === $specifier['valueless'] || 'Y' === $specifier['valueless'] ) ) { + $attribute_string .= " {$this->get_qualified_attribute_name( $attribute_name )}"; + continue; } + + if ( + isset( $specifier['valueless'] ) && + ( 'n' === $specifier['valueless'] || 'N' === $specifier['valueless'] ) && + true === $value + ) { + continue; + } + + if ( + isset( $specifier['values'] ) && + ! in_array( strtolower( $value ), $specifier['values'], true ) + ) { + continue; + } + + if ( + isset( $specifier['value_callback'] ) && + ! call_user_func( $allowable_attributes[ $attribute_name ]['value_callback'], $value ) + ) { + continue; + } + + // All remaining specifiers apply to string values, not to boolean attributes. + if ( ! is_string( $value ) ) { + $attribute_string .= " {$this->get_qualified_attribute_name( $attribute_name )}"; + continue; + } + + if ( isset( $specifier['maxlen'] ) && strlen( $value ) > $specifier['maxlen'] ) { + continue; + } + + if ( isset( $specifier['minlen'] ) && strlen( $value ) < $specifier['minlen'] ) { + continue; + } + + if ( isset( $specifier['maxval'] ) || isset( $specifier['minval'] ) ) { + if ( ! preg_match( '/^\s{0,6}[0-9]{1,6}\s{0,6}$/', $value ) ) { + continue; + } + + if ( + ( isset( $specifier['maxval'] ) && $value > $specifier['maxval'] ) || + ( isset( $specifier['minval'] ) && $value < $specifier['minval'] ) + ) { + continue; + } + } + + $attribute_string .= " {$this->get_qualified_attribute_name( $attribute_name )}"; + $attribute_string .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . '"'; + } + + $html .= "<{$qualified_name}"; + + if ( '' !== $attribute_string ) { + $html .= $attribute_string; } if ( ! $in_html && $this->has_self_closing_flag() ) {