From 726bcfdc12cec8f3d897779923975410be831df9 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 11 Jul 2025 17:25:14 -0500 Subject: [PATCH] HTML API: Refactor `wp_kses_hair()` (#9248) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trac ticket: Core-63694 `wp_kses_hair()` is built around an impressive state machine for parsing the `$attr` of an HTML tag, that is, the span of text after the tag name and before the closing `>`. Unfortunately, that parsing code doesn’t fully implement the HTML specification and may be prone to mis-parsing. This patch replaces the existing state machine with a straightforward use of the HTML API to parse the attributes for us, constructing a shell tag for the `$attr` string and reading the attributes structurally. This shell is necessary because a previous stage of the pipeline has already separated what it thinks is the so-called “attribute list” from a tag. Props: dmsnell --- src/wp-includes/kses.php | 188 +++++++++------------------------- tests/phpunit/tests/media.php | 2 +- 2 files changed, 51 insertions(+), 139 deletions(-) diff --git a/src/wp-includes/kses.php b/src/wp-includes/kses.php index 35327e1a01cce..38d6e2a437309 100644 --- a/src/wp-includes/kses.php +++ b/src/wp-includes/kses.php @@ -1585,160 +1585,72 @@ function wp_kses_attr_check( &$name, &$value, &$whole, $vless, $element, $allowe } /** - * Builds an attribute list from string containing attributes. - * - * This function does a lot of work. It parses an attribute list into an array - * with attribute data, and tries to do the right thing even if it gets weird - * input. It will add quotes around attribute values that don't have any quotes - * or apostrophes around them, to make it easier to produce HTML code that will - * conform to W3C's HTML specification. It will also remove bad URL protocols - * from attribute values. 
It also reduces duplicate attributes by using the - * attribute defined first (`foo='bar' foo='baz'` will result in `foo='bar'`). + * Given a string of HTML attributes and values, parse into a structured attribute list. + * + * This function performs a number of transformations while parsing attribute strings: + * - It normalizes attribute values and surrounds them with double quotes. + * - It normalizes HTML character references inside attribute values. + * - It removes “bad” URL protocols from attribute values. + * + * Otherwise this reads the attributes as if they were part of an HTML tag. It performs + * these transformations to lower the risk of mis-parsing down the line and to perform + * URL sanitization in line with the rest of the `kses` subsystem. Importantly, it does + * not decode the attribute values, meaning that special HTML syntax characters will + * be left with character references in the `value` property. + * + * Example: + * + * $attrs = wp_kses_hair( 'class="is-wide" inert data-lazy=\'&lt;img&gt;\' =/🐮=/' ); + * $attrs === array( + * 'class' => array( 'name' => 'class', 'value' => 'is-wide', 'whole' => 'class="is-wide"', 'vless' => 'n' ), + * 'inert' => array( 'name' => 'inert', 'value' => '', 'whole' => 'inert', 'vless' => 'y' ), + * 'data-lazy' => array( 'name' => 'data-lazy', 'value' => '&lt;img&gt;', 'whole' => 'data-lazy="&lt;img&gt;"', 'vless' => 'n' ), + * '=' => array( 'name' => '=', 'value' => '', 'whole' => '=', 'vless' => 'y' ), + * '🐮' => array( 'name' => '🐮', 'value' => '/', 'whole' => '🐮="/"', 'vless' => 'n' ), + * ); * * @since 1.0.0 + * @since 6.9.0 Rebuilt on HTML API. * * @param string $attr Attribute list from HTML element to closing HTML element tag. * @param string[] $allowed_protocols Array of allowed URL protocols. * @return array[] Array of attribute information after parsing. 
*/ function wp_kses_hair( $attr, $allowed_protocols ) { - $attrarr = array(); - $mode = 0; - $attrname = ''; - $uris = wp_kses_uri_attributes(); - - // Loop through the whole attribute list. - - while ( strlen( $attr ) !== 0 ) { - $working = 0; // Was the last operation successful? - - switch ( $mode ) { - case 0: - if ( preg_match( '/^([_a-zA-Z][-_a-zA-Z0-9:.]*)/', $attr, $match ) ) { - $attrname = $match[1]; - $working = 1; - $mode = 1; - $attr = preg_replace( '/^[_a-zA-Z][-_a-zA-Z0-9:.]*/', '', $attr ); - } - - break; - - case 1: - if ( preg_match( '/^\s*=\s*/', $attr ) ) { // Equals sign. - $working = 1; - $mode = 2; - $attr = preg_replace( '/^\s*=\s*/', '', $attr ); - break; - } - - if ( preg_match( '/^\s+/', $attr ) ) { // Valueless. - $working = 1; - $mode = 0; - - if ( false === array_key_exists( $attrname, $attrarr ) ) { - $attrarr[ $attrname ] = array( - 'name' => $attrname, - 'value' => '', - 'whole' => $attrname, - 'vless' => 'y', - ); - } - - $attr = preg_replace( '/^\s+/', '', $attr ); - } - - break; - - case 2: - if ( preg_match( '%^"([^"]*)"(\s+|/?$)%', $attr, $match ) ) { - // "value" - $thisval = $match[1]; - if ( in_array( strtolower( $attrname ), $uris, true ) ) { - $thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols ); - } - - if ( false === array_key_exists( $attrname, $attrarr ) ) { - $attrarr[ $attrname ] = array( - 'name' => $attrname, - 'value' => $thisval, - 'whole' => "$attrname=\"$thisval\"", - 'vless' => 'n', - ); - } - - $working = 1; - $mode = 0; - $attr = preg_replace( '/^"[^"]*"(\s+|$)/', '', $attr ); - break; - } + $attributes = array(); + $uris = wp_kses_uri_attributes(); - if ( preg_match( "%^'([^']*)'(\s+|/?$)%", $attr, $match ) ) { - // 'value' - $thisval = $match[1]; - if ( in_array( strtolower( $attrname ), $uris, true ) ) { - $thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols ); - } - - if ( false === array_key_exists( $attrname, $attrarr ) ) { - $attrarr[ $attrname ] = array( - 'name' => $attrname, - 
'value' => $thisval, - 'whole' => "$attrname='$thisval'", - 'vless' => 'n', - ); - } - - $working = 1; - $mode = 0; - $attr = preg_replace( "/^'[^']*'(\s+|$)/", '', $attr ); - break; - } + $processor = new WP_HTML_Tag_Processor( "<wp {$attr}>" ); + $processor->next_token(); - if ( preg_match( "%^([^\s\"']+)(\s+|/?$)%", $attr, $match ) ) { - // value - $thisval = $match[1]; - if ( in_array( strtolower( $attrname ), $uris, true ) ) { - $thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols ); - } - - if ( false === array_key_exists( $attrname, $attrarr ) ) { - $attrarr[ $attrname ] = array( - 'name' => $attrname, - 'value' => $thisval, - 'whole' => "$attrname=\"$thisval\"", - 'vless' => 'n', - ); - } - - // We add quotes to conform to W3C's HTML spec. - $working = 1; - $mode = 0; - $attr = preg_replace( "%^[^\s\"']+(\s+|$)%", '', $attr ); - } + foreach ( $processor->get_attribute_names_with_prefix( '' ) as $name ) { + $value = $processor->get_attribute( $name ); + $is_bool = true === $value; + if ( is_string( $value ) && in_array( $name, $uris, true ) ) { + $value = wp_kses_bad_protocol( $value, $allowed_protocols ); + } - break; - } // End switch. + // Reconstruct and normalize the attribute value. + $syntax_characters = array( + '&' => '&amp;', + '<' => '&lt;', + '>' => '&gt;', + "'" => '&apos;', + '"' => '&quot;', + ); - if ( 0 === $working ) { // Not well-formed, remove and try again. - $attr = wp_kses_html_error( $attr ); - $mode = 0; - } - } // End while. + $recoded = $is_bool ? '' : strtr( $value, $syntax_characters ); + $whole = $is_bool ? $name : "{$name}=\"{$recoded}\""; - if ( 1 === $mode && false === array_key_exists( $attrname, $attrarr ) ) { - /* - * Special case, for when the attribute list ends with a valueless - * attribute like "selected". - */ - $attrarr[ $attrname ] = array( - 'name' => $attrname, - 'value' => '', - 'whole' => $attrname, - 'vless' => 'y', + $attributes[ $name ] = array( + 'name' => $name, + 'value' => $recoded, + 'whole' => $whole, + 'vless' => $is_bool ? 
'y' : 'n', ); } - return $attrarr; + return $attributes; } /** diff --git a/tests/phpunit/tests/media.php b/tests/phpunit/tests/media.php index 1ab836fe856d8..84cf96c082f1c 100644 --- a/tests/phpunit/tests/media.php +++ b/tests/phpunit/tests/media.php @@ -227,7 +227,7 @@ public function test_new_img_caption_shortcode_with_html_caption() { $this->assertStringNotContainsString( self::HTML_CONTENT, $mark, - 'Test caption content should not contain the mark surround it: check test setup.' + 'Test caption content should not contain the mark surrounding it: check test setup.' ); $result = img_caption_shortcode(