Skip to content

Commit 0bf0e2f

Browse files
committed
HTML API: Refactor wp_kses_hair() (WordPress#9248)
Trac ticket: Core-63694. `wp_kses_hair()` is built around an impressive state machine for parsing the `$attr` of an HTML tag, that is, the span of text after the tag name and before the closing `>`. Unfortunately, that parsing code doesn’t fully implement the HTML specification and may be prone to mis-parsing. This patch replaces the existing state machine with a straightforward use of the HTML API to parse the attributes for us, constructing a shell tag for the `$attr` string and reading the attributes structurally. This shell is necessary because a previous stage of the pipeline has already separated what it thinks is the so-called “attribute list” from a tag. Props: dmsnell
1 parent 4892d46 commit 0bf0e2f

File tree

1 file changed

+28
-127
lines changed

1 file changed

+28
-127
lines changed

src/wp-includes/kses.php

Lines changed: 28 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -1416,149 +1416,50 @@ function wp_kses_attr_check( &$name, &$value, &$whole, $vless, $element, $allowe
14161416
* attribute defined first (`foo='bar' foo='baz'` will result in `foo='bar'`).
14171417
*
14181418
* @since 1.0.0
1419+
* @since 6.9.0 Rebuilt on HTML API
14191420
*
14201421
* @param string $attr Attribute list from HTML element to closing HTML element tag.
14211422
* @param string[] $allowed_protocols Array of allowed URL protocols.
14221423
* @return array[] Array of attribute information after parsing.
14231424
*/
14241425
function wp_kses_hair( $attr, $allowed_protocols ) {
1425-
$attrarr = array();
1426-
$mode = 0;
1427-
$attrname = '';
1428-
$uris = wp_kses_uri_attributes();
1426+
$attributes = array();
1427+
$uris = wp_kses_uri_attributes();
14291428

14301429
// Loop through the whole attribute list.
14311430

1432-
while ( strlen( $attr ) !== 0 ) {
1433-
$working = 0; // Was the last operation successful?
1431+
$processor = new WP_HTML_Tag_Processor( "<wp {$attr}>" );
1432+
$processor->next_token();
14341433

1435-
switch ( $mode ) {
1436-
case 0:
1437-
if ( preg_match( '/^([_a-zA-Z][-_a-zA-Z0-9:.]*)/', $attr, $match ) ) {
1438-
$attrname = $match[1];
1439-
$working = 1;
1440-
$mode = 1;
1441-
$attr = preg_replace( '/^[_a-zA-Z][-_a-zA-Z0-9:.]*/', '', $attr );
1442-
}
1443-
1444-
break;
1445-
1446-
case 1:
1447-
if ( preg_match( '/^\s*=\s*/', $attr ) ) { // Equals sign.
1448-
$working = 1;
1449-
$mode = 2;
1450-
$attr = preg_replace( '/^\s*=\s*/', '', $attr );
1451-
break;
1452-
}
1453-
1454-
if ( preg_match( '/^\s+/', $attr ) ) { // Valueless.
1455-
$working = 1;
1456-
$mode = 0;
1457-
1458-
if ( false === array_key_exists( $attrname, $attrarr ) ) {
1459-
$attrarr[ $attrname ] = array(
1460-
'name' => $attrname,
1461-
'value' => '',
1462-
'whole' => $attrname,
1463-
'vless' => 'y',
1464-
);
1465-
}
1466-
1467-
$attr = preg_replace( '/^\s+/', '', $attr );
1468-
}
1469-
1470-
break;
1471-
1472-
case 2:
1473-
if ( preg_match( '%^"([^"]*)"(\s+|/?$)%', $attr, $match ) ) {
1474-
// "value"
1475-
$thisval = $match[1];
1476-
if ( in_array( strtolower( $attrname ), $uris, true ) ) {
1477-
$thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
1478-
}
1479-
1480-
if ( false === array_key_exists( $attrname, $attrarr ) ) {
1481-
$attrarr[ $attrname ] = array(
1482-
'name' => $attrname,
1483-
'value' => $thisval,
1484-
'whole' => "$attrname=\"$thisval\"",
1485-
'vless' => 'n',
1486-
);
1487-
}
1488-
1489-
$working = 1;
1490-
$mode = 0;
1491-
$attr = preg_replace( '/^"[^"]*"(\s+|$)/', '', $attr );
1492-
break;
1493-
}
1494-
1495-
if ( preg_match( "%^'([^']*)'(\s+|/?$)%", $attr, $match ) ) {
1496-
// 'value'
1497-
$thisval = $match[1];
1498-
if ( in_array( strtolower( $attrname ), $uris, true ) ) {
1499-
$thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
1500-
}
1501-
1502-
if ( false === array_key_exists( $attrname, $attrarr ) ) {
1503-
$attrarr[ $attrname ] = array(
1504-
'name' => $attrname,
1505-
'value' => $thisval,
1506-
'whole' => "$attrname='$thisval'",
1507-
'vless' => 'n',
1508-
);
1509-
}
1510-
1511-
$working = 1;
1512-
$mode = 0;
1513-
$attr = preg_replace( "/^'[^']*'(\s+|$)/", '', $attr );
1514-
break;
1515-
}
1516-
1517-
if ( preg_match( "%^([^\s\"']+)(\s+|/?$)%", $attr, $match ) ) {
1518-
// value
1519-
$thisval = $match[1];
1520-
if ( in_array( strtolower( $attrname ), $uris, true ) ) {
1521-
$thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
1522-
}
1523-
1524-
if ( false === array_key_exists( $attrname, $attrarr ) ) {
1525-
$attrarr[ $attrname ] = array(
1526-
'name' => $attrname,
1527-
'value' => $thisval,
1528-
'whole' => "$attrname=\"$thisval\"",
1529-
'vless' => 'n',
1530-
);
1531-
}
1532-
1533-
// We add quotes to conform to W3C's HTML spec.
1534-
$working = 1;
1535-
$mode = 0;
1536-
$attr = preg_replace( "%^[^\s\"']+(\s+|$)%", '', $attr );
1537-
}
1434+
foreach ( $processor->get_attribute_names_with_prefix( '' ) as $name ) {
1435+
$value = $processor->get_attribute( $name );
1436+
$is_bool = true === $value;
1437+
if ( is_string( $value ) && in_array( $name, $uris, true ) ) {
1438+
$value = wp_kses_bad_protocol( $value, $allowed_protocols );
1439+
}
15381440

1539-
break;
1540-
} // End switch.
1441+
// Reconstruct and normalize the attribute value.
1442+
$syntax_characters = array(
1443+
'&' => '&amp;',
1444+
'<' => '&lt;',
1445+
'>' => '&gt;',
1446+
"'" => '&apos;',
1447+
'"' => '&quot;',
1448+
);
15411449

1542-
if ( 0 === $working ) { // Not well-formed, remove and try again.
1543-
$attr = wp_kses_html_error( $attr );
1544-
$mode = 0;
1545-
}
1546-
} // End while.
1450+
$recoded = $is_bool ? '' : strtr( $value, $syntax_characters );
1451+
$whole = $is_bool ? $name : "{$name}=\"{$recoded}\"";
15471452

1548-
if ( 1 === $mode && false === array_key_exists( $attrname, $attrarr ) ) {
1549-
/*
1550-
* Special case, for when the attribute list ends with a valueless
1551-
* attribute like "selected".
1552-
*/
1553-
$attrarr[ $attrname ] = array(
1554-
'name' => $attrname,
1555-
'value' => '',
1556-
'whole' => $attrname,
1557-
'vless' => 'y',
1453+
// @todo What security issues need review on the names?
1454+
$attributes[ $name ] = array(
1455+
'name' => $name,
1456+
'value' => $recoded,
1457+
'whole' => $whole,
1458+
'vless' => $is_bool ? 'y' : 'n',
15581459
);
15591460
}
15601461

1561-
return $attrarr;
1462+
return $attributes;
15621463
}
15631464

15641465
/**

0 commit comments

Comments
 (0)