Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions src/wp-includes/html-api/class-wp-html-tag-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,15 @@ class WP_HTML_Tag_Processor {
*/
private $is_closing_tag;

/**
 * Stores the position of the last-matched tag, or the start of the document if not matched yet.
 *
 * The constructor initializes this to an empty span at offset zero. Each call to
 * `next_tag()` that starts while a tag is already matched overwrites `start` with
 * the offset just before that tag's name (one byte back for an opener, two for a
 * closer) and `end` with that tag's `tag_ends_at`.
 *
 * @var WP_HTML_Span
 */
private $last_position = null;

/**
 * Byte offset one past `tag_ends_at` of the previously-matched tag.
 *
 * Remains zero until `next_tag()` is called while a tag is already matched.
 *
 * @var int
 */
private $last_token_end = 0;

/**
* Lazily-built index of attributes found within an HTML tag, keyed by the attribute name.
*
Expand Down Expand Up @@ -507,6 +516,8 @@ class WP_HTML_Tag_Processor {
*/
public function __construct( $html ) {
	// Nothing has been matched yet: the "previous tag" is an empty span at the very start of the document.
	$this->last_position = new WP_HTML_Span( 0, 0 );
	$this->html          = $html;
}

/**
Expand All @@ -530,6 +541,16 @@ public function next_tag( $query = null ) {
$this->parse_query( $query );
$already_found = 0;

if ( null !== $this->tag_name_starts_at ) {
$rewind_amount = $this->is_closing_tag ? 2 : 1;
$before_tag = $this->tag_name_starts_at - $rewind_amount;
$end_of_tag = $this->tag_ends_at;

$this->last_position->start = $before_tag;
$this->last_position->end = $end_of_tag;
$this->last_token_end = $this->tag_ends_at + 1;
}

do {
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
return false;
Expand Down Expand Up @@ -1876,6 +1897,72 @@ public function is_tag_closer() {
return $this->is_closing_tag;
}

/**
 * Returns the decoded text found between the previously-matched tag and the
 * currently-matched tag.
 *
 * Once the processor has consumed the whole document, the returned text is
 * instead everything following the previously-matched tag up to the end of
 * the document. Character references are decoded with `html_entity_decode()`
 * using `ENT_HTML5 | ENT_QUOTES | ENT_SUBSTITUTE`.
 *
 * Example:
 *
 *     $q         = array( 'tag_closers' => 'visit' );
 *     $processor = new WP_HTML_Tag_Processor( 'Before<div>Inside</div>After' );
 *     $processor->next_tag( $q ); 'Before' === $processor->get_previous_text_chunk();
 *     $processor->next_tag( $q ); 'Inside' === $processor->get_previous_text_chunk();
 *     $processor->next_tag( $q ); 'After'  === $processor->get_previous_text_chunk();
 *
 * @since 6.4.0
 *
 * @return string|null Decoded text since the previous tag, or NULL when no tag is
 *                     matched and the end of the document has not been reached.
 */
public function get_previous_text_chunk() {
	$reached_document_end = $this->bytes_already_parsed >= strlen( $this->html );

	if ( ! $reached_document_end && ! $this->tag_name_starts_at ) {
		return null;
	}

	// A zero end offset means no previous tag was recorded, so text starts at the top of the document.
	$previous_tag_end = $this->last_position->end;
	$text_starts_at   = 0 === $previous_tag_end ? 0 : $previous_tag_end + 1;

	if ( $reached_document_end ) {
		$raw_text = substr( $this->html, $text_starts_at );
	} else {
		// Step back over the `<` of an opening tag, or the `</` of a closing tag.
		$opener_length = $this->is_tag_closer() ? 2 : 1;
		$text_ends_at  = $this->tag_name_starts_at - $opener_length;
		$raw_text      = substr( $this->html, $text_starts_at, $text_ends_at - $text_starts_at );
	}

	return html_entity_decode( $raw_text, ENT_HTML5 | ENT_QUOTES | ENT_SUBSTITUTE );
}

/**
 * Returns the raw HTML of the previously-matched tag together with the raw
 * text that follows it, up to the start of the currently-matched tag.
 *
 * Once the processor has consumed the whole document, the text part instead
 * runs to the end of the document. Character references are NOT decoded.
 *
 * Example:
 *
 *     $q         = array( 'tag_closers' => 'visit' );
 *     $processor = new WP_HTML_Tag_Processor( 'Before<div>Inside</div>After' );
 *
 *     $processor->next_tag( $q );
 *     list( $html, $text ) = $processor->get_previous_html_chunk();
 *     'Before' === $html . $text;
 *
 *     $processor->next_tag( $q );
 *     list( $html, $text ) = $processor->get_previous_html_chunk();
 *     '<div>Inside' === $html . $text;
 *
 * @since 6.4.0
 *
 * @return array|null Two-element array of `( $html, $text )` as raw substrings of the
 *                    document, or NULL when no tag is matched and the end of the
 *                    document has not been reached.
 */
public function get_previous_html_chunk() {
	$reached_document_end = $this->bytes_already_parsed >= strlen( $this->html );

	if ( ! $reached_document_end && ! $this->tag_name_starts_at ) {
		return null;
	}

	$token_starts_at = $this->last_position->start;
	$token_ends_at   = $this->last_token_end;
	$token_html      = substr( $this->html, $token_starts_at, $token_ends_at - $token_starts_at );

	if ( $reached_document_end ) {
		$trailing_text = substr( $this->html, $token_ends_at );
	} else {
		// Step back over the `<` of an opening tag, or the `</` of a closing tag.
		$opener_length = $this->is_tag_closer() ? 2 : 1;
		$text_ends_at  = $this->tag_name_starts_at - $opener_length;
		$trailing_text = substr( $this->html, $token_ends_at, $text_ends_at - $token_ends_at );
	}

	return array( $token_html, $trailing_text );
}

/**
* Updates or creates a new attribute on the currently matched tag with the passed value.
*
Expand Down
171 changes: 171 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlProcessor-stringBuilder.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
<?php
/**
* Unit tests covering WP_HTML_Processor string building functionality.
*
* @package WordPress
* @subpackage HTML-API
*/

/**
 * Tests for rebuilding text and HTML strings from the chunks exposed by the Tag Processor.
 *
 * @group html-api
 *
 * @coversDefaultClass WP_HTML_Tag_Processor
 */
class Tests_HtmlApi_WpHtmlProcessor_StringBuilder extends WP_UnitTestCase {
	/**
	 * Ensures that concatenating the text chunk preceding every tag, plus the
	 * chunk trailing the final tag, reproduces the expected plaintext.
	 *
	 * @ticket {TICKET_NUMBER}
	 *
	 * @covers ::get_previous_text_chunk
	 *
	 * @dataProvider data_html_and_associated_text_content
	 *
	 * @param string $html         HTML containing text that should be extracted.
	 * @param string $text_content Plaintext content represented inside the given HTML.
	 */
	public function test_extracts_text_chunks_properly( $html, $text_content ) {
		$processor = new WP_HTML_Tag_Processor( $html );

		$extracted_text_content = '';
		while ( $processor->next_tag( array( 'tag_closers' => 'visit' ) ) ) {
			$extracted_text_content .= $processor->get_previous_text_chunk();
		}

		// Once scanning has finished, the previous chunk is the text trailing the final tag.
		$extracted_text_content .= $processor->get_previous_text_chunk();

		$this->assertSame( $text_content, $extracted_text_content, 'Extracted unexpected text content.' );
	}

	/**
	 * Data provider.
	 *
	 * @return array[]
	 */
	public static function data_html_and_associated_text_content() {
		return array(
			'Basic text without HTML.'               => array( 'This is plain text.', 'This is plain text.' ),
			'Basic text with a character reference.' => array( 'A &lt; B', 'A < B' ),
			'Text before tag.'                       => array( 'Before<img>', 'Before' ),
			'Text after tag.'                        => array( '<img>After', 'After' ),
			'Text inside tag.'                       => array( '<div>Inside</div>', 'Inside' ),
			'Text around tag.'                       => array( 'In <em>the</em> jungle.', 'In the jungle.' ),
			'Text interrupted by many tags.'         => array( 'A <em>wild <a><img><span>adventure</span></a> awaits.', 'A wild adventure awaits.' ),
			'Text with comment inside it.'           => array( 'Ignore <!-- everything inside this --> comment.', 'Ignore comment.' ),
			'Text with empty comment inside it.'     => array( 'Ignore <!--> comment.', 'Ignore comment.' ),
			'Text with invalid comment inside it.'   => array( 'Ignore </^$%> comment.', 'Ignore comment.' ),
			'Skipping SCRIPT content.'               => array( '<div>This <script>does not exist</script> in the output.', 'This in the output.' ),
		);
	}

	/**
	 * Ensures that HTML can be rebuilt chunk by chunk, stopping before the text
	 * chunk that would push the decoded length past a code point budget.
	 *
	 * @ticket {TICKET_NUMBER}
	 *
	 * @covers ::get_previous_text_chunk
	 * @covers ::get_previous_html_chunk
	 *
	 * @dataProvider data_html_and_associated_html_content
	 *
	 * @param string $html            HTML containing text that should be extracted.
	 * @param int    $max_code_points Stop iterating after this many code points have been extracted; zero means no limit.
	 * @param string $html_content    Full HTML containing text of max code point length from input.
	 */
	public function test_extracts_html_chunks_properly( $html, $max_code_points, $html_content ) {
		$processor = new WP_HTML_Tag_Processor( $html );

		$code_points            = 0;
		$extracted_html_content = '';

		/*
		 * The final pass runs after `next_tag()` has failed, which is when the chunk
		 * trailing the last tag becomes available. Handling it inside the same loop
		 * guarantees each chunk is appended once, even when the budget stops the loop early.
		 */
		do {
			$found_tag = $processor->next_tag( array( 'tag_closers' => 'visit' ) );

			$decoded_text                = $processor->get_previous_text_chunk();
			$chunk_cps                   = mb_strlen( $decoded_text );
			list( $raw_html, $raw_text ) = $processor->get_previous_html_chunk();

			// The tag itself carries no text, so it is always kept.
			$extracted_html_content .= $raw_html;

			if ( 0 !== $max_code_points && $code_points + $chunk_cps > $max_code_points ) {
				break;
			}

			$extracted_html_content .= $raw_text;
			$code_points            += $chunk_cps;
		} while ( $found_tag );

		$this->assertSame( $html_content, $extracted_html_content, 'Extracted unexpected HTML content.' );
	}

	/**
	 * Data provider.
	 *
	 * @return array[]
	 */
	public static function data_html_and_associated_html_content() {
		return array(
			'Basic text without HTML.'               => array( 'This is plain text.', 0, 'This is plain text.' ),
			'Basic text without HTML (too long).'    => array( 'This is plain text.', 8, '' ),
			'Basic text with a character reference.' => array( 'A &lt; B', 0, 'A &lt; B' ),
			'Character reference wider than text'    => array( 'A &lt; B', 5, 'A &lt; B' ),
			'Text before tag.'                       => array( 'Before<img>', 0, 'Before<img>' ),
			'Text after tag.'                        => array( '<img>After', 0, '<img>After' ),
			'Text inside tag.'                       => array( '<div>Inside</div>', 0, '<div>Inside</div>' ),
			'Text around tag.'                       => array( 'In <em>the</em> jungle.', 0, 'In <em>the</em> jungle.' ),
			'Text interrupted by many tags.'         => array( 'A <em>wild <a><img><span>adventure</span></a> awaits.', 0, 'A <em>wild <a><img><span>adventure</span></a> awaits.' ),
			'Text interrupted by many tags (long).'  => array( 'A <em>wild <a><img><span>adventure</span></a> awaits.', 16, 'A <em>wild <a><img><span>adventure</span></a>' ),
			'Text with comment inside it.'           => array( 'Ignore <!-- everything inside this --> comment.', 0, 'Ignore <!-- everything inside this --> comment.' ),
		);
	}

	/**
	 * Ensures that an HTML excerpt can be cut off after a given number of words,
	 * using locale-aware word segmentation on the accumulated decoded text.
	 *
	 * @ticket {TICKET_NUMBER}
	 *
	 * @requires extension intl
	 *
	 * @covers ::get_previous_text_chunk
	 * @covers ::get_previous_html_chunk
	 *
	 * @dataProvider data_html_with_locale_and_excerpt
	 *
	 * @param string $html           HTML from which to build the excerpt.
	 * @param string $locale         Locale passed to the word break iterator.
	 * @param int    $max_word_count Maximum number of words allowed in the excerpt.
	 * @param string $html_excerpt   Expected HTML excerpt.
	 */
	public function test_excerpt_of_so_many_words( $html, $locale, $max_word_count, $html_excerpt ) {
		$processor = new WP_HTML_Tag_Processor( $html );

		$excerpt_text = '';
		$excerpt      = '';
		$words        = IntlBreakIterator::createWordInstance( $locale );

		// Must be defined before the loop: a document without tags never enters it.
		$word_count = 0;

		while ( $processor->next_tag( array( 'tag_closers' => 'visit' ) ) ) {
			// Words are recounted over all of the text gathered so far.
			$word_count    = 0;
			$excerpt_text .= $processor->get_previous_text_chunk();
			$words->setText( $excerpt_text );

			list( $html_chunk, $text_chunk ) = $processor->get_previous_html_chunk();

			$excerpt .= $html_chunk;
			foreach ( $words as $_ ) {
				// Segments with a WORD_NONE status are not counted as words.
				if ( IntlRuleBasedBreakIterator::WORD_NONE !== $words->getRuleStatus() ) {
					++$word_count;
				}

				if ( $word_count > $max_word_count ) {
					break 2;
				}
			}
			$excerpt .= $text_chunk;
		}

		if ( $word_count <= $max_word_count ) {
			list( $html_chunk ) = $processor->get_previous_html_chunk();

			$excerpt .= $html_chunk;
		}

		$this->assertSame( $html_excerpt, $excerpt, 'Extracted wrong excerpt from document.' );
	}

	/**
	 * Data provider.
	 *
	 * NOTE(review): `jp_JP` below is not a registered locale identifier; the language
	 * code for Japanese is `ja`. Confirm whether `ja_JP` was intended.
	 *
	 * @return array[]
	 */
	public static function data_html_with_locale_and_excerpt() {
		return array(
			array( '<div>This is a <img> with <em>great</em> ability to inspire.</div>', 'en_US', 3, '<div>This is a <img>' ),
			array( '<div>This is a <img> with <em>great</em> ability to inspire.</div>', 'en_US', 4, '<div>This is a <img> with <em>' ),
			array( '<em>W</em>hat a <i>T</i>hing', 'en_US', 2, '<em>W</em>hat a <i>' ),
			array( '<span>彼</span>は<em>アメリカ人</em>です。', 'jp_JP', 2, '<span>彼</span>は<em>' ),
			array( '<span>彼</span>は<em>アメリカ人</em>です。', 'jp_JP', 4, '<span>彼</span>は<em>アメリカ人</em>' ),
			array( '<div>שְׁמַע יִשְׂרָאֵל<br> יְהוָה אֱלֹהֵינוּ<br> יְהוָה אֶחָֽד</div>', 'he_IL', 2, '<div>שְׁמַע יִשְׂרָאֵל<br>' ),
		);
	}
}