From bab77fadd6013e2b83f3eb25a28cfbfe9e7a0aa1 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 13 Sep 2023 23:45:03 -0700 Subject: [PATCH 1/3] WIP: HTML API: Extract previous text and HTML chunks while processing. The HTML API should be able to provide the ability to generate excerpts from HTML documents given a specific maximum length. In this patch we're exploring the addition of text and HTML chunks that can be extracted while processing in order to do just this. The text chunks are similar to `.textContent` on the DOM while the HTML chunks contain raw and unprocessed HTML. These functions should likely remain low-level in the Tag Processor and be exposed from the HTML Processor to ensure that proper semantics are heeded when extracting this information, such as how `PRE` tags ignore a leading newline inside their content or how `SCRIPT` and `STYLE` content isn't part of what we want with something like `strip_tags()`. In the process of this work it's evident again that the Tag Processor ought to expose the ability to visit every token and non-tag tokens should be classified. This has already been explored in dmsnell/wordpress-develop#7. --- .../html-api/class-wp-html-tag-processor.php | 89 ++++++++++++++ .../wpHtmlProcessor-stringBuilder.php | 114 ++++++++++++++++++ 2 files changed, 203 insertions(+) create mode 100644 tests/phpunit/tests/html-api/wpHtmlProcessor-stringBuilder.php diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 79d96dc8be3f1..e6ac98daae6ab 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -378,6 +378,15 @@ class WP_HTML_Tag_Processor { */ private $is_closing_tag; + /** + * Stores the position of the last-matched tag, or the start of the document if not matched yet. 
+ * + * @var WP_HTML_Span + */ + private $last_position = null; + + private $last_token_end = 0; + /** * Lazily-built index of attributes found within an HTML tag, keyed by the attribute name. * @@ -507,6 +516,8 @@ class WP_HTML_Tag_Processor { */ public function __construct( $html ) { $this->html = $html; + + $this->last_position = new WP_HTML_Span( 0, 0 ); } /** @@ -530,6 +541,16 @@ public function next_tag( $query = null ) { $this->parse_query( $query ); $already_found = 0; + if ( null !== $this->tag_name_starts_at ) { + $rewind_amount = $this->is_closing_tag ? 2 : 1; + $before_tag = $this->tag_name_starts_at - $rewind_amount; + $end_of_tag = $this->tag_ends_at; + + $this->last_position->start = $before_tag; + $this->last_position->end = $end_of_tag; + $this->last_token_end = $this->tag_ends_at + 1; + } + do { if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { return false; @@ -1876,6 +1897,74 @@ public function is_tag_closer() { return $this->is_closing_tag; } + /** + * Returns the chunk of text from the end of the preceding tag or token to the + * start of the matched tag or token, with decoded character references. + * + * Example: + * + * $q = array( 'tag_closers' => 'visit' ); + * $processor = new WP_HTML_Tag_Processor( 'Before
Inside
After' ); + * $processor->next_tag( $q ); 'Before' === $processor->get_prev_text_chunk(); + * $processor->next_tag( $q ); 'Inside' === $processor->get_prev_text_chunk(); + * $processor->next_tag( $q ); 'After' === $processor->get_prev_text_chunk(); + * + * @since 6.4.0 + * + * @return string|null Chunk of text from end of last token to current token, or NULL if not yet matched. + */ + public function get_previous_text_chunk() { + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + $chunk = substr( $this->html, $this->last_position->end === 0 ? 0 : $this->last_position->end + 1 ); + $chunk = preg_replace( '/<[^a-z].*>/i', '', $chunk ); + return html_entity_decode( $chunk, ENT_HTML5 | ENT_QUOTES | ENT_SUBSTITUTE ); + } + + if ( ! $this->tag_name_starts_at ) { + return null; + } + + $chunk_start = $this->last_position->end === 0 ? 0 : $this->last_position->end + 1; + $chunk_end = $this->is_tag_closer() ? $this->tag_name_starts_at - 2 : $this->tag_name_starts_at - 1; + $chunk = substr( $this->html, $chunk_start, $chunk_end - $chunk_start ); + $chunk = preg_replace( '/<[^a-z].*>/i', '', $chunk ); + return html_entity_decode( $chunk, ENT_HTML5 | ENT_QUOTES | ENT_SUBSTITUTE ); + } + + /** + * Returns the chunk of html from the start of the preceding tag or token to the + * start of the matched tag or token, without decoded character references. + * + * Example: + * + * $q = array( 'tag_closers' => 'visit' ); + * $processor = new WP_HTML_Tag_Processor( 'Before
Inside
After' ); + * $processor->next_tag( $q ); 'Before' === $processor->get_prev_text_chunk(); + * $processor->next_tag( $q ); '
Inside' === $processor->get_prev_text_chunk(); + * $processor->next_tag( $q ); '
After' === $processor->get_prev_text_chunk(); + * + * @since 6.4.0 + * + * @return array|null Chunk of text from end of last token to current token, or NULL if not yet matched. + */ + public function get_previous_html_chunk() { + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + $html = substr( $this->html, $this->last_position->start, $this->last_token_end - $this->last_position->start ); + $text = substr( $this->html, $this->last_token_end ); + + return array( $html, $text ); + } + + if ( ! $this->tag_name_starts_at ) { + return null; + } + + $html = substr( $this->html, $this->last_position->start, $this->last_token_end - $this->last_position->start ); + $text = substr( $this->html, $this->last_token_end, ( $this->is_tag_closer() ? $this->tag_name_starts_at - 2 : $this->tag_name_starts_at - 1 ) - $this->last_token_end ); + + return array( $html, $text ); + } + /** * Updates or creates a new attribute on the currently matched tag with the passed value. * diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-stringBuilder.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-stringBuilder.php new file mode 100644 index 0000000000000..4511e093a60ad --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-stringBuilder.php @@ -0,0 +1,114 @@ +next_tag( array( 'tag_closers' => 'visit' ) ) ) { + $extracted_text_content .= $processor->get_previous_text_chunk(); + } + $extracted_text_content .= $processor->get_previous_text_chunk(); + + $this->assertEquals( $text_content, $extracted_text_content, 'Extracted unexpected text content.' ); + } + + /** + * Data provider. + * + * @return array[]. + */ + public function data_html_and_associated_text_content() { + return array( + 'Basic text without HTML.' => array( 'This is plain text.', 'This is plain text.' ), + 'Basic text with a character reference.' => array( 'A < B', 'A < B' ), + 'Text before tag.' => array( 'Before', 'Before' ), + 'Text after tag.' => array( 'After', 'After' ), + 'Text inside tag.' 
=> array( '
Inside
', 'Inside' ), + 'Text around tag.' => array( 'In the jungle.', 'In the jungle.' ), + 'Text interrupted by many tags.' => array( 'A wild adventure awaits.', 'A wild adventure awaits.' ), + 'Text with comment inside it.' => array( 'Ignore comment.', 'Ignore comment.' ), + 'Text with empty comment inside it.' => array( 'Ignore comment.', 'Ignore comment.' ), + 'Text with invalid comment inside it.' => array( 'Ignore comment.', 'Ignore comment.' ), + 'Skipping SCRIPT content.' => array( '
This in the output.', 'This in the output.' ), + ); + } + + /** + * @ticket {TICKET_NUMBER} + * + * @dataProvider data_html_and_associated_html_content + * + * @param string $html HTML containing text that should be extracted. + * @param int $max_code_points Stop iterating after this many code points have been extracted. + * @param string $html_content Full HTML containing text of max code point length from input. + */ + public function test_extracts_html_chunks_properly( $html, $max_code_points, $html_content ) { + $processor = new WP_HTML_Tag_Processor( $html ); + + $code_points = 0; + $extracted_html_content = ''; + while ( $processor->next_tag( array( 'tag_closers' => 'visit' ) ) ) { + $text_chunk = $processor->get_previous_text_chunk(); + $chunk_cps = mb_strlen( $text_chunk ); + list( $html, $text ) = $processor->get_previous_html_chunk(); + $extracted_html_content .= $html; + if ( 0 === $max_code_points || $code_points + $chunk_cps <= $max_code_points ) { + $extracted_html_content .= $text; + $code_points += $chunk_cps; + } else { + break; + } + } + + $text_chunk = $processor->get_previous_text_chunk(); + $chunk_cps = mb_strlen( $text_chunk ); + list( $html, $text ) = $processor->get_previous_html_chunk(); + $extracted_html_content .= $html; + if ( 0 === $max_code_points || $code_points + $chunk_cps <= $max_code_points ) { + $extracted_html_content .= $text; + } + + $this->assertEquals( $html_content, $extracted_html_content, 'Extracted unexpected HTML content.' ); + } + + /** + * Data provider. + * + * @return array[]. + */ + public function data_html_and_associated_html_content() { + return array( + 'Basic text without HTML.' => array( 'This is plain text.', 0, 'This is plain text.' ), + 'Basic text without HTML (too long).' => array( 'This is plain text.', 8, '' ), + 'Basic text with a character reference.' => array( 'A < B', 0, 'A < B' ), + 'Character reference wider than text' => array( 'A < B', 5, 'A < B' ), + 'Text before tag.' 
=> array( 'Before', 0, 'Before' ), + 'Text after tag.' => array( 'After', 0, 'After' ), + 'Text inside tag.' => array( '
Inside
', 0, '
Inside
' ), + 'Text around tag.' => array( 'In the jungle.', 0, 'In the jungle.' ), + 'Text interrupted by many tags.' => array( 'A wild adventure awaits.', 0, 'A wild adventure awaits.' ), + 'Text interrupted by many tags (long).' => array( 'A wild adventure awaits.', 16, 'A wild adventure' ), + 'Text with comment inside it.' => array( 'Ignore comment.', 0, 'Ignore comment.' ), + ); + } +} From 340418b26982a735a0f5a90b7f1e0253295b35e9 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 14 Sep 2023 01:02:07 -0700 Subject: [PATCH 2/3] Primitive max-word-count HTML excerpt. --- .../wpHtmlProcessor-stringBuilder.php | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-stringBuilder.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-stringBuilder.php index 4511e093a60ad..cdfedc3fe579f 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor-stringBuilder.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-stringBuilder.php @@ -111,4 +111,61 @@ public function data_html_and_associated_html_content() { 'Text with comment inside it.' => array( 'Ignore comment.', 0, 'Ignore comment.' 
), ); } + + /** + * @dataProvider data_html_with_locale_and_excerpt + * + * @param $html + * @param $locale + * @param $word_count + * @return void + */ + public function test_excerpt_of_so_many_words( $html, $locale, $max_word_count, $html_excerpt ) { + $processor = new WP_HTML_Tag_Processor( $html ); + + $excerpt_text = ''; + $excerpt = ''; + $words = IntlBreakIterator::createWordInstance( $locale ); + + while ( $processor->next_tag( array( 'tag_closers' => 'visit' ) ) ) { + $word_count = 0; + $excerpt_text .= $processor->get_previous_text_chunk(); + $words->setText( $excerpt_text ); + + list( $html, $text ) = $processor->get_previous_html_chunk(); + $excerpt .= $html; + foreach ( $words as $_ ) { + if ( IntlRuleBasedBreakIterator::WORD_NONE !== $words->getRuleStatus() ) { + $word_count++; + } + + if ( $word_count > $max_word_count ) { + break 2; + } + } + $excerpt .= $text; + } + if ( $word_count <= $max_word_count ) { + list( $html, $text ) = $processor->get_previous_html_chunk(); + $excerpt .= $html; + } + + $this->assertEquals( $html_excerpt, $excerpt, 'Extracted wrong excerpt from document.' ); + } + + /** + * Data provider. + * + * @return array[]. + */ + public function data_html_with_locale_and_excerpt() { + return array( + array( '
This is a with great ability to inspire.
', 'en_US', 3, '
This is a ' ), + array( '
This is a with great ability to inspire.
', 'en_US', 4, '
This is a with ' ), + array( 'What a Thing', 'en_US', 2, 'What a ' ), + array( 'アメリカ人です。', 'jp_JP', 2, '' ), + array( 'アメリカ人です。', 'jp_JP', 4, 'アメリカ人' ), + array( '
שְׁמַע יִשְׂרָאֵל
יְהוָה אֱלֹהֵינוּ
יְהוָה אֶחָֽד
', 'he_IL', 2, '
שְׁמַע יִשְׂרָאֵל
' ), + ); + } } From e4d7d067172d68874882394c2962210ca972cea5 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 18 Sep 2023 16:21:39 -0700 Subject: [PATCH 3/3] Remove quick workaround for removing comments. --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index e6ac98daae6ab..2b9a084998da7 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1916,7 +1916,6 @@ public function is_tag_closer() { public function get_previous_text_chunk() { if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { $chunk = substr( $this->html, $this->last_position->end === 0 ? 0 : $this->last_position->end + 1 ); - $chunk = preg_replace( '/<[^a-z].*>/i', '', $chunk ); return html_entity_decode( $chunk, ENT_HTML5 | ENT_QUOTES | ENT_SUBSTITUTE ); } @@ -1927,7 +1926,6 @@ public function get_previous_text_chunk() { $chunk_start = $this->last_position->end === 0 ? 0 : $this->last_position->end + 1; $chunk_end = $this->is_tag_closer() ? $this->tag_name_starts_at - 2 : $this->tag_name_starts_at - 1; $chunk = substr( $this->html, $chunk_start, $chunk_end - $chunk_start ); - $chunk = preg_replace( '/<[^a-z].*>/i', '', $chunk ); return html_entity_decode( $chunk, ENT_HTML5 | ENT_QUOTES | ENT_SUBSTITUTE ); }