This in the output.', 'This in the output.' ),
+ );
+ }
+
+ /**
+  * Ensures that walking a document tag-by-tag reassembles the expected HTML
+  * while staying within a budget of text code points.
+  *
+  * For every visited tag (closers included) the first element returned by
+  * `get_previous_html_chunk()` is always appended, while the second element is
+  * appended only if the running code point total stays within `$max_code_points`.
+  * A limit of `0` disables the budget entirely.
+  *
+  * @ticket {TICKET_NUMBER}
+  *
+  * @dataProvider data_html_and_associated_html_content
+  *
+  * @param string $html            HTML containing text that should be extracted.
+  * @param int    $max_code_points Stop iterating after this many code points have been extracted.
+  * @param string $html_content    Full HTML containing text of max code point length from input.
+  */
+ public function test_extracts_html_chunks_properly( $html, $max_code_points, $html_content ) {
+     $processor = new WP_HTML_Tag_Processor( $html );
+
+     $code_points            = 0;
+     $extracted_html_content = '';
+     while ( $processor->next_tag( array( 'tag_closers' => 'visit' ) ) ) {
+         // Count UTF-8 code points explicitly instead of relying on the configured internal encoding.
+         $chunk_cps = mb_strlen( $processor->get_previous_text_chunk(), 'UTF-8' );
+
+         // Dedicated locals keep the `$html` test input from being overwritten.
+         list( $chunk_markup, $chunk_text ) = $processor->get_previous_html_chunk();
+         $extracted_html_content           .= $chunk_markup;
+
+         // Stop as soon as this chunk's text would exceed the budget (0 means "no limit").
+         if ( 0 !== $max_code_points && $code_points + $chunk_cps > $max_code_points ) {
+             break;
+         }
+
+         $extracted_html_content .= $chunk_text;
+         $code_points            += $chunk_cps;
+     }
+
+     /*
+      * Handle whatever the processor reports once the loop has ended.
+      *
+      * NOTE(review): this also runs after an early `break` above, in which case
+      * the same chunk may be fetched a second time; confirm against the
+      * processor that this cannot append the markup twice.
+      */
+     $chunk_cps = mb_strlen( $processor->get_previous_text_chunk(), 'UTF-8' );
+
+     list( $chunk_markup, $chunk_text ) = $processor->get_previous_html_chunk();
+     $extracted_html_content           .= $chunk_markup;
+     if ( 0 === $max_code_points || $code_points + $chunk_cps <= $max_code_points ) {
+         $extracted_html_content .= $chunk_text;
+     }
+
+     // Strict comparison: both values are strings and must match exactly.
+     $this->assertSame( $html_content, $extracted_html_content, 'Extracted unexpected HTML content.' );
+ }
+
+ /**
+ * Data provider for test_extracts_html_chunks_properly().
+ *
+ * Each dataset is keyed by a short description and holds, in order: the input
+ * HTML, the maximum number of code points to extract (the consuming test treats
+ * `0` as "no limit"), and the HTML content expected from the extraction.
+ *
+ * @return array[] Datasets of the form array( string $html, int $max_code_points, string $html_content ).
+ */
+ public function data_html_and_associated_html_content() {
+ return array(
+ 'Basic text without HTML.' => array( 'This is plain text.', 0, 'This is plain text.' ),
+ 'Basic text without HTML (too long).' => array( 'This is plain text.', 8, '' ),
+ 'Basic text with a character reference.' => array( 'A < B', 0, 'A < B' ),
+ 'Character reference wider than text' => array( 'A < B', 5, 'A < B' ),
+ 'Text before tag.' => array( 'Before
![]()
', 0, 'Before
![]()
' ),
+ 'Text after tag.' => array( '
![]()
After', 0, '
![]()
After' ),
+ 'Text inside tag.' => array( '
Inside
', 0, '
Inside
' ),
+ 'Text around tag.' => array( 'In
the jungle.', 0, 'In
the jungle.' ),
+ 'Text interrupted by many tags.' => array( 'A
wild
adventure awaits.', 0, 'A wild
adventure awaits.' ),
+ 'Text interrupted by many tags (long).' => array( 'A wild
adventure awaits.', 16, 'A wild
adventure' ),
+ 'Text with comment inside it.' => array( 'Ignore comment.', 0, 'Ignore comment.' ),
+ );
+ }
+
+ /**
+  * Ensures an excerpt limited to a maximum number of words can be built from
+  * a document while keeping the markup surrounding the retained text.
+  *
+  * Word boundaries are found with `IntlBreakIterator`, so the test is skipped
+  * when the `intl` extension is not loaded.
+  *
+  * @dataProvider data_html_with_locale_and_excerpt
+  *
+  * @param string $html           HTML to build the excerpt from.
+  * @param string $locale         Locale handed to the word break iterator.
+  * @param int    $max_word_count Maximum number of words allowed in the excerpt.
+  * @param string $html_excerpt   Expected excerpt.
+  * @return void
+  */
+ public function test_excerpt_of_so_many_words( $html, $locale, $max_word_count, $html_excerpt ) {
+     if ( ! class_exists( 'IntlBreakIterator' ) ) {
+         $this->markTestSkipped( 'The intl extension is required to find word boundaries.' );
+     }
+
+     $processor = new WP_HTML_Tag_Processor( $html );
+
+     $excerpt_text = '';
+     $excerpt      = '';
+     $words        = IntlBreakIterator::createWordInstance( $locale );
+
+     // Defined up front so the check after the loop is valid even when the document contains no tags.
+     $word_count = 0;
+
+     while ( $processor->next_tag( array( 'tag_closers' => 'visit' ) ) ) {
+         // The text accumulates across chunks and is re-scanned from its start, so the count restarts too.
+         $word_count    = 0;
+         $excerpt_text .= $processor->get_previous_text_chunk();
+         $words->setText( $excerpt_text );
+
+         // Dedicated locals keep the `$html` test input from being overwritten.
+         list( $chunk_markup, $chunk_text ) = $processor->get_previous_html_chunk();
+         $excerpt                          .= $chunk_markup;
+
+         foreach ( $words as $_ ) {
+             // Boundaries whose rule status is WORD_NONE are not counted as words.
+             if ( IntlRuleBasedBreakIterator::WORD_NONE !== $words->getRuleStatus() ) {
+                 $word_count++;
+             }
+
+             // Over budget: leave both loops without appending this chunk's text.
+             if ( $word_count > $max_word_count ) {
+                 break 2;
+             }
+         }
+
+         $excerpt .= $chunk_text;
+     }
+
+     // Only when the word budget was not exceeded is the processor's final chunk markup appended.
+     if ( $word_count <= $max_word_count ) {
+         list( $trailing_markup ) = $processor->get_previous_html_chunk();
+         $excerpt                .= $trailing_markup;
+     }
+
+     // Strict comparison: both values are strings and must match exactly.
+     $this->assertSame( $html_excerpt, $excerpt, 'Extracted wrong excerpt from document.' );
+ }
+
+ /**
+ * Data provider for test_excerpt_of_so_many_words().
+ *
+ * Each dataset holds, in order: the input HTML, the locale passed to the word
+ * break iterator, the maximum number of words, and the expected excerpt.
+ *
+ * @return array[] Datasets of the form array( string $html, string $locale, int $max_word_count, string $html_excerpt ).
+ */
+ public function data_html_with_locale_and_excerpt() {
+ return array(
+ array( 'This is a
![]()
with
great ability to inspire.
', 'en_US', 3, 'This is a
![]()
' ),
+ array( '
This is a
![]()
with
great ability to inspire.
', 'en_US', 4, '
This is a
![]()
with
' ),
+ array( 'What a Thing', 'en_US', 2, 'What a ' ),
+ // NOTE(review): 'jp_JP' is not a standard locale identifier (the language code for Japanese is 'ja', i.e. 'ja_JP'); presumably ICU falls back to a default locale here — confirm and correct if intended.
+ array( '彼はアメリカ人です。', 'jp_JP', 2, '彼は' ),
+ array( '彼はアメリカ人です。', 'jp_JP', 4, '彼はアメリカ人' ),
+ array( 'שְׁמַע יִשְׂרָאֵל
יְהוָה אֱלֹהֵינוּ
יְהוָה אֶחָֽד
', 'he_IL', 2, 'שְׁמַע יִשְׂרָאֵל
' ),
+ );
+ }
+}