* @license     GNU General Public License version 2 or later; see LICENSE.txt
 */

namespace Joomla\Component\Finder\Administrator\Indexer\Parser;

use Joomla\Component\Finder\Administrator\Indexer\Parser;

// phpcs:disable PSR1.Files.SideEffects
\defined('_JEXEC') or die;
// phpcs:enable PSR1.Files.SideEffects

/**
 * HTML Parser class for the Finder indexer package.
 *
 * @since  2.5
 */
class Html extends Parser
{
    /**
     * Method to parse input and extract the plain text. Because this method is
     * called from both inside and outside the indexer, it needs to be able to
     * batch out its parsing functionality to deal with the inefficiencies of
     * regular expressions. We will parse recursively in 2KB chunks.
     *
     * The HTML specific clean-up (invalid UTF-8, head/script/style/noscript
     * blocks, entities, tags) is done here; the resulting plain text is then
     * handed to parent::parse().
     *
     * @param   string  $input  The input to parse.
     *
     * @return  string  The plain text input.
     *
     * @since   2.5
     */
    public function parse($input)
    {
        // Strip invalid UTF-8 characters. The substitute character is switched to
        // 'none' only for the duration of the conversion and is restored even if
        // the conversion throws.
        $oldSetting = ini_get('mbstring.substitute_character');
        ini_set('mbstring.substitute_character', 'none');

        try {
            $input = mb_convert_encoding($input, 'UTF-8', 'UTF-8');
        } finally {
            ini_set('mbstring.substitute_character', $oldSetting);
        }

        // Remove anything between <head> and </head> tags. Do this first
        // because there might be <script> or <style> tags nested inside.
        $input = $this->removeBlocks($input, '<head>', '</head>');

        // Convert <style> and <noscript> tags to <script> tags
        // so we can remove them efficiently in a single pass.
        $search = [
            '<style',
            '</style',
            '<noscript',
            '</noscript',
        ];
        $replace = [
            '<script',
            '</script',
            '<script',
            '</script',
        ];
        $input = str_replace($search, $replace, $input);

        // Strip all script blocks.
        $input = $this->removeBlocks($input, '<script', '</script>');

        // Decode HTML entities.
        $input = html_entity_decode($input, ENT_QUOTES, 'UTF-8');

        // Convert entities equivalent to spaces to actual spaces. Entities that are
        // still present at this point were double encoded in the original input.
        $input = str_replace(['&nbsp;', '&#160;'], ' ', $input);

        // Add a space before both the OPEN and CLOSE tags of BLOCK and LINE BREAKING elements,
        // e.g. 'all<h1><em>m</em>obile List</h1>' will become 'all mobile List'
        $input = preg_replace(
            '/(<|<\/)(' .
            'address|article|aside|blockquote|br|canvas|dd|div|dl|dt|' .
            'fieldset|figcaption|figure|footer|form|h1|h2|h3|h4|h5|h6|header|hgroup|hr|li|' .
            'main|nav|noscript|ol|output|p|pre|section|table|tfoot|ul|video' .
            ')\b/i',
            ' $1$2',
            $input
        );

        // Strip HTML tags.
        $input = strip_tags($input);

        return parent::parse($input);
    }

    /**
     * Method to process HTML input and extract the plain text.
     *
     * @param   string  $input  The input to process.
     *
     * @return  string  The plain text input.
     *
     * @since   2.5
     */
    protected function process($input)
    {
        // Replace any amount of white space with a single space.
        $result = preg_replace('#\s+#u', ' ', $input);

        // With the "u" modifier preg_replace() returns null when the subject is not
        // valid UTF-8. Retry byte-wise so the text is not silently dropped.
        if ($result === null) {
            $result = preg_replace('#\s+#', ' ', $input);
        }

        // Last resort: hand back the input untouched rather than null.
        return $result ?? (string) $input;
    }

    /**
     * Method to remove blocks of text between a start and an end tag.
     * Each block removed is effectively replaced by a single space.
     *
     * Note: The start tag and the end tag must be different.
     * Note: Blocks must not be nested.
     * Note: This method will function correctly with multi-byte strings.
     * Note: A start tag without a matching end tag is kept, but a single space
     *       is still inserted in front of it.
     *
     * @param   string  $input     String to be processed.
     * @param   string  $startTag  String representing the start tag.
     * @param   string  $endTag    String representing the end tag.
     *
     * @return  string  with blocks removed.
     *
     * @since   3.4
     */
    private function removeBlocks($input, $startTag, $endTag)
    {
        // Find the first start tag (case-insensitive).
        $start = stripos($input, $startTag);

        // If no start tags were found, return the string unchanged.
        if ($start === false) {
            return $input;
        }

        $return         = '';
        $offset         = 0;
        $startTagLength = strlen($startTag);
        $endTagLength   = strlen($endTag);

        // Look for all blocks defined by the start and end tags.
        while ($start !== false) {
            // Accumulate the substring up to the start tag, followed by the
            // space that stands in for the removed block.
            $return .= substr($input, $offset, $start - $offset) . ' ';

            // Look for an end tag corresponding to the start tag.
            $end = stripos($input, $endTag, $start + $startTagLength);

            // If no corresponding end tag, keep the remainder of the string.
            if ($end === false) {
                // Resume copying at the start tag so nothing is lost or duplicated.
                $offset = $start;

                break;
            }

            // Skip past the end tag.
            $offset = $end + $endTagLength;

            // Look for the next start tag and loop.
            $start = stripos($input, $startTag, $offset);
        }

        // Add in the final substring after the last end tag.
        $return .= substr($input, $offset);

        return $return;
    }
}