[ Index ] |
PHP Cross Reference of Joomla 4.2.2 documentation |
[Summary view] [Print] [Text view]
<?php

/**
 * @package     Joomla.Administrator
 * @subpackage  com_finder
 *
 * @copyright   (C) 2011 Open Source Matters, Inc. <https://www.joomla.org>
 * @license     GNU General Public License version 2 or later; see LICENSE.txt
 */

namespace Joomla\Component\Finder\Administrator\Indexer\Parser;

use Joomla\Component\Finder\Administrator\Indexer\Parser;

// phpcs:disable PSR1.Files.SideEffects
\defined('_JEXEC') or die;
// phpcs:enable PSR1.Files.SideEffects

/**
 * HTML Parser class for the Finder indexer package.
 *
 * Reduces an HTML document to plain text: drops <head>, <script>, <style>
 * and <noscript> blocks, decodes entities, strips the remaining tags and
 * collapses white space.
 *
 * @since  2.5
 */
class Html extends Parser
{
    /**
     * Method to parse input and extract the plain text.
     *
     * The HTML specific clean-up is done here in one pass; the resulting text
     * is then handed to the parent parser, which calls back into process().
     * NOTE(review): the parent is described as batching its work in 2KB
     * chunks - that logic lives in the parent class, not in this file.
     *
     * @param   string  $input  The input to parse.
     *
     * @return  string  The plain text input.
     *
     * @since   2.5
     */
    public function parse($input)
    {
        // Strip invalid UTF-8 characters: with the substitute character set to
        // "none", a UTF-8 to UTF-8 conversion silently drops malformed bytes.
        $oldSetting = ini_get('mbstring.substitute_character');
        ini_set('mbstring.substitute_character', 'none');

        try {
            $input = mb_convert_encoding($input, 'UTF-8', 'UTF-8');
        } finally {
            // Always restore the previous setting, even if the conversion throws.
            ini_set('mbstring.substitute_character', $oldSetting);
        }

        // Remove anything between <head> and </head> tags. Do this first
        // because there might be <script> or <style> tags nested inside.
        // The start tag is matched including '>' so that <header> is not hit.
        $input = $this->removeBlocks($input, '<head>', '</head>');

        // Convert <style> and <noscript> tags to <script> tags so all three can
        // be removed in a single pass. The conversion has to be case-insensitive
        // because removeBlocks() matches tags case-insensitively; otherwise the
        // content of e.g. <STYLE>...</STYLE> would survive into the plain text.
        $search  = [
            '<style', '</style',
            '<noscript', '</noscript',
        ];
        $replace = [
            '<script', '</script',
            '<script', '</script',
        ];
        $input = str_ireplace($search, $replace, $input);

        // Strip all script blocks.
        $input = $this->removeBlocks($input, '<script', '</script>');

        // Decode HTML entities.
        $input = html_entity_decode($input, ENT_QUOTES, 'UTF-8');

        // Convert non-breaking spaces to actual spaces. After decoding, a
        // non-breaking space is the UTF-8 byte sequence C2 A0; the literal
        // entity forms are covered too in case the input was double-encoded.
        $input = str_replace(['&nbsp;', '&#160;', "\xC2\xA0"], ' ', $input);

        // Add a space before both the OPEN and CLOSE tags of BLOCK and LINE BREAKING elements,
        // e.g. 'all<h1><em>m</em>obile List</h1>' will become 'all mobile List'
        $spaced = preg_replace(
            '/(<|<\/)(' .
            'address|article|aside|blockquote|br|canvas|dd|div|dl|dt|' .
            'fieldset|figcaption|figure|footer|form|h1|h2|h3|h4|h5|h6|header|hgroup|hr|li|' .
            'main|nav|noscript|ol|output|p|pre|section|table|tfoot|ul|video' .
            ')\b/i',
            ' $1$2',
            $input
        );

        // preg_replace() returns null on a PCRE error; keep the text in that case.
        if ($spaced !== null) {
            $input = $spaced;
        }

        // Strip HTML tags.
        $input = strip_tags($input);

        return parent::parse($input);
    }

    /**
     * Method to process HTML input and extract the plain text.
     *
     * Collapses every run of white space into a single space. Always returns
     * a string, even when the input is not valid UTF-8.
     *
     * @param   string  $input  The input to process.
     *
     * @return  string  The plain text input.
     *
     * @since   2.5
     */
    protected function process($input)
    {
        // Replace any amount of white space with a single space.
        $collapsed = preg_replace('#\s+#u', ' ', $input);

        if ($collapsed === null) {
            // The /u modifier rejects malformed UTF-8 (for instance a multi-byte
            // character cut in half). Retry byte-wise: ASCII white space bytes
            // never occur inside a multi-byte UTF-8 sequence, so this is safe.
            $collapsed = preg_replace('#\s+#', ' ', $input);
        }

        // Last resort: hand the text back unchanged rather than returning null.
        return $collapsed === null ? (string) $input : $collapsed;
    }

    /**
     * Method to remove blocks of text between a start and an end tag.
     * Each block removed is effectively replaced by a single space.
     *
     * Tags are matched case-insensitively. If a start tag has no matching end
     * tag, scanning stops there and the rest of the string is kept (a single
     * space is still inserted in front of that unmatched start tag).
     *
     * Note: The start tag and the end tag must be different.
     * Note: Blocks must not be nested.
     * Note: All offsets are byte offsets used consistently, so multi-byte
     *       strings are handled correctly.
     *
     * @param   string  $input     String to be processed.
     * @param   string  $startTag  String representing the start tag.
     * @param   string  $endTag    String representing the end tag.
     *
     * @return  string  with blocks removed.
     *
     * @since   3.4
     */
    private function removeBlocks($input, $startTag, $endTag)
    {
        // Find the first start tag.
        $start = stripos($input, $startTag);

        // If no start tags were found, return the string unchanged.
        if ($start === false) {
            return $input;
        }

        $return         = '';
        $offset         = 0;
        $startTagLength = strlen($startTag);
        $endTagLength   = strlen($endTag);

        // Look for all blocks defined by the start and end tags.
        while ($start !== false) {
            // Accumulate the text in front of the start tag, followed by the
            // space that stands in for the removed block.
            $return .= substr($input, $offset, $start - $offset) . ' ';

            // Look for an end tag corresponding to the start tag.
            $end = stripos($input, $endTag, $start + $startTagLength);

            // If no corresponding end tag, leave the rest of the string alone.
            if ($end === false) {
                // Resume copying at the start tag so nothing is duplicated or lost.
                $offset = $start;
                break;
            }

            // Continue right behind the end tag.
            $offset = $end + $endTagLength;

            // Look for the next start tag and loop.
            $start = stripos($input, $startTag, $offset);
        }

        // Add in the final substring after the last end tag.
        $return .= substr($input, $offset);

        return $return;
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Wed Sep 7 05:41:13 2022 | Chilli.vc Blog - For Webmaster,Blog-Writer,System Admin and Domainer |