PHPXRef 0.7.1 : Joomla 4.2.2 documentation : /administrator/components/com_finder/src/Indexer/Parser/Html.php source

[Summary view] [Print] [Text view]
   1  <?php
   2  
   3  /**
   4   * @package     Joomla.Administrator
   5   * @subpackage  com_finder
   6   *
   7   * @copyright   (C) 2011 Open Source Matters, Inc. <https://www.joomla.org>
   8   * @license     GNU General Public License version 2 or later; see LICENSE.txt
   9   */
  10  
  11  namespace Joomla\Component\Finder\Administrator\Indexer\Parser;
  12  
  13  use Joomla\Component\Finder\Administrator\Indexer\Parser;
  14  
  15  // phpcs:disable PSR1.Files.SideEffects
  16  \defined('_JEXEC') or die;
  17  // phpcs:enable PSR1.Files.SideEffects
  18  
  19  /**
  20   * HTML Parser class for the Finder indexer package.
  21   *
  22   * @since  2.5
  23   */
  24  class Html extends Parser
  25  {
  26      /**
  27       * Method to parse input and extract the plain text. Because this method is
  28       * called from both inside and outside the indexer, it needs to be able to
  29       * batch out its parsing functionality to deal with the inefficiencies of
  30       * regular expressions. We will parse recursively in 2KB chunks.
  31       *
  32       * @param   string  $input  The input to parse.
  33       *
  34       * @return  string  The plain text input.
  35       *
  36       * @since   2.5
  37       */
  38      public function parse($input)
  39      {
  40          // Strip invalid UTF-8 characters.
  41          $oldSetting = ini_get('mbstring.substitute_character');
  42          ini_set('mbstring.substitute_character', 'none');
  43          $input = mb_convert_encoding($input, 'UTF-8', 'UTF-8');
  44          ini_set('mbstring.substitute_character', $oldSetting);
  45  
  46          // Remove anything between <head> and </head> tags.  Do this first
  47          // because there might be <script> or <style> tags nested inside.
  48          $input = $this->removeBlocks($input, '<head>', '</head>');
  49  
  50          // Convert <style> and <noscript> tags to <script> tags
  51          // so we can remove them efficiently.
  52          $search = array(
  53              '<style', '</style',
  54              '<noscript', '</noscript',
  55          );
  56          $replace = array(
  57              '<script', '</script',
  58              '<script', '</script',
  59          );
  60          $input = str_replace($search, $replace, $input);
  61  
  62          // Strip all script blocks.
  63          $input = $this->removeBlocks($input, '<script', '</script>');
  64  
  65          // Decode HTML entities.
  66          $input = html_entity_decode($input, ENT_QUOTES, 'UTF-8');
  67  
  68          // Convert entities equivalent to spaces to actual spaces.
  69          $input = str_replace(array('&nbsp;', '&#160;'), ' ', $input);
  70  
  71          // Add a space before both the OPEN and CLOSE tags of BLOCK and LINE BREAKING elements,
  72          // e.g. 'all<h1><em>m</em>obile  List</h1>' will become 'all mobile  List'
  73          $input = preg_replace('/(<|<\/)(' .
  74              'address|article|aside|blockquote|br|canvas|dd|div|dl|dt|' .
  75              'fieldset|figcaption|figure|footer|form|h1|h2|h3|h4|h5|h6|header|hgroup|hr|li|' .
  76              'main|nav|noscript|ol|output|p|pre|section|table|tfoot|ul|video' .
  77              ')\b/i', ' $1$2', $input);
  78  
  79          // Strip HTML tags.
  80          $input = strip_tags($input);
  81  
  82          return parent::parse($input);
  83      }
  84  
  85      /**
  86       * Method to process HTML input and extract the plain text.
  87       *
  88       * @param   string  $input  The input to process.
  89       *
  90       * @return  string  The plain text input.
  91       *
  92       * @since   2.5
  93       */
  94      protected function process($input)
  95      {
  96          // Replace any amount of white space with a single space.
  97          return preg_replace('#\s+#u', ' ', $input);
  98      }
  99  
 100      /**
 101       * Method to remove blocks of text between a start and an end tag.
 102       * Each block removed is effectively replaced by a single space.
 103       *
 104       * Note: The start tag and the end tag must be different.
 105       * Note: Blocks must not be nested.
 106       * Note: This method will function correctly with multi-byte strings.
 107       *
 108       * @param   string  $input     String to be processed.
 109       * @param   string  $startTag  String representing the start tag.
 110       * @param   string  $endTag    String representing the end tag.
 111       *
 112       * @return  string with blocks removed.
 113       *
 114       * @since   3.4
 115       */
 116      private function removeBlocks($input, $startTag, $endTag)
 117      {
 118          $return = '';
 119          $offset = 0;
 120          $startTagLength = strlen($startTag);
 121          $endTagLength = strlen($endTag);
 122  
 123          // Find the first start tag.
 124          $start = stripos($input, $startTag);
 125  
 126          // If no start tags were found, return the string unchanged.
 127          if ($start === false) {
 128              return $input;
 129          }
 130  
 131          // Look for all blocks defined by the start and end tags.
 132          while ($start !== false) {
 133              // Accumulate the substring up to the start tag.
 134              $return .= substr($input, $offset, $start - $offset) . ' ';
 135  
 136              // Look for an end tag corresponding to the start tag.
 137              $end = stripos($input, $endTag, $start + $startTagLength);
 138  
 139              // If no corresponding end tag, leave the string alone.
 140              if ($end === false) {
 141                  // Fix the offset so part of the string is not duplicated.
 142                  $offset = $start;
 143                  break;
 144              }
 145  
 146              // Advance the start position.
 147              $offset = $end + $endTagLength;
 148  
 149              // Look for the next start tag and loop.
 150              $start = stripos($input, $startTag, $offset);
 151          }
 152  
 153          // Add in the final substring after the last end tag.
 154          $return .= substr($input, $offset);
 155  
 156          return $return;
 157      }
 158  }
PHP Cross Reference of Joomla 4.2.2 documentation

/administrator/components/com_finder/src/Indexer/Parser/ -> Html.php (source)