* @license GNU General Public License version 2 or later; see LICENSE.txt
 */

namespace Joomla\Component\Finder\Administrator\Indexer;

use Exception;
use Joomla\CMS\Component\ComponentHelper;
use Joomla\CMS\Factory;
use Joomla\CMS\Language\Multilanguage;
use Joomla\CMS\Plugin\PluginHelper;
use Joomla\CMS\Table\Table;
use Joomla\Registry\Registry;
use Joomla\String\StringHelper;

// phpcs:disable PSR1.Files.SideEffects
\defined('_JEXEC') or die;
// phpcs:enable PSR1.Files.SideEffects

/**
 * Helper class for the Finder indexer package.
 *
 * All methods are static. Several of them memoise configuration lookups and
 * database results in function-level static variables, so those values are
 * resolved once per request.
 *
 * @since  2.5
 */
class Helper
{
    /**
     * Method to parse input into plain text.
     *
     * Delegates to the Parser instance registered for the given format.
     *
     * @param   string  $input   The raw input.
     * @param   string  $format  The format of the input. [optional]
     *
     * @return  string  The parsed input.
     *
     * @since   2.5
     * @throws  Exception on invalid parser.
     */
    public static function parse($input, $format = 'html')
    {
        // Get a parser for the specified format and parse the input.
        return Parser::getInstance($format)->parse($input);
    }

    /**
     * Method to tokenize a text string.
     *
     * Single-term tokens and derived multi-term tokens are cached per language
     * in a static array so repeated terms reuse the same Token object. The
     * cache is trimmed to 1024 entries per language at the end of each call.
     *
     * @param   string   $input   The input to tokenize.
     * @param   string   $lang    The language of the input.
     * @param   boolean  $phrase  Flag to indicate whether input could be a phrase. [optional]
     *
     * @return  Token[]  An array of Token objects.
     *
     * @since   2.5
     */
    public static function tokenize($input, $lang, $phrase = false)
    {
        static $cache = [], $tuplecount;
        static $multilingual;
        static $defaultLanguage;

        // Read the maximum phrase length (in words) from the component configuration once.
        if (!$tuplecount) {
            $params     = ComponentHelper::getParams('com_finder');
            $tuplecount = $params->get('tuplecount', 1);
        }

        // Resolve the multilanguage flag and the default language object once.
        if (is_null($multilingual)) {
            $multilingual = Multilanguage::isEnabled();
            $config       = ComponentHelper::getParams('com_finder');

            if ($config->get('language_default', '') == '') {
                $defaultLang = '*';
            } elseif ($config->get('language_default', '') == '-1') {
                $defaultLang = self::getDefaultLanguage();
            } else {
                $defaultLang = $config->get('language_default');
            }

            /*
             * The default language always has the language code '*'.
             * In order to not overwrite the language code of the language
             * object that we are using, we are cloning it here.
             */
            $obj                       = Language::getInstance($defaultLang);
            $defaultLanguage           = clone $obj;
            $defaultLanguage->language = '*';
        }

        if (!$multilingual || $lang == '*') {
            $language = $defaultLanguage;
        } else {
            $language = Language::getInstance($lang);
        }

        if (!isset($cache[$lang])) {
            $cache[$lang] = [];
        }

        $tokens = [];
        $terms  = $language->tokenise($input);

        // @todo: array_filter removes any number 0's from the terms. Not sure this is entirely intended
        $terms = array_filter($terms);
        $terms = array_values($terms);

        /*
         * If we have to handle the input as a phrase, that means we don't
         * tokenize the individual terms and we do not create the two and three
         * term combinations. The phrase must contain more than one word!
         */
        if ($phrase === true && count($terms) > 1) {
            // Create tokens from the phrase.
            $tokens[] = new Token($terms, $language->language, $language->spacer);
        } else {
            // Create tokens from the terms.
            for ($i = 0, $n = count($terms); $i < $n; $i++) {
                if (isset($cache[$lang][$terms[$i]])) {
                    $tokens[] = $cache[$lang][$terms[$i]];
                } else {
                    $token                    = new Token($terms[$i], $language->language);
                    $tokens[]                 = $token;
                    $cache[$lang][$terms[$i]] = $token;
                }
            }

            // Create multi-word phrase tokens from the individual words.
            if ($tuplecount > 1) {
                // $n is fixed before the loop, so only the single-word tokens are used as phrase starts.
                for ($i = 0, $n = count($tokens); $i < $n; $i++) {
                    $temp = [$tokens[$i]->term];

                    // Create tokens for 2 to $tuplecount length phrases
                    for ($j = 1; $j < $tuplecount; $j++) {
                        if ($i + $j >= $n || !isset($tokens[$i + $j])) {
                            break;
                        }

                        $temp[] = $tokens[$i + $j]->term;
                        $key    = implode('::', $temp);

                        if (isset($cache[$lang][$key])) {
                            $tokens[] = $cache[$lang][$key];
                        } else {
                            $token              = new Token($temp, $language->language, $language->spacer);
                            $token->derived     = true;
                            $tokens[]           = $token;
                            $cache[$lang][$key] = $token;
                        }
                    }
                }
            }
        }

        // Prevent the cache to fill up the memory
        while (count($cache[$lang]) > 1024) {
            /**
             * We want to cache the most common words/tokens. At the same time
             * we don't want to cache too much. The most common words will also
             * be early in the text, so we are dropping all terms/tokens which
             * have been cached later.
             */
            array_pop($cache[$lang]);
        }

        return $tokens;
    }

    /**
     * Method to get the base word of a token.
     *
     * Uses the stemmer of the configured default language when the site is not
     * multilingual or when the language is '*'; otherwise the stemmer of $lang.
     *
     * @param   string  $token  The token to stem.
     * @param   string  $lang   The language of the token.
     *
     * @return  string  The root token.
     *
     * @since   2.5
     */
    public static function stem($token, $lang)
    {
        static $multilingual;
        static $defaultStemmer;

        // Resolve the multilanguage flag and the default stemmer once.
        if (is_null($multilingual)) {
            $multilingual = Multilanguage::isEnabled();
            $config       = ComponentHelper::getParams('com_finder');

            if ($config->get('language_default', '') == '') {
                $defaultStemmer = Language::getInstance('*');
            } elseif ($config->get('language_default', '') == '-1') {
                $defaultStemmer = Language::getInstance(self::getDefaultLanguage());
            } else {
                $defaultStemmer = Language::getInstance($config->get('language_default'));
            }
        }

        if (!$multilingual || $lang == '*') {
            $language = $defaultStemmer;
        } else {
            $language = Language::getInstance($lang);
        }

        return $language->stem($token);
    }

    /**
     * Method to add a content type to the database.
     *
     * Known types are loaded once and cached, keyed by title. If the title is
     * already known its id is returned; otherwise a new row is inserted and
     * added to the cache so later calls with the same title do not insert it
     * a second time.
     *
     * @param   string  $title  The type of content. For example: PDF
     * @param   string  $mime   The mime type of the content. For example: PDF [optional]
     *
     * @return  integer  The id of the content type.
     *
     * @since   2.5
     * @throws  Exception on database error.
     */
    public static function addContentType($title, $mime = null)
    {
        static $types;

        $db    = Factory::getDbo();
        $query = $db->getQuery(true);

        // Check if the types are loaded.
        if (empty($types)) {
            // Build the query to get the types.
            $query->select('*')
                ->from($db->quoteName('#__finder_types'));

            // Get the types.
            $db->setQuery($query);
            $types = $db->loadObjectList('title');
        }

        // Check if the type already exists.
        if (isset($types[$title])) {
            return (int) $types[$title]->id;
        }

        // Add the type.
        $query->clear()
            ->insert($db->quoteName('#__finder_types'))
            ->columns([$db->quoteName('title'), $db->quoteName('mime')])
            ->values($db->quote($title) . ', ' . $db->quote($mime));
        $db->setQuery($query);
        $db->execute();

        // Remember the new type, otherwise the next call with this title would insert it again.
        $type        = new \stdClass();
        $type->title = $title;
        $type->mime  = $mime;
        $type->id    = (int) $db->insertid();

        $types[$title] = $type;

        // Return the new id.
        return $type->id;
    }

    /**
     * Method to check if a token is common in a language.
     *
     * The list of common terms is loaded per language on first use and kept
     * in a static cache.
     *
     * @param   string  $token  The token to test.
     * @param   string  $lang   The language to reference.
     *
     * @return  boolean  True if common, false otherwise.
     *
     * @since   2.5
     */
    public static function isCommon($token, $lang)
    {
        static $data, $default, $multilingual;

        // Resolve the multilanguage flag and the default language key once.
        if (is_null($multilingual)) {
            $multilingual = Multilanguage::isEnabled();
            $config       = ComponentHelper::getParams('com_finder');

            if ($config->get('language_default', '') == '') {
                $default = '*';
            } elseif ($config->get('language_default', '') == '-1') {
                $default = self::getPrimaryLanguage(self::getDefaultLanguage());
            } else {
                $default = self::getPrimaryLanguage($config->get('language_default'));
            }
        }

        if (!$multilingual || $lang == '*') {
            $lang = $default;
        }

        // Load the common tokens for the language if necessary.
        if (!isset($data[$lang])) {
            $data[$lang] = self::getCommonWords($lang);
        }

        // Check if the token is in the common array.
        return in_array($token, $data[$lang], true);
    }

    /**
     * Method to get an array of common terms for a language.
     *
     * @param   string  $lang  The language to use.
     *
     * @return  array  Array of common terms.
     *
     * @since   2.5
     * @throws  Exception on database error.
     */
    public static function getCommonWords($lang)
    {
        $db = Factory::getDbo();

        // Create the query to load all the common terms for the language.
        $query = $db->getQuery(true)
            ->select($db->quoteName('term'))
            ->from($db->quoteName('#__finder_terms_common'))
            ->where($db->quoteName('language') . ' = ' . $db->quote($lang));

        // Load all of the common terms for the language.
        $db->setQuery($query);

        return $db->loadColumn();
    }

    /**
     * Method to get the default language for the site.
     *
     * The value is read from the com_languages 'site' parameter (falling back
     * to 'en-GB') and cached for subsequent calls.
     *
     * @return  string  The default language string.
     *
     * @since   2.5
     */
    public static function getDefaultLanguage()
    {
        static $lang;

        // We need to go to com_languages to get the site default language, it's the best we can guess.
        if (empty($lang)) {
            $lang = ComponentHelper::getParams('com_languages')->get('site', 'en-GB');
        }

        return $lang;
    }

    /**
     * Method to parse a language/locale key and return a simple language string.
     *
     * Results are cached per input key. When the Locale class is not available
     * the part before the first hyphen is used; a key without a hyphen is
     * returned unchanged.
     *
     * @param   string  $lang  The language/locale key. For example: en-GB
     *
     * @return  string  The simple language string. For example: en
     *
     * @since   2.5
     */
    public static function getPrimaryLanguage($lang)
    {
        static $data;

        // Only parse the identifier if necessary.
        if (!isset($data[$lang])) {
            if (is_callable(['Locale', 'getPrimaryLanguage'])) {
                // Get the language key using the Locale package.
                $data[$lang] = \Locale::getPrimaryLanguage($lang);
            } else {
                // Get the language key using string position.
                $separator = StringHelper::strpos($lang, '-');

                // Without a hyphen there is no region part to strip, so keep the key as it is
                // instead of cutting it down to an empty string.
                $data[$lang] = $separator === false
                    ? $lang
                    : StringHelper::substr($lang, 0, $separator);
            }
        }

        return $data[$lang];
    }

    /**
     * Method to get extra data for a content before being indexed. This is how
     * we add Comments, Tags, Labels, etc. that should be available to Finder.
     *
     * Imports the 'finder' plugin group and triggers the
     * onPrepareFinderContent event with the item passed by reference.
     *
     * @param   Result  $item  The item to index as a Result object.
     *
     * @return  boolean  Always true; failures are not reported through the return value.
     *
     * @since   2.5
     * @throws  Exception on database error.
     */
    public static function getContentExtras(Result $item)
    {
        // Load the finder plugin group.
        PluginHelper::importPlugin('finder');

        Factory::getApplication()->triggerEvent('onPrepareFinderContent', [&$item]);

        return true;
    }

    /**
     * Method to process content text using the onContentPrepare event trigger.
     *
     * A content table object is used as a stand-in for a real article: the
     * text (and, when given, the item's properties and elements) are bound to
     * it before the event is fired with the 'com_finder.indexer' context.
     *
     * @param   string    $text    The content to process.
     * @param   Registry  $params  The parameters object. [optional]
     * @param   Result    $item    The item which get prepared. [optional]
     *
     * @return  string  The processed content.
     *
     * @since   2.5
     */
    public static function prepareContent($text, $params = null, ?Result $item = null)
    {
        static $loaded;

        // Load the content plugins if necessary.
        if (empty($loaded)) {
            PluginHelper::importPlugin('content');
            $loaded = true;
        }

        // Instantiate the parameter object if necessary.
        if (!($params instanceof Registry)) {
            $registry = new Registry($params);
            $params   = $registry;
        }

        // Create a mock content object.
        $content       = Table::getInstance('Content');
        $content->text = $text;

        if ($item) {
            $content->bind((array) $item);
            $content->bind($item->getElements());
        }

        if ($item && !empty($item->context)) {
            $content->context = $item->context;
        }

        // Fire the onContentPrepare event.
        Factory::getApplication()->triggerEvent('onContentPrepare', ['com_finder.indexer', &$content, &$params, 0]);

        return $content->text;
    }
}