PHPXRef 0.7.1 : Joomla 4.2.2 documentation : /libraries/vendor/wamania/php-stemmer/src/Stemmer/Italian.php source

[Summary view] [Print] [Text view]
   1  <?php
   2  
   3  namespace Wamania\Snowball\Stemmer;
   4  
   5  use voku\helper\UTF8;
   6  
   7  /**
   8   *
   9   * @link http://snowball.tartarus.org/algorithms/italian/stemmer.html
  10   * @author wamania
  11   *
  12   */
  class Italian extends Stem
  {
      /**
       * All Italian vowels.
       *
       * Only the grave-accented forms are listed: stem() rewrites acute accents
       * to grave accents before this list is used to build regex character classes.
       */
      protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'à', 'è', 'ì', 'ò', 'ù');

      /**
       * {@inheritdoc}
       *
       * Applies the Snowball Italian pipeline to a single word: normalisation
       * (lower-casing, acute -> grave accents, vowel marking), region computation,
       * then steps 0, 1, 2 (only when step 1 changed nothing), 3a, 3b and the
       * final un-marking of I/U.
       *
       * @param string $word word to stem, encoded in UTF-8
       *
       * @return string the stemmed word
       *
       * @throws \Exception when $word is not valid UTF-8
       */
      public function stem($word)
      {
          // we do ALL in UTF-8
          if (!UTF8::is_utf8($word)) {
              throw new \Exception('Word must be in UTF-8');
          }

          $this->plainVowels = implode('', self::$vowels);

          $this->word = UTF8::strtolower($word);

          // First, replace all acute accents by grave accents.
          $this->word = UTF8::str_replace(array('á', 'é', 'í', 'ó', 'ú'), array('à', 'è', 'ì', 'ò', 'ù'), $this->word);

          // And, as in French, put u after q, and u, i between vowels into upper case,
          // so that these marked letters are not treated as vowels by the steps below.
          // finish() turns them back into lower case.
          $vowelClass = '[' . $this->plainVowels . ']';
          $this->word = preg_replace('#([q])u#u', '$1U', $this->word);
          $this->word = preg_replace('#(' . $vowelClass . ')u(' . $vowelClass . ')#u', '$1U$2', $this->word);
          $this->word = preg_replace('#(' . $vowelClass . ')i(' . $vowelClass . ')#u', '$1I$2', $this->word);

          // Region helpers are inherited; they are called once, on the marked word,
          // and are not recomputed after the word is shortened.
          $this->rv();
          $this->r1();
          $this->r2();

          $this->step0();

          // Snapshot taken after step 0 so that only a change made by step 1 is detected.
          // A dedicated local is used instead of overwriting the $word parameter.
          $beforeStep1 = $this->word;
          $this->step1();

          // Do step 2 if no ending was removed by step 1.
          // Strict comparison: loose == would compare numeric-looking strings as numbers.
          if ($beforeStep1 === $this->word) {
              $this->step2();
          }

          $this->step3a();
          $this->step3b();
          $this->finish();

          return $this->word;
      }

      /**
       * Step 0: Attached pronoun
       *
       * Looks for an attached pronoun at the end of the word. When the pronoun
       * follows "ando"/"endo" (in RV) it is deleted; when it follows
       * "ar"/"er"/"ir" (in RV) it is replaced by "e".
       *
       * NOTE(review): longer pronouns are listed before the shorter ones they end
       * with, presumably because the inherited search() returns the first listed
       * match - confirm against Stem::search() before reordering.
       *
       * @return bool true when a pronoun suffix was found (even if the word was
       *              left unchanged), false otherwise
       */
      private function step0()
      {
          // Search for the longest among the following suffixes
          $position = $this->search(array(
              'gliela', 'gliele', 'glieli', 'glielo', 'gliene',
              'sene', 'mela', 'mele', 'meli', 'melo', 'mene', 'tela', 'tele', 'teli', 'telo', 'tene', 'cela',
              'cele', 'celi', 'celo', 'cene', 'vela', 'vele', 'veli', 'velo', 'vene',
              'gli', 'la', 'le', 'li', 'lo', 'mi', 'ne', 'si', 'ti', 'vi', 'ci'));

          if ($position === false) {
              return false;
          }

          $suffixe = UTF8::substr($this->word, $position);

          // The pronoun must follow one of these endings, and the whole
          // ending + pronoun must lie in RV.
          $gerundEndings = array('ando' . $suffixe, 'endo' . $suffixe);
          $infinitiveEndings = array('ar' . $suffixe, 'er' . $suffixe, 'ir' . $suffixe);

          if ($this->searchIfInRv($gerundEndings) !== false) {
              // (a) the pronoun is deleted
              $this->word = UTF8::substr($this->word, 0, $position);
          } elseif ($this->searchIfInRv($infinitiveEndings) !== false) {
              // (b) the pronoun is replaced by e.
              // The two cases cannot both apply (no pronoun ends in "do"), hence elseif.
              // $position already marks where the pronoun starts, so no regex built
              // from the suffix is needed.
              $this->word = UTF8::substr($this->word, 0, $position) . 'e';
          }

          return true;
      }

      /**
       * Step 1: Standard suffix removal
       *
       * Tries each suffix group in turn; the first group whose suffix ends the
       * word is handled and the method returns, whether or not the region
       * condition allowed a change. stem() detects an actual change by comparing
       * the word before and after this call.
       *
       * @return bool true when one of the suffixes ends the word, false otherwise
       */
      private function step1()
      {
          // amente
          //      delete if in R1
          //      if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
          //      if preceded by os, ic or abil, delete if in R2
          if (($position = $this->search(array('amente'))) !== false) {
              if ($this->inR1($position)) {
                  $this->word = UTF8::substr($this->word, 0, $position);
              }

              // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
              if (($position2 = $this->searchIfInR2(array('iv'))) !== false) {
                  $this->word = UTF8::substr($this->word, 0, $position2);
                  if (($position3 = $this->searchIfInR2(array('at'))) !== false) {
                      $this->word = UTF8::substr($this->word, 0, $position3);
                  }

                  // if preceded by os, ic or abil, delete if in R2
              } elseif (($position4 = $this->searchIfInR2(array('os', 'ic', 'abil'))) !== false) {
                  $this->word = UTF8::substr($this->word, 0, $position4);
              }

              return true;
          }

          // delete if in R2
          if (($position = $this->search(array(
              'ibili', 'atrice', 'abili', 'abile', 'ibile', 'atrici', 'mente',
              'anza', 'anze', 'iche', 'ichi', 'ismo', 'ismi', 'ista', 'iste', 'isti', 'istà', 'istè', 'istì', 'ante', 'anti',
              'ico', 'ici', 'ica', 'ice', 'oso', 'osi', 'osa', 'ose'
          ))) !== false) {
              if ($this->inR2($position)) {
                  $this->word = UTF8::substr($this->word, 0, $position);
              }

              return true;
          }

          // azione   azioni   atore   atori
          //      delete if in R2
          //      if preceded by ic, delete if in R2
          if (($position = $this->search(array('azione', 'azioni', 'atore', 'atori'))) !== false) {
              if ($this->inR2($position)) {
                  $this->word = UTF8::substr($this->word, 0, $position);

                  if (($position2 = $this->search(array('ic'))) !== false) {
                      if ($this->inR2($position2)) {
                          $this->word = UTF8::substr($this->word, 0, $position2);
                      }
                  }
              }

              return true;
          }

          // logia   logie
          //      replace with log if in R2
          if (($position = $this->search(array('logia', 'logie'))) !== false) {
              if ($this->inR2($position)) {
                  // the suffix starts at $position and runs to the end of the word
                  $this->word = UTF8::substr($this->word, 0, $position) . 'log';
              }

              return true;
          }

          // uzione   uzioni   usione   usioni
          //      replace with u if in R2
          if (($position = $this->search(array('uzione', 'uzioni', 'usione', 'usioni'))) !== false) {
              if ($this->inR2($position)) {
                  $this->word = UTF8::substr($this->word, 0, $position) . 'u';
              }

              return true;
          }

          // enza   enze
          //      replace with ente if in R2
          if (($position = $this->search(array('enza', 'enze'))) !== false) {
              if ($this->inR2($position)) {
                  $this->word = UTF8::substr($this->word, 0, $position) . 'ente';
              }

              return true;
          }

          // amento   amenti   imento   imenti
          //      delete if in RV
          if (($position = $this->search(array('amento', 'amenti', 'imento', 'imenti'))) !== false) {
              if ($this->inRv($position)) {
                  $this->word = UTF8::substr($this->word, 0, $position);
              }

              return true;
          }

          // ità
          //      delete if in R2
          //      if preceded by abil, ic or iv, delete if in R2
          if (($position = $this->search(array('ità'))) !== false) {
              if ($this->inR2($position)) {
                  $this->word = UTF8::substr($this->word, 0, $position);
              }

              if (($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) !== false) {
                  $this->word = UTF8::substr($this->word, 0, $position2);
              }

              return true;
          }

          // ivo   ivi   iva   ive
          //      delete if in R2
          //      if preceded by at, delete if in R2 (and if further preceded by ic, delete if in R2)
          if (($position = $this->search(array('ivo', 'ivi', 'iva', 'ive'))) !== false) {
              if ($this->inR2($position)) {
                  $this->word = UTF8::substr($this->word, 0, $position);
              }

              if (($position2 = $this->searchIfInR2(array('at'))) !== false) {
                  $this->word = UTF8::substr($this->word, 0, $position2);
                  if (($position3 = $this->searchIfInR2(array('ic'))) !== false) {
                      $this->word = UTF8::substr($this->word, 0, $position3);
                  }
              }

              return true;
          }

          return false;
      }

      /**
       * Step 2: Verb suffixes
       * Search for the longest among the following suffixes in RV, and if found, delete.
       *
       * NOTE(review): the list is ordered so that a suffix always appears before
       * any shorter suffix it ends with (e.g. 'avano' before 'ano'); keep that
       * order when editing - presumably the inherited search returns the first
       * listed match.
       *
       * @return void
       */
      private function step2()
      {
          $position = $this->searchIfInRv(array(
              'assimo', 'assero', 'eranno', 'erebbero', 'erebbe', 'eremmo', 'ereste', 'eresti', 'essero', 'iranno', 'irebbero', 'irebbe', 'iremmo',
              'iscano', 'ireste', 'iresti', 'iscono', 'issero',
              'avamo', 'arono', 'avano', 'avate', 'eremo', 'erete', 'erono', 'evamo', 'evano', 'evate', 'ivamo', 'ivano', 'ivate', 'iremo', 'irete', 'irono',
              'ammo', 'ando', 'asse', 'assi', 'emmo', 'enda', 'ende', 'endi', 'endo', 'erai', 'erei', 'Yamo', 'iamo', 'immo', 'irà', 'irai', 'irei',
              'isca', 'isce', 'isci', 'isco',
              'ano', 'are', 'ata', 'ate', 'ati', 'ato', 'ava', 'avi', 'avo', 'erà', 'ere', 'erò', 'ete', 'eva',
              'evi', 'evo', 'ire', 'ita', 'ite', 'iti', 'ito', 'iva', 'ivi', 'ivo', 'ono', 'uta', 'ute', 'uti', 'uto', 'irò', 'ar', 'ir'));

          if ($position !== false) {
              $this->word = UTF8::substr($this->word, 0, $position);
          }
      }

      /**
       * Step 3a
       * Delete a final a, e, i, o, à, è, ì or ò if it is in RV, and a preceding i if it is in RV
       *
       * @return bool true when a final vowel was removed, false otherwise
       */
      private function step3a()
      {
          if ($this->searchIfInRv(array('a', 'e', 'i', 'o', 'à', 'è', 'ì', 'ò')) === false) {
              return false;
          }

          // drop the final vowel (one character)
          $this->word = UTF8::substr($this->word, 0, -1);

          // then drop an i that has become final, if it is in RV too
          if ($this->searchIfInRv(array('i')) !== false) {
              $this->word = UTF8::substr($this->word, 0, -1);
          }

          return true;
      }

      /**
       * Step 3b
       * Replace final ch (or gh) with c (or g) if in RV (crocch -> crocc)
       *
       * @return void
       */
      private function step3b()
      {
          // Both replacements amount to dropping the final h, so no regex is needed.
          if ($this->searchIfInRv(array('ch', 'gh')) !== false) {
              $this->word = UTF8::substr($this->word, 0, -1);
          }
      }

      /**
       * Finally
       * turn I and U back into lower case
       *
       * Undoes the vowel marking applied at the start of stem().
       *
       * @return void
       */
      private function finish()
      {
          $this->word = UTF8::str_replace(array('I', 'U'), array('i', 'u'), $this->word);
      }
  }
PHP Cross Reference of Joomla 4.2.2 documentation

/libraries/vendor/wamania/php-stemmer/src/Stemmer/ -> Italian.php (source)