PHPXRef 0.7.1 : Joomla 4.2.2 documentation : /libraries/vendor/wamania/php-stemmer/src/Stemmer/English.php source

[Summary view] [Print] [Text view]
   1  <?php
   2  
   3  namespace Wamania\Snowball\Stemmer;
   4  
   5  use voku\helper\UTF8;
   6  
   7  /**
   8   * English Porter 2
   9   *
  10   * @link http://snowball.tartarus.org/algorithms/english/stemmer.html
  11   * @author wamania
  12   *
  13   */
  14  class English extends Stem
  15  {
  16      /**
  17       * All english vowels
  18       */
  19      protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y');
  20  
  21      protected static $doubles = array('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt');
  22  
  23      protected static $liEnding = array('c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't');
  24  
  25      /**
  26       * {@inheritdoc}
  27       */
  28      public function stem($word)
  29      {
  30          // we do ALL in UTF-8
  31          if (!UTF8::is_utf8($word)) {
  32              throw new \Exception('Word must be in UTF-8');
  33          }
  34  
  35          if (Utf8::strlen($word) < 3) {
  36              return $word;
  37          }
  38  
  39          $this->word = UTF8::strtolower($word);
  40  
  41          // exceptions
  42          if (null !== ($word = $this->exception1())) {
  43              return $word;
  44          }
  45  
  46  
  47          $this->plainVowels = implode('', self::$vowels);
  48  
  49          // Remove initial ', if present.
  50          $first = UTF8::substr($this->word, 0, 1);
  51          if ($first == "'") {
  52              $this->word = UTF8::substr($this->word, 1);
  53          }
  54  
  55          // Set initial y, or y after a vowel, to Y
  56          if ($first == 'y') {
  57              $this->word = preg_replace('#^y#u', 'Y', $this->word);
  58          }
  59          $this->word = preg_replace('#(['.$this->plainVowels.'])y#u', '$1Y', $this->word);
  60  
  61          $this->r1();
  62          $this->exceptionR1();
  63          $this->r2();
  64  
  65          $this->step0();
  66          $this->step1a();
  67  
  68          // exceptions 2
  69          if (null !== ($word = $this->exception2())) {
  70              return $word;
  71          }
  72  
  73          $this->step1b();
  74          $this->step1c();
  75          $this->step2();
  76          $this->step3();
  77          $this->step4();
  78          $this->step5();
  79          $this->finish();
  80  
  81          return $this->word;
  82      }
  83  
  84      /**
  85       * Step 0
  86       * Remove ', 's, 's'
  87       */
  88      private function step0()
  89      {
  90          if ( ($position = $this->search(array("'s'", "'s", "'"))) !== false) {
  91              $this->word = UTF8::substr($this->word, 0, $position);
  92          }
  93      }
  94  
  95      private function step1a()
  96      {
  97          // sses
  98          //      replace by ss
  99          if ( ($position = $this->search(array('sses'))) !== false) {
 100              $this->word = preg_replace('#(sses)$#u', 'ss', $this->word);
 101              return true;
 102          }
 103  
 104          // ied+   ies*
 105          //      replace by i if preceded by more than one letter, otherwise by ie (so ties -> tie, cries -> cri)
 106          if ( ($position = $this->search(array('ied', 'ies'))) !== false) {
 107              if ($position > 1) {
 108                  $this->word = preg_replace('#(ied|ies)$#u', 'i', $this->word);
 109  
 110              } else {
 111                  $this->word = preg_replace('#(ied|ies)$#u', 'ie', $this->word);
 112              }
 113              return true;
 114          }
 115  
 116          // us+   ss
 117          //  do nothing
 118          if ( ($position = $this->search(array('us', 'ss'))) !== false) {
 119              return true;
 120          }
 121  
 122          // s
 123          //      delete if the preceding word part contains a vowel not immediately before the s (so gas and this retain the s, gaps and kiwis lose it)
 124          if ( ($position = $this->search(array('s'))) !== false) {
 125              for ($i=0; $i<$position-1; $i++) {
 126                  $letter = UTF8::substr($this->word, $i, 1);
 127  
 128                  if (in_array($letter, self::$vowels)) {
 129                      $this->word = UTF8::substr($this->word, 0, $position);
 130                      return true;
 131                  }
 132              }
 133              return true;
 134          }
 135  
 136          return false;
 137      }
 138  
 139      /**
 140       * Step 1b
 141       */
 142      private function step1b()
 143      {
 144          // eed   eedly+
 145          //      replace by ee if in R1
 146          if ( ($position = $this->search(array('eedly', 'eed'))) !== false) {
 147              if ($this->inR1($position)) {
 148                  $this->word = preg_replace('#(eedly|eed)$#u', 'ee', $this->word);
 149              }
 150              return true;
 151          }
 152  
 153          // ed   edly+   ing   ingly+
 154          //      delete if the preceding word part contains a vowel, and after the deletion:
 155          //      if the word ends at, bl or iz add e (so luxuriat -> luxuriate), or
 156          //      if the word ends with a double remove the last letter (so hopp -> hop), or
 157          //      if the word is short, add e (so hop -> hope)
 158          if ( ($position = $this->search(array('edly', 'ingly', 'ed', 'ing'))) !== false) {
 159              for ($i=0; $i<$position; $i++) {
 160                  $letter = UTF8::substr($this->word, $i, 1);
 161  
 162                  if (in_array($letter, self::$vowels)) {
 163                      $this->word = UTF8::substr($this->word, 0, $position);
 164  
 165                      if ($this->search(array('at', 'bl', 'iz')) !== false) {
 166                          $this->word .= 'e';
 167  
 168                      } elseif ( ($position2 = $this->search(self::$doubles)) !== false) {
 169                          $this->word = UTF8::substr($this->word, 0, ($position2+1));
 170  
 171                      } elseif ($this->isShort()) {
 172                          $this->word .= 'e';
 173                      }
 174  
 175                      return true;
 176                  }
 177              }
 178              return true;
 179          }
 180  
 181          return false;
 182      }
 183  
 184      /**
 185       * Step 1c: *
 186       */
 187      private function step1c()
 188      {
 189          // replace suffix y or Y by i if preceded by a non-vowel
 190          // which is not the first letter of the word (so cry -> cri, by -> by, say -> say)
 191          $length = UTF8::strlen($this->word);
 192  
 193          if ($length < 3) {
 194              return true;
 195          }
 196  
 197          if ( ($position = $this->search(array('y', 'Y'))) !== false) {
 198              $before = $position - 1;
 199              $letter = UTF8::substr($this->word, $before, 1);
 200  
 201              if (! in_array($letter, self::$vowels)) {
 202                  $this->word = preg_replace('#(y|Y)$#u', 'i', $this->word);
 203              }
 204  
 205              return true;
 206          }
 207  
 208          return false;
 209      }
 210  
 211      /**
 212       * Step 2
 213       *  Search for the longest among the following suffixes, and, if found and in R1, perform the action indicated.
 214       */
 215      private function step2()
 216      {
 217          // iveness   iviti:   replace by ive
 218          if ( ($position = $this->search(array('iveness', 'iviti'))) !== false) {
 219              if ($this->inR1($position)) {
 220                  $this->word = preg_replace('#(iveness|iviti)$#u', 'ive', $this->word);
 221              }
 222              return true;
 223          }
 224  
 225          // ousli   ousness:   replace by ous
 226          if ( ($position = $this->search(array('ousli', 'ousness'))) !== false) {
 227              if ($this->inR1($position)) {
 228                  $this->word = preg_replace('#(ousli|ousness)$#u', 'ous', $this->word);
 229              }
 230              return true;
 231          }
 232  
 233          // izer   ization:   replace by ize
 234          if ( ($position = $this->search(array('izer', 'ization'))) !== false) {
 235              if ($this->inR1($position)) {
 236                  $this->word = preg_replace('#(izer|ization)$#u', 'ize', $this->word);
 237              }
 238              return true;
 239          }
 240  
 241          // ational   ation   ator:   replace by ate
 242          if ( ($position = $this->search(array('ational', 'ation', 'ator'))) !== false) {
 243              if ($this->inR1($position)) {
 244                  $this->word = preg_replace('#(ational|ation|ator)$#u', 'ate', $this->word);
 245              }
 246              return true;
 247          }
 248  
 249          // biliti   bli+:   replace by ble
 250          if ( ($position = $this->search(array('biliti', 'bli'))) !== false) {
 251              if ($this->inR1($position)) {
 252                  $this->word = preg_replace('#(biliti|bli)$#u', 'ble', $this->word);
 253              }
 254              return true;
 255          }
 256  
 257          // lessli+:   replace by less
 258          if ( ($position = $this->search(array('lessli'))) !== false) {
 259              if ($this->inR1($position)) {
 260                  $this->word = preg_replace('#(lessli)$#u', 'less', $this->word);
 261              }
 262              return true;
 263          }
 264  
 265          // fulness:   replace by ful
 266          if ( ($position = $this->search(array('fulness', 'fulli'))) !== false) {
 267              if ($this->inR1($position)) {
 268                  $this->word = preg_replace('#(fulness|fulli)$#u', 'ful', $this->word);
 269              }
 270              return true;
 271          }
 272  
 273          // tional:   replace by tion
 274          if ( ($position = $this->search(array('tional'))) !== false) {
 275              if ($this->inR1($position)) {
 276                  $this->word = preg_replace('#(tional)$#u', 'tion', $this->word);
 277              }
 278              return true;
 279          }
 280  
 281          // alism   aliti   alli:   replace by al
 282          if ( ($position = $this->search(array('alism', 'aliti', 'alli'))) !== false) {
 283              if ($this->inR1($position)) {
 284                  $this->word = preg_replace('#(alism|aliti|alli)$#u', 'al', $this->word);
 285              }
 286              return true;
 287          }
 288  
 289          // enci:   replace by ence
 290          if ( ($position = $this->search(array('enci'))) !== false) {
 291              if ($this->inR1($position)) {
 292                  $this->word = preg_replace('#(enci)$#u', 'ence', $this->word);
 293              }
 294              return true;
 295          }
 296  
 297          // anci:   replace by ance
 298          if ( ($position = $this->search(array('anci'))) !== false) {
 299              if ($this->inR1($position)) {
 300                  $this->word = preg_replace('#(anci)$#u', 'ance', $this->word);
 301              }
 302              return true;
 303          }
 304  
 305          // abli:   replace by able
 306          if ( ($position = $this->search(array('abli'))) !== false) {
 307              if ($this->inR1($position)) {
 308                  $this->word = preg_replace('#(abli)$#u', 'able', $this->word);
 309              }
 310              return true;
 311          }
 312  
 313          // entli:   replace by ent
 314          if ( ($position = $this->search(array('entli'))) !== false) {
 315              if ($this->inR1($position)) {
 316                  $this->word = preg_replace('#(entli)$#u', 'ent', $this->word);
 317              }
 318              return true;
 319          }
 320  
 321          // ogi+:   replace by og if preceded by l
 322          if ( ($position = $this->search(array('ogi'))) !== false) {
 323  
 324              if ($this->inR1($position)) {
 325                  $before = $position - 1;
 326                  $letter = UTF8::substr($this->word, $before, 1);
 327  
 328                  if ($letter == 'l') {
 329                      $this->word = preg_replace('#(ogi)$#u', 'og', $this->word);
 330                  }
 331              }
 332  
 333              return true;
 334          }
 335  
 336          // li+:   delete if preceded by a valid li-ending
 337          if ( ($position = $this->search(array('li'))) !== false) {
 338  
 339              if ($this->inR1($position)) {
 340                  // a letter for you
 341                  $letter = UTF8::substr($this->word, ($position-1), 1);
 342  
 343                  if (in_array($letter, self::$liEnding)) {
 344                      $this->word = UTF8::substr($this->word, 0, $position);
 345                  }
 346              }
 347  
 348              return true;
 349          }
 350  
 351          return false;
 352      }
 353  
 354      /**
 355       * Step 3:
 356       * Search for the longest among the following suffixes, and, if found and in R1, perform the action indicated.
 357       */
 358      private function step3()
 359      {
 360          // ational+:   replace by ate
 361          if ($this->searchIfInR1(array('ational')) !== false) {
 362              $this->word = preg_replace('#(ational)$#u', 'ate', $this->word);
 363              return true;
 364          }
 365  
 366          // tional+:   replace by tion
 367          if ($this->searchIfInR1(array('tional')) !== false) {
 368              $this->word = preg_replace('#(tional)$#u', 'tion', $this->word);
 369              return true;
 370          }
 371  
 372          // alize:   replace by al
 373          if ($this->searchIfInR1(array('alize')) !== false) {
 374              $this->word = preg_replace('#(alize)$#u', 'al', $this->word);
 375              return true;
 376          }
 377  
 378          // icate   iciti   ical:   replace by ic
 379          if ($this->searchIfInR1(array('icate', 'iciti', 'ical')) !== false) {
 380              $this->word = preg_replace('#(icate|iciti|ical)$#u', 'ic', $this->word);
 381              return true;
 382          }
 383  
 384          // ful   ness:   delete
 385          if ( ($position = $this->searchIfInR1(array('ful', 'ness'))) !== false) {
 386              $this->word = UTF8::substr($this->word, 0, $position);
 387              return true;
 388          }
 389  
 390          // ative*:   delete if in R2
 391          if ( (($position = $this->searchIfInR1(array('ative'))) !== false) && ($this->inR2($position)) )  {
 392              $this->word = UTF8::substr($this->word, 0, $position);
 393              return true;
 394          }
 395  
 396          return false;
 397      }
 398  
 399      /**
 400       * Step 4
 401       * Search for the longest among the following suffixes, and, if found and in R2, perform the action indicated.
 402       */
 403      private function step4()
 404      {
 405          //    ement  ance   ence  able ible   ant  ment   ent   ism   ate   iti   ous   ive   ize al  er   ic
 406          //      delete
 407          if ( ($position = $this->search(array(
 408              'ance', 'ence', 'ement', 'able', 'ible', 'ant', 'ment', 'ent', 'ism',
 409              'ate', 'iti', 'ous', 'ive', 'ize', 'al', 'er', 'ic'))) !== false) {
 410  
 411              if ($this->inR2($position)) {
 412                  $this->word = UTF8::substr($this->word, 0, $position);
 413              }
 414              return true;
 415          }
 416  
 417          // ion
 418          //      delete if preceded by s or t
 419          if ( ($position = $this->searchIfInR2(array('ion'))) !== false) {
 420              $before = $position - 1;
 421              $letter = UTF8::substr($this->word, $before, 1);
 422  
 423              if ($letter == 's' || $letter == 't') {
 424                  $this->word = UTF8::substr($this->word, 0, $position);
 425              }
 426  
 427              return true;
 428          }
 429  
 430          return false;
 431      }
 432  
 433      /**
 434       * Step 5: *
 435       * Search for the the following suffixes, and, if found, perform the action indicated.
 436       */
 437      private function step5()
 438      {
 439          // e
 440          //      delete if in R2, or in R1 and not preceded by a short syllable
 441          if ( ($position = $this->search(array('e'))) !== false) {
 442              if ($this->inR2($position)) {
 443                  $this->word = UTF8::substr($this->word, 0, $position);
 444  
 445              } elseif ($this->inR1($position)) {
 446                  if ( (! $this->searchShortSyllabe(-4, 3)) && (! $this->searchShortSyllabe(-3, 2)) ) {
 447                      $this->word = UTF8::substr($this->word, 0, $position);
 448                  }
 449              }
 450  
 451              return true;
 452          }
 453  
 454          // l
 455          //      delete if in R2 and preceded by l
 456          if ( ($position = $this->searchIfInR2(array('l'))) !== false) {
 457              $before = $position - 1;
 458              $letter = UTF8::substr($this->word, $before, 1);
 459  
 460              if ($letter == 'l') {
 461                  $this->word = UTF8::substr($this->word, 0, $position);
 462              }
 463  
 464              return true;
 465          }
 466  
 467          return false;
 468      }
 469  
 470      private function finish()
 471      {
 472          $this->word = UTF8::str_replace('Y', 'y', $this->word);
 473      }
 474  
 475      private function exceptionR1()
 476      {
 477          if (Utf8::strpos($this->word, 'gener') === 0) {
 478              $this->r1 = UTF8::substr($this->word, 5);
 479              $this->r1Index = 5;
 480  
 481          } elseif (Utf8::strpos($this->word, 'commun') === 0) {
 482              $this->r1 = UTF8::substr($this->word, 6);
 483              $this->r1Index = 6;
 484  
 485          } elseif (Utf8::strpos($this->word, 'arsen') === 0) {
 486              $this->r1 = UTF8::substr($this->word, 5);
 487              $this->r1Index = 5;
 488          }
 489      }
 490  
 491      /**
 492       *  1/ Stem certain special words as follows,
 493       *  2/ If one of the following is found, leave it invariant,
 494       */
 495      private function exception1()
 496      {
 497          $exceptions = array(
 498              'skis'   => 'ski',
 499              'skies'  => 'sky',
 500              'dying'  => 'die',
 501              'lying'  => 'lie',
 502              'tying'  => 'tie',
 503              'idly'   => 'idl',
 504              'gently' => 'gentl',
 505              'ugly'   => 'ugli',
 506              'early'  => 'earli',
 507              'only'   => 'onli',
 508              'singly' => 'singl',
 509              // invariants
 510              'sky'    => 'sky',
 511              'news'   => 'news',
 512              'howe'   => 'howe',
 513              'atlas'  => 'atlas',
 514              'cosmos' => 'cosmos',
 515              'bias'   => 'bias',
 516              'andes'  => 'andes'
 517          );
 518  
 519          if (isset($exceptions[$this->word])) {
 520              return $exceptions[$this->word];
 521          }
 522  
 523          return null;
 524      }
 525  
 526      /**
 527       * Following step 1a, leave the following invariant,
 528       */
 529      private function exception2()
 530      {
 531          $exceptions = array(
 532              'inning' => 'inning',
 533              'outing' => 'outing',
 534              'canning' => 'canning',
 535              'herring' => 'herring',
 536              'earring' => 'earring',
 537              'proceed' => 'proceed',
 538              'exceed'  => 'exceed',
 539              'succeed' => 'succeed'
 540          );
 541  
 542          if (isset($exceptions[$this->word])) {
 543              return $exceptions[$this->word];
 544          }
 545  
 546          return null;
 547      }
 548  
 549      /**
 550       *  A word is called short if it ends in a short syllable, and if R1 is null.
 551       *  Note : R1 not really null, but the word at this state must be smaller than r1 index
 552       *
 553       *  @return boolean
 554       */
 555      private function isShort()
 556      {
 557          $length = UTF8::strlen($this->word);
 558          return ( ($this->searchShortSyllabe(-3, 3) || $this->searchShortSyllabe(-2, 2)) && ($length == $this->r1Index) );
 559      }
 560  
 561      /**
 562       * Define a short syllable in a word as either (a) a vowel followed by a non-vowel other than w, x or Y and preceded by a non-vowel,
 563       *  or * (b) a vowel at the beginning of the word followed by a non-vowel.
 564       *
 565       *  So rap, trap, entrap end with a short syllable, and ow, on, at are classed as short syllables.
 566       *  But uproot, bestow, disturb do not end with a short syllable.
 567       */
 568      private function searchShortSyllabe($from, $nbLetters)
 569      {
 570          $length = UTF8::strlen($this->word);
 571  
 572          if ($from < 0) {
 573              $from = $length + $from;
 574          }
 575          if ($from < 0) {
 576              $from = 0;
 577          }
 578  
 579          // (a) is just for beginning of the word
 580          if ( ($nbLetters == 2) && ($from != 0) ) {
 581              return false;
 582          }
 583  
 584          $first = UTF8::substr($this->word, $from, 1);
 585          $second = UTF8::substr($this->word, ($from+1), 1);
 586  
 587          if ($nbLetters == 2) {
 588              if ( (in_array($first, self::$vowels)) && (!in_array($second, self::$vowels)) ) {
 589                  return true;
 590              }
 591          }
 592  
 593          $third = UTF8::substr($this->word, ($from+2), 1);
 594  
 595          if ( (!in_array($first, self::$vowels)) && (in_array($second, self::$vowels))
 596              && (!in_array($third, array_merge(self::$vowels, array('x', 'Y', 'w'))))) {
 597                  return true;
 598              }
 599  
 600          return false;
 601      }
 602  }
PHP Cross Reference of Joomla 4.2.2 documentation

/libraries/vendor/wamania/php-stemmer/src/Stemmer/ -> English.php (source)