PHPXRef 0.7.1 : Joomla 4.2.2 documentation : /libraries/vendor/wamania/php-stemmer/src/Stemmer/Dutch.php source

[Summary view] [Print] [Text view]
   1  <?php
   2  
   3  namespace Wamania\Snowball\Stemmer;
   4  
   5  use voku\helper\UTF8;
   6  
   7  /**
   8   *
   9   * @link http://snowball.tartarus.org/algorithms/dutch/stemmer.html
  10   * @author wamania
  11   *
  12   */
  13  class Dutch extends Stem
  14  {
  15      /**
  16       * All dutch vowels
  17       */
  18      protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'è');
  19  
  20      /**
  21       * {@inheritdoc}
  22       */
  23      public function stem($word)
  24      {
  25          // we do ALL in UTF-8
  26          if (!UTF8::is_utf8($word)) {
  27              throw new \Exception('Word must be in UTF-8');
  28          }
  29  
  30          $this->word = UTF8::strtolower($word);
  31  
  32          // First, remove all umlaut and acute accents.
  33          $this->word = UTF8::str_replace(
  34              array('ä', 'ë', 'ï', 'ö', 'ü', 'á', 'é', 'í', 'ó', 'ú'),
  35              array('a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u'),
  36              $this->word);
  37  
  38          $this->plainVowels = implode('', self::$vowels);
  39  
  40          // Put initial y, y after a vowel, and i between vowels into upper case.
  41          $this->word = preg_replace('#^y#u', 'Y', $this->word);
  42          $this->word = preg_replace('#(['.$this->plainVowels.'])y#u', '$1Y', $this->word);
  43          $this->word = preg_replace('#(['.$this->plainVowels.'])i(['.$this->plainVowels.'])#u', '$1I$2', $this->word);
  44  
  45          // R1 and R2 (see the note on R1 and R2) are then defined as in German.
  46          // R1 and R2 are first set up in the standard way
  47          $this->r1();
  48          $this->r2();
  49  
  50          // but then R1 is adjusted so that the region before it contains at least 3 letters.
  51          if ($this->r1Index < 3) {
  52              $this->r1Index = 3;
  53              $this->r1 = UTF8::substr($this->word, 3);
  54          }
  55  
  56          // Do each of steps 1, 2 3 and 4.
  57          $this->step1();
  58          $removedE = $this->step2();
  59          $this->step3a();
  60          $this->step3b($removedE);
  61          $this->step4();
  62          $this->finish();
  63  
  64          return $this->word;
  65      }
  66  
  67      /**
  68       * Define a valid s-ending as a non-vowel other than j.
  69       * @param string $ending
  70       * @return boolean
  71       */
  72      private function hasValidSEnding($word)
  73      {
  74          $lastLetter = UTF8::substr($word, -1, 1);
  75          return !in_array($lastLetter, array_merge(self::$vowels, array('j')));
  76      }
  77  
  78      /**
  79       * Define a valid en-ending as a non-vowel, and not gem.
  80       * @param string $ending
  81       * @return boolean
  82       */
  83      private function hasValidEnEnding($word)
  84      {
  85          $lastLetter = UTF8::substr($word, -1, 1);
  86          if (in_array($lastLetter, self::$vowels)) {
  87              return false;
  88          }
  89  
  90          $threeLastLetters = UTF8::substr($word, -3, 3);
  91          if ($threeLastLetters == 'gem') {
  92              return false;
  93          }
  94          return true;
  95      }
  96  
  97      /**
  98       *  Define undoubling the ending as removing the last letter if the word ends kk, dd or tt.
  99       */
 100      private function unDoubling()
 101      {
 102          if ($this->search(array('kk', 'dd', 'tt')) !== false) {
 103              $this->word = UTF8::substr($this->word, 0, -1);
 104          }
 105      }
 106  
 107      /**
 108       * Step 1
 109       * Search for the longest among the following suffixes, and perform the action indicated
 110       */
 111      private function step1()
 112      {
 113          // heden
 114          //      replace with heid if in R1
 115          if ( ($position = $this->search(array('heden'))) !== false) {
 116              if ($this->inR1($position)) {
 117                  $this->word = preg_replace('#(heden)$#u', 'heid', $this->word);
 118              }
 119              return true;
 120          }
 121  
 122          // en   ene
 123          //      delete if in R1 and preceded by a valid en-ending, and then undouble the ending
 124          if ( ($position = $this->search(array('ene', 'en'))) !== false) {
 125              if ($this->inR1($position)) {
 126                  $word = UTF8::substr($this->word, 0, $position);
 127                  if ($this->hasValidEnEnding($word)) {
 128                      $this->word = $word;
 129                      $this->unDoubling();
 130                  }
 131              }
 132              return true;
 133          }
 134  
 135          // s   se
 136          //      delete if in R1 and preceded by a valid s-ending
 137          if ( ($position = $this->search(array('se', 's'))) !== false) {
 138              if ($this->inR1($position)) {
 139                  $word = UTF8::substr($this->word, 0, $position);
 140                  if ($this->hasValidSEnding($word)) {
 141                      $this->word = $word;
 142                  }
 143              }
 144              return true;
 145          }
 146  
 147          return false;
 148      }
 149  
 150      /**
 151       * Step 2
 152       * Delete suffix e if in R1 and preceded by a non-vowel, and then undouble the ending
 153       */
 154      private function step2()
 155      {
 156          if ( ($position = $this->search(array('e'))) !== false) {
 157              if ($this->inR1($position)) {
 158                  $letter = UTF8::substr($this->word, -2, 1);
 159                  if (!in_array($letter, self::$vowels)) {
 160                      $this->word = UTF8::substr($this->word, 0, $position);
 161                      $this->unDoubling();
 162  
 163                      return true;
 164                  }
 165              }
 166          }
 167  
 168          return false;
 169      }
 170  
 171      /**
 172       * Step 3a: heid
 173       * delete heid if in R2 and not preceded by c, and treat a preceding en as in step 1(b)
 174       */
 175      private function step3a()
 176      {
 177          if ( ($position = $this->search(array('heid'))) !== false) {
 178              if ($this->inR2($position)) {
 179                  $letter = UTF8::substr($this->word, -5, 1);
 180                  if ($letter !== 'c') {
 181                      $this->word = UTF8::substr($this->word, 0, $position);
 182  
 183                      if ( ($position = $this->search(array('en'))) !== false) {
 184                          if ($this->inR1($position)) {
 185                              $word = UTF8::substr($this->word, 0, $position);
 186                              if ($this->hasValidEnEnding($word)) {
 187                                  $this->word = $word;
 188                                  $this->unDoubling();
 189                              }
 190                          }
 191                      }
 192                  }
 193              }
 194          }
 195  
 196      }
 197  
 198      /**
 199       * Step 3b: d-suffixe
 200       * Search for the longest among the following suffixes, and perform the action indicated.
 201       */
 202      private function step3b($removedE)
 203      {
 204          // end   ing
 205          //      delete if in R2
 206          //      if preceded by ig, delete if in R2 and not preceded by e, otherwise undouble the ending
 207          if ( ($position = $this->search(array('end', 'ing'))) !== false) {
 208              if ($this->inR2($position)) {
 209                  $this->word = UTF8::substr($this->word, 0, $position);
 210  
 211                  if ( ($position2 = $this->searchIfInR2(array('ig'))) !== false) {
 212                      $letter = UTF8::substr($this->word, -3, 1);
 213                      if ($letter !== 'e') {
 214                          $this->word = UTF8::substr($this->word, 0, $position2);
 215                      }
 216                  } else {
 217                      $this->unDoubling();
 218                  }
 219              }
 220  
 221  
 222              return true;
 223          }
 224  
 225          // ig
 226          //      delete if in R2 and not preceded by e
 227          if ( ($position = $this->search(array('ig'))) !== false) {
 228              if ($this->inR2($position)) {
 229                  $letter = UTF8::substr($this->word, -3, 1);
 230                  if ($letter !== 'e') {
 231                      $this->word = UTF8::substr($this->word, 0, $position);
 232                  }
 233              }
 234              return true;
 235          }
 236  
 237          // lijk
 238          //      delete if in R2, and then repeat step 2
 239          if ( ($position = $this->search(array('lijk'))) !== false) {
 240              if ($this->inR2($position)) {
 241                  $this->word = UTF8::substr($this->word, 0, $position);
 242                  $this->step2();
 243              }
 244              return true;
 245          }
 246  
 247          // baar
 248          //      delete if in R2
 249          if ( ($position = $this->search(array('baar'))) !== false) {
 250              if ($this->inR2($position)) {
 251                  $this->word = UTF8::substr($this->word, 0, $position);
 252              }
 253              return true;
 254          }
 255  
 256          // bar
 257          //      delete if in R2 and if step 2 actually removed an e
 258          if ( ($position = $this->search(array('bar'))) !== false) {
 259              if ($this->inR2($position) && $removedE) {
 260                  $this->word = UTF8::substr($this->word, 0, $position);
 261              }
 262              return true;
 263          }
 264  
 265          return false;
 266      }
 267  
 268      /**
 269       * Step 4: undouble vowel
 270       * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u,
 271       * remove one of the vowels from V (for example, maan -> man, brood -> brod).
 272       */
 273      private function step4()
 274      {
 275          // D is a non-vowel other than I
 276          $d = UTF8::substr($this->word, -1, 1);
 277          if (in_array($d, array_merge(self::$vowels, array('I')))) {
 278              return false;
 279          }
 280  
 281          // V is double a, e, o or u
 282          $v = UTF8::substr($this->word, -3, 2);
 283          if (!in_array($v, array('aa', 'ee', 'oo', 'uu'))) {
 284              return false;
 285          }
 286          $singleV = UTF8::substr($v, 0, 1);
 287  
 288          // C is a non-vowel
 289          $c = UTF8::substr($this->word, -4, 1);
 290          if (in_array($c, self::$vowels)) {
 291              return false;
 292          }
 293  
 294          $this->word = UTF8::substr($this->word, 0, -4);
 295          $this->word .= $c . $singleV  .$d;
 296      }
 297  
 298      /**
 299       * Finally
 300       * Turn I and Y back into lower case.
 301       */
 302      private function finish()
 303      {
 304          $this->word = UTF8::str_replace(array('I', 'Y'), array('i', 'y'), $this->word);
 305      }
 306  }
PHP Cross Reference of Joomla 4.2.2 documentation

/libraries/vendor/wamania/php-stemmer/src/Stemmer/ -> Dutch.php (source)