[ Index ] |
PHP Cross Reference of Joomla 4.2.2 documentation |
[Summary view] [Print] [Text view]
<?php

namespace Wamania\Snowball\Stemmer;

use voku\helper\UTF8;

/**
 * Snowball stemmer for the Dutch language.
 *
 * The helpers search(), searchIfInR2(), inR1(), inR2(), r1() and r2(), as well
 * as the properties word, plainVowels, r1, r1Index, are not defined in this
 * class; they are presumably provided by the parent Stem class.
 *
 * @link http://snowball.tartarus.org/algorithms/dutch/stemmer.html
 * @author wamania
 */
class Dutch extends Stem
{
    /**
     * All dutch vowels
     */
    protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'è');

    /**
     * {@inheritdoc}
     *
     * Lower-cases the word, strips umlauts and acute accents, marks the
     * consonant-like "y" and "i" occurrences as upper case, computes the
     * R1/R2 regions and then applies steps 1 to 4 of the algorithm.
     *
     * @param string $word the word to stem, encoded in UTF-8
     * @return string the stemmed word
     * @throws \Exception if the word is not valid UTF-8
     */
    public function stem($word)
    {
        // we do ALL in UTF-8
        if (!UTF8::is_utf8($word)) {
            throw new \Exception('Word must be in UTF-8');
        }

        $this->word = UTF8::strtolower($word);

        // First, remove all umlaut and acute accents.
        $this->word = UTF8::str_replace(
            array('ä', 'ë', 'ï', 'ö', 'ü', 'á', 'é', 'í', 'ó', 'ú'),
            array('a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u'),
            $this->word
        );

        // The vowels are concatenated so they can be used as a regex character class.
        $this->plainVowels = implode('', self::$vowels);
        $vowelClass = '[' . $this->plainVowels . ']';

        // Put initial y, y after a vowel, and i between vowels into upper case.
        $this->word = preg_replace('#^y#u', 'Y', $this->word);
        $this->word = preg_replace('#(' . $vowelClass . ')y#u', '$1Y', $this->word);
        $this->word = preg_replace('#(' . $vowelClass . ')i(' . $vowelClass . ')#u', '$1I$2', $this->word);

        // R1 and R2 (see the note on R1 and R2) are then defined as in German.
        // R1 and R2 are first set up in the standard way
        $this->r1();
        $this->r2();

        // but then R1 is adjusted so that the region before it contains at least 3 letters.
        if ($this->r1Index < 3) {
            $this->r1Index = 3;
            $this->r1 = UTF8::substr($this->word, 3);
        }

        // Do each of steps 1, 2, 3 and 4.
        $this->step1();
        $removedE = $this->step2();
        $this->step3a();
        $this->step3b($removedE);
        $this->step4();
        $this->finish();

        return $this->word;
    }

    /**
     * Define a valid s-ending as a non-vowel other than j.
     *
     * @param string $word the part of the word that precedes the candidate suffix
     * @return boolean true when the last letter of $word is neither a vowel nor "j"
     */
    private function hasValidSEnding($word)
    {
        $lastLetter = UTF8::substr($word, -1, 1);

        // Strict comparison: only exact string matches count as a vowel or "j".
        return $lastLetter !== 'j' && !in_array($lastLetter, self::$vowels, true);
    }

    /**
     * Define a valid en-ending as a non-vowel, and not gem.
     *
     * @param string $word the part of the word that precedes the candidate suffix
     * @return boolean true when $word ends in a non-vowel and does not end in "gem"
     */
    private function hasValidEnEnding($word)
    {
        $lastLetter = UTF8::substr($word, -1, 1);
        if (in_array($lastLetter, self::$vowels, true)) {
            return false;
        }

        return UTF8::substr($word, -3, 3) !== 'gem';
    }

    /**
     * Define undoubling the ending as removing the last letter if the word ends kk, dd or tt.
     *
     * @return void
     */
    private function unDoubling()
    {
        if ($this->search(array('kk', 'dd', 'tt')) !== false) {
            $this->word = UTF8::substr($this->word, 0, -1);
        }
    }

    /**
     * Delete an "en"-type suffix starting at $position if it is in R1 and
     * preceded by a valid en-ending, and then undouble the ending.
     *
     * Shared by step 1 (suffixes "en" / "ene") and step 3a ("en" left over
     * after removing "heid").
     *
     * @param int $position offset of the suffix in the current word, as returned by search()
     * @return void
     */
    private function removeEnEnding($position)
    {
        if (!$this->inR1($position)) {
            return;
        }

        $stem = UTF8::substr($this->word, 0, $position);
        if ($this->hasValidEnEnding($stem)) {
            $this->word = $stem;
            $this->unDoubling();
        }
    }

    /**
     * Step 1
     * Search for the longest among the following suffixes, and perform the action indicated
     *
     * @return boolean true when one of the step 1 suffixes was found (even if nothing was removed)
     */
    private function step1()
    {
        // heden
        //      replace with heid if in R1
        if (($position = $this->search(array('heden'))) !== false) {
            if ($this->inR1($position)) {
                $this->word = preg_replace('#(heden)$#u', 'heid', $this->word);
            }

            return true;
        }

        // en   ene
        //      delete if in R1 and preceded by a valid en-ending, and then undouble the ending
        // ("ene" is listed first so the longer suffix is tried before "en")
        if (($position = $this->search(array('ene', 'en'))) !== false) {
            $this->removeEnEnding($position);

            return true;
        }

        // s   se
        //      delete if in R1 and preceded by a valid s-ending
        if (($position = $this->search(array('se', 's'))) !== false) {
            if ($this->inR1($position)) {
                $stem = UTF8::substr($this->word, 0, $position);
                if ($this->hasValidSEnding($stem)) {
                    $this->word = $stem;
                }
            }

            return true;
        }

        return false;
    }

    /**
     * Step 2
     * Delete suffix e if in R1 and preceded by a non-vowel, and then undouble the ending
     *
     * @return boolean true when an "e" was actually removed
     */
    private function step2()
    {
        $position = $this->search(array('e'));
        if ($position === false || !$this->inR1($position)) {
            return false;
        }

        // Letter immediately before the final "e".
        $letter = UTF8::substr($this->word, -2, 1);
        if (in_array($letter, self::$vowels, true)) {
            return false;
        }

        $this->word = UTF8::substr($this->word, 0, $position);
        $this->unDoubling();

        return true;
    }

    /**
     * Step 3a: heid
     * delete heid if in R2 and not preceded by c, and treat a preceding en as in step 1(b)
     *
     * @return void
     */
    private function step3a()
    {
        $position = $this->search(array('heid'));
        if ($position === false || !$this->inR2($position)) {
            return;
        }

        // Letter immediately before "heid" (4 letters, hence offset -5).
        if (UTF8::substr($this->word, -5, 1) === 'c') {
            return;
        }

        $this->word = UTF8::substr($this->word, 0, $position);

        if (($position = $this->search(array('en'))) !== false) {
            $this->removeEnEnding($position);
        }
    }

    /**
     * Step 3b: d-suffixes
     * Search for the longest among the following suffixes, and perform the action indicated.
     *
     * @param boolean $removedE whether step 2 actually removed an "e"
     * @return boolean true when one of the step 3b suffixes was found (even if nothing was removed)
     */
    private function step3b($removedE)
    {
        // end   ing
        //      delete if in R2
        //      if preceded by ig, delete if in R2 and not preceded by e, otherwise undouble the ending
        if (($position = $this->search(array('end', 'ing'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);

                if (($position2 = $this->searchIfInR2(array('ig'))) !== false) {
                    // Letter immediately before "ig".
                    if (UTF8::substr($this->word, -3, 1) !== 'e') {
                        $this->word = UTF8::substr($this->word, 0, $position2);
                    }
                } else {
                    $this->unDoubling();
                }
            }

            return true;
        }

        // ig
        //      delete if in R2 and not preceded by e
        if (($position = $this->search(array('ig'))) !== false) {
            if ($this->inR2($position) && UTF8::substr($this->word, -3, 1) !== 'e') {
                $this->word = UTF8::substr($this->word, 0, $position);
            }

            return true;
        }

        // lijk
        //      delete if in R2, and then repeat step 2
        if (($position = $this->search(array('lijk'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);
                // The result of this second pass is not needed: we return right after.
                $this->step2();
            }

            return true;
        }

        // baar
        //      delete if in R2
        if (($position = $this->search(array('baar'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);
            }

            return true;
        }

        // bar
        //      delete if in R2 and if step 2 actually removed an e
        if (($position = $this->search(array('bar'))) !== false) {
            if ($this->inR2($position) && $removedE) {
                $this->word = UTF8::substr($this->word, 0, $position);
            }

            return true;
        }

        return false;
    }

    /**
     * Step 4: undouble vowel
     * If the word ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u,
     * remove one of the vowels from V (for example, maan -> man, brood -> brod).
     *
     * @return boolean true when a double vowel was reduced, false otherwise
     */
    private function step4()
    {
        // D is a non-vowel other than I
        $d = UTF8::substr($this->word, -1, 1);
        if ($d === 'I' || in_array($d, self::$vowels, true)) {
            return false;
        }

        // V is double a, e, o or u
        $v = UTF8::substr($this->word, -3, 2);
        if (!in_array($v, array('aa', 'ee', 'oo', 'uu'), true)) {
            return false;
        }

        // C is a non-vowel
        $c = UTF8::substr($this->word, -4, 1);
        if (in_array($c, self::$vowels, true)) {
            return false;
        }

        // Rebuild the ending with a single vowel: C + V[0] + D.
        $this->word = UTF8::substr($this->word, 0, -4) . $c . UTF8::substr($v, 0, 1) . $d;

        return true;
    }

    /**
     * Finally
     * Turn I and Y back into lower case.
     *
     * @return void
     */
    private function finish()
    {
        $this->word = UTF8::str_replace(array('I', 'Y'), array('i', 'y'), $this->word);
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Wed Sep 7 05:41:13 2022 | Chilli.vc Blog - For Webmaster,Blog-Writer,System Admin and Domainer |