[ Index ] |
PHP Cross Reference of Joomla 4.2.2 documentation |
[Summary view] [Print] [Text view]
<?php

namespace Wamania\Snowball\Stemmer;

use voku\helper\UTF8;

/**
 * English Porter 2
 *
 * Stems one word at a time. The working copy of the word is kept in
 * $this->word and is rewritten in place by the successive steps.
 *
 * The helpers search(), searchIfInR1(), searchIfInR2(), inR1(), inR2(), r1()
 * and r2() are not defined in this class; they are presumably inherited from
 * Stem. From the way they are used here, search() returns the start position
 * of the first listed suffix that ends the word, or false.
 *
 * @link http://snowball.tartarus.org/algorithms/english/stemmer.html
 * @author wamania
 *
 */
class English extends Stem
{
    /**
     * All english vowels
     */
    protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y');

    /**
     * Double consonants that step 1b reduces to a single letter
     */
    protected static $doubles = array('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt');

    /**
     * Letters after which a trailing "li" may be deleted in step 2
     */
    protected static $liEnding = array('c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't');

    /**
     * {@inheritdoc}
     *
     * Words shorter than three letters are returned untouched (not even
     * lower-cased). Every other word is lower-cased before stemming.
     *
     * @param string $word the word to stem, UTF-8 encoded
     *
     * @return string the stem
     *
     * @throws \Exception if $word is not valid UTF-8
     */
    public function stem($word)
    {
        // we do ALL in UTF-8
        if (!UTF8::is_utf8($word)) {
            throw new \Exception('Word must be in UTF-8');
        }

        if (UTF8::strlen($word) < 3) {
            return $word;
        }

        $this->word = UTF8::strtolower($word);

        // exceptions
        if (null !== ($word = $this->exception1())) {
            return $word;
        }

        $this->plainVowels = implode('', self::$vowels);

        // Remove initial ', if present.
        if (UTF8::substr($this->word, 0, 1) === "'") {
            $this->word = UTF8::substr($this->word, 1);
        }

        // Set initial y, or y after a vowel, to Y.
        // The initial y is tested on the word as it is NOW (after the apostrophe
        // has been stripped), so that "'yes" is treated exactly like "yes".
        $this->word = preg_replace('#^y#u', 'Y', $this->word);
        $this->word = preg_replace('#([' . $this->plainVowels . '])y#u', '$1Y', $this->word);

        $this->r1();
        $this->exceptionR1();
        $this->r2();

        $this->step0();
        $this->step1a();

        // exceptions 2
        if (null !== ($word = $this->exception2())) {
            return $word;
        }

        $this->step1b();
        $this->step1c();
        $this->step2();
        $this->step3();
        $this->step4();
        $this->step5();
        $this->finish();

        return $this->word;
    }

    /**
     * Cuts the current word at $position and appends $replacement.
     *
     * @param int    $position    start of the suffix being replaced
     * @param string $replacement text that takes the place of the suffix ('' to delete it)
     */
    private function replaceFrom($position, $replacement)
    {
        $this->word = UTF8::substr($this->word, 0, $position) . $replacement;
    }

    /**
     * Tells whether one of the letters at positions 0 .. $end-1 of the current word is a vowel.
     *
     * @param int $end number of leading letters to inspect
     *
     * @return boolean
     */
    private function hasVowelBefore($end)
    {
        for ($i = 0; $i < $end; $i++) {
            if (in_array(UTF8::substr($this->word, $i, 1), self::$vowels, true)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Step 0
     * Remove ', 's, 's'
     */
    private function step0()
    {
        // longest suffix first, so that "'s'" wins over "'"
        if (($position = $this->search(array("'s'", "'s", "'"))) !== false) {
            $this->replaceFrom($position, '');
        }
    }

    /**
     * Step 1a
     *
     * @return boolean true if one of the suffixes of this step was found
     */
    private function step1a()
    {
        // sses
        //      replace by ss
        if (($position = $this->search(array('sses'))) !== false) {
            $this->replaceFrom($position, 'ss');
            return true;
        }

        // ied+   ies*
        //      replace by i if preceded by more than one letter, otherwise by ie (so ties -> tie, cries -> cri)
        if (($position = $this->search(array('ied', 'ies'))) !== false) {
            $this->replaceFrom($position, ($position > 1) ? 'i' : 'ie');
            return true;
        }

        // us+   ss
        //      do nothing
        if ($this->search(array('us', 'ss')) !== false) {
            return true;
        }

        // s
        //      delete if the preceding word part contains a vowel not immediately before the s
        //      (so gas and this retain the s, gaps and kiwis lose it)
        if (($position = $this->search(array('s'))) !== false) {
            // $position - 1: the letter right before the s does not count
            if ($this->hasVowelBefore($position - 1)) {
                $this->replaceFrom($position, '');
            }
            return true;
        }

        return false;
    }

    /**
     * Step 1b
     *
     * @return boolean true if one of the suffixes of this step was found
     */
    private function step1b()
    {
        // eed   eedly+
        //      replace by ee if in R1
        if (($position = $this->search(array('eedly', 'eed'))) !== false) {
            if ($this->inR1($position)) {
                $this->replaceFrom($position, 'ee');
            }
            return true;
        }

        // ed   edly+   ing   ingly+
        //      delete if the preceding word part contains a vowel, and after the deletion:
        //      if the word ends at, bl or iz add e (so luxuriat -> luxuriate), or
        //      if the word ends with a double remove the last letter (so hopp -> hop), or
        //      if the word is short, add e (so hop -> hope)
        if (($position = $this->search(array('edly', 'ingly', 'ed', 'ing'))) !== false) {
            if ($this->hasVowelBefore($position)) {
                $this->replaceFrom($position, '');

                if ($this->search(array('at', 'bl', 'iz')) !== false) {
                    $this->word .= 'e';
                } elseif (($doublePosition = $this->search(self::$doubles)) !== false) {
                    // keep only the first letter of the double
                    $this->word = UTF8::substr($this->word, 0, $doublePosition + 1);
                } elseif ($this->isShort()) {
                    $this->word .= 'e';
                }
            }
            return true;
        }

        return false;
    }

    /**
     * Step 1c
     *
     * @return boolean true if the word is too short or ends with y / Y
     */
    private function step1c()
    {
        // replace suffix y or Y by i if preceded by a non-vowel
        // which is not the first letter of the word (so cry -> cri, by -> by, say -> say)
        if (UTF8::strlen($this->word) < 3) {
            return true;
        }

        if (($position = $this->search(array('y', 'Y'))) !== false) {
            $letter = UTF8::substr($this->word, $position - 1, 1);

            if (!in_array($letter, self::$vowels, true)) {
                $this->replaceFrom($position, 'i');
            }

            return true;
        }

        return false;
    }

    /**
     * Step 2
     * Search for the longest among the following suffixes, and, if found and in R1, perform the action indicated.
     *
     * @return boolean true if one of the suffixes of this step was found
     */
    private function step2()
    {
        // array(suffixes, replacement), tried in this order. The order matters:
        // a suffix that ends with another listed suffix must be tried first
        // (ization before ation, ational before tional, lessli / fulli / ... before li).
        $rules = array(
            array(array('iveness', 'iviti'), 'ive'),
            array(array('ousli', 'ousness'), 'ous'),
            array(array('izer', 'ization'), 'ize'),
            array(array('ational', 'ation', 'ator'), 'ate'),
            array(array('biliti', 'bli'), 'ble'),
            array(array('lessli'), 'less'),
            array(array('fulness', 'fulli'), 'ful'),
            array(array('tional'), 'tion'),
            array(array('alism', 'aliti', 'alli'), 'al'),
            array(array('enci'), 'ence'),
            array(array('anci'), 'ance'),
            array(array('abli'), 'able'),
            array(array('entli'), 'ent'),
        );

        foreach ($rules as $rule) {
            list($suffixes, $replacement) = $rule;

            if (($position = $this->search($suffixes)) !== false) {
                if ($this->inR1($position)) {
                    $this->replaceFrom($position, $replacement);
                }
                // a found suffix ends the step even when it is not in R1
                return true;
            }
        }

        // ogi+: replace by og if preceded by l
        if (($position = $this->search(array('ogi'))) !== false) {
            if ($this->inR1($position) && UTF8::substr($this->word, $position - 1, 1) == 'l') {
                $this->replaceFrom($position, 'og');
            }

            return true;
        }

        // li+: delete if preceded by a valid li-ending
        if (($position = $this->search(array('li'))) !== false) {
            if ($this->inR1($position)) {
                $letter = UTF8::substr($this->word, $position - 1, 1);

                if (in_array($letter, self::$liEnding, true)) {
                    $this->replaceFrom($position, '');
                }
            }

            return true;
        }

        return false;
    }

    /**
     * Step 3:
     * Search for the longest among the following suffixes, and, if found and in R1, perform the action indicated.
     *
     * @return boolean true if a suffix was replaced or deleted
     */
    private function step3()
    {
        // array(suffixes, replacement), tried in this order (ational before tional).
        // An empty replacement deletes the suffix.
        $rules = array(
            array(array('ational'), 'ate'),
            array(array('tional'), 'tion'),
            array(array('alize'), 'al'),
            array(array('icate', 'iciti', 'ical'), 'ic'),
            array(array('ful', 'ness'), ''),
        );

        foreach ($rules as $rule) {
            list($suffixes, $replacement) = $rule;

            if (($position = $this->searchIfInR1($suffixes)) !== false) {
                $this->replaceFrom($position, $replacement);
                return true;
            }
        }

        // ative*: delete if in R2
        if ((($position = $this->searchIfInR1(array('ative'))) !== false) && $this->inR2($position)) {
            $this->replaceFrom($position, '');
            return true;
        }

        return false;
    }

    /**
     * Step 4
     * Search for the longest among the following suffixes, and, if found and in R2, perform the action indicated.
     *
     * @return boolean true if one of the suffixes of this step was found
     */
    private function step4()
    {
        // ement ance ence able ible ant ment ent ism ate iti ous ive ize al er ic
        //      delete
        // (ement is listed before ment, and ment before ent, so the longest one wins)
        $suffixes = array(
            'ance', 'ence', 'ement', 'able', 'ible', 'ant', 'ment', 'ent', 'ism',
            'ate', 'iti', 'ous', 'ive', 'ize', 'al', 'er', 'ic',
        );

        if (($position = $this->search($suffixes)) !== false) {
            if ($this->inR2($position)) {
                $this->replaceFrom($position, '');
            }
            return true;
        }

        // ion
        //      delete if preceded by s or t
        if (($position = $this->searchIfInR2(array('ion'))) !== false) {
            $letter = UTF8::substr($this->word, $position - 1, 1);

            if ($letter == 's' || $letter == 't') {
                $this->replaceFrom($position, '');
            }

            return true;
        }

        return false;
    }

    /**
     * Step 5:
     * Search for the following suffixes, and, if found, perform the action indicated.
     *
     * @return boolean true if one of the suffixes of this step was found
     */
    private function step5()
    {
        // e
        //      delete if in R2, or in R1 and not preceded by a short syllable
        if (($position = $this->search(array('e'))) !== false) {
            if ($this->inR2($position)) {
                $this->replaceFrom($position, '');
            } elseif ($this->inR1($position)) {
                // the short syllable is looked for right before the final e:
                // three letters anywhere, or two letters starting the word
                if ((!$this->searchShortSyllabe(-4, 3)) && (!$this->searchShortSyllabe(-3, 2))) {
                    $this->replaceFrom($position, '');
                }
            }

            return true;
        }

        // l
        //      delete if in R2 and preceded by l
        if (($position = $this->searchIfInR2(array('l'))) !== false) {
            if (UTF8::substr($this->word, $position - 1, 1) == 'l') {
                $this->replaceFrom($position, '');
            }

            return true;
        }

        return false;
    }

    /**
     * Turn the Y markers set at the beginning of stem() back into y.
     */
    private function finish()
    {
        $this->word = UTF8::str_replace('Y', 'y', $this->word);
    }

    /**
     * Overrides R1 for words beginning with gener, commun or arsen:
     * R1 is then the part of the word that follows this prefix.
     */
    private function exceptionR1()
    {
        // prefix => index at which R1 starts (the length of the prefix)
        $prefixes = array('gener' => 5, 'commun' => 6, 'arsen' => 5);

        foreach ($prefixes as $prefix => $index) {
            if (UTF8::strpos($this->word, $prefix) === 0) {
                $this->r1 = UTF8::substr($this->word, $index);
                $this->r1Index = $index;
                return;
            }
        }
    }

    /**
     * 1/ Stem certain special words as follows,
     * 2/ If one of the following is found, leave it invariant,
     *
     * @return string|null the stem if the current word is an exception, null otherwise
     */
    private function exception1()
    {
        $exceptions = array(
            'skis'   => 'ski',
            'skies'  => 'sky',
            'dying'  => 'die',
            'lying'  => 'lie',
            'tying'  => 'tie',
            'idly'   => 'idl',
            'gently' => 'gentl',
            'ugly'   => 'ugli',
            'early'  => 'earli',
            'only'   => 'onli',
            'singly' => 'singl',
            // invariants
            'sky'    => 'sky',
            'news'   => 'news',
            'howe'   => 'howe',
            'atlas'  => 'atlas',
            'cosmos' => 'cosmos',
            'bias'   => 'bias',
            'andes'  => 'andes',
        );

        if (isset($exceptions[$this->word])) {
            return $exceptions[$this->word];
        }

        return null;
    }

    /**
     * Following step 1a, leave the following invariant,
     *
     * @return string|null the word itself if it is one of the invariants, null otherwise
     */
    private function exception2()
    {
        $exceptions = array(
            'inning'  => 'inning',
            'outing'  => 'outing',
            'canning' => 'canning',
            'herring' => 'herring',
            'earring' => 'earring',
            'proceed' => 'proceed',
            'exceed'  => 'exceed',
            'succeed' => 'succeed',
        );

        if (isset($exceptions[$this->word])) {
            return $exceptions[$this->word];
        }

        return null;
    }

    /**
     * A word is called short if it ends in a short syllable, and if R1 is null.
     * Note : R1 not really null, but the word at this state must be smaller than r1 index
     *
     * @return boolean
     */
    private function isShort()
    {
        $length = UTF8::strlen($this->word);

        return (($this->searchShortSyllabe(-3, 3) || $this->searchShortSyllabe(-2, 2)) && ($length == $this->r1Index));
    }

    /**
     * Define a short syllable in a word as either
     * (a) a vowel followed by a non-vowel other than w, x or Y and preceded by a non-vowel, or
     * (b) a vowel at the beginning of the word followed by a non-vowel.
     *
     * So rap, trap, entrap end with a short syllable, and ow, on, at are classed as short syllables.
     * But uproot, bestow, disturb do not end with a short syllable.
     *
     * @param int $from      index of the first letter to test; a negative value counts from the end
     *                       of the word and is clamped to 0
     * @param int $nbLetters 2 to test form (b) first, any other value to test form (a) only
     *
     * @return boolean
     */
    private function searchShortSyllabe($from, $nbLetters)
    {
        $length = UTF8::strlen($this->word);

        if ($from < 0) {
            $from = $length + $from;
        }
        if ($from < 0) {
            $from = 0;
        }

        // (b) is just for beginning of the word
        if (($nbLetters == 2) && ($from != 0)) {
            return false;
        }

        $first = UTF8::substr($this->word, $from, 1);
        $second = UTF8::substr($this->word, ($from + 1), 1);

        // (b) vowel + non-vowel; both letters have to exist
        if (($nbLetters == 2) && ($from + 1 < $length)) {
            if (in_array($first, self::$vowels, true) && !in_array($second, self::$vowels, true)) {
                return true;
            }
        }

        // (a) needs three real letters: without this guard a missing third letter
        // would be taken for a valid closing consonant (e.g. on the two-letter word "be")
        if ($from + 2 >= $length) {
            return false;
        }

        $third = UTF8::substr($this->word, ($from + 2), 1);

        if ((!in_array($first, self::$vowels, true)) && in_array($second, self::$vowels, true)
            && (!in_array($third, array_merge(self::$vowels, array('x', 'Y', 'w')), true))) {
            return true;
        }

        return false;
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Wed Sep 7 05:41:13 2022 | Chilli.vc Blog - For Webmaster,Blog-Writer,System Admin and Domainer |