[ Index ] |
PHP Cross Reference of Joomla 4.2.2 documentation |
[Summary view] [Print] [Text view]
<?php

namespace Wamania\Snowball\Stemmer;

use voku\helper\UTF8;

/**
 * French stemmer following the Snowball algorithm.
 *
 * The suffix-search and region helpers used below (search(), searchIfInRv(),
 * searchIfInR2(), inRv(), inR1(), inR2(), r1(), r2()) are not defined in this
 * file; they are presumably inherited from Stem.
 *
 * @link http://snowball.tartarus.org/algorithms/french/stemmer.html
 * @author wamania
 *
 */
class French extends Stem
{
    /**
     * All french vowels
     */
    protected static $vowels = ['a', 'e', 'i', 'o', 'u', 'y', 'â', 'à', 'ë', 'é', 'ê', 'è', 'ï', 'î', 'ô', 'û', 'ù'];

    /**
     * {@inheritdoc}
     *
     * Lower-cases the word, marks semi-vowels (step 0), computes the RV/R1/R2
     * regions, then runs steps 1 to 6 and restores the marked letters.
     *
     * @param string $word Word to stem; must be valid UTF-8
     *
     * @return string The stemmed word
     *
     * @throws \Exception If $word is not valid UTF-8
     */
    public function stem($word)
    {
        // we do ALL in UTF-8
        if (!UTF8::is_utf8($word)) {
            throw new \Exception('Word must be in UTF-8');
        }

        $this->word = UTF8::strtolower($word);

        // Vowel list as a single string, ready to be dropped into a regex character class
        $this->plainVowels = implode('', self::$vowels);

        $this->step0();

        $this->rv();
        $this->r1();
        $this->r2();

        // to know if step1, 2a or 2b have altered the word
        $this->originalWord = $this->word;

        $nextStep = $this->step1();

        // Do step 2a if either no ending was removed by step 1, or if one of endings amment, emment, ment, ments was found.
        // Strict comparison: a loose == would compare numeric-looking strings numerically.
        if (($nextStep === 2) || ($this->originalWord === $this->word)) {
            $modified = $this->step2a();
            if (!$modified) {
                $this->step2b();
            }
        }

        if ($this->word !== $this->originalWord) {
            $this->step3();
        } else {
            $this->step4();
        }

        $this->step5();
        $this->step6();
        $this->finish();

        return $this->word;
    }

    /**
     * Assume the word is in lower case.
     * Then put into upper case u or i preceded and followed by a vowel, and y preceded or followed by a vowel.
     * u after q is also put into upper case. For example,
     *      jouer       ->      joUer
     *      ennuie      ->      ennuIe
     *      yeux        ->      Yeux
     *      quand       ->      qUand
     *
     * The upper-case letters act as markers: they are not part of self::$vowels,
     * so later steps treat them as non-vowels until finish() lowers them again.
     */
    private function step0()
    {
        $this->word = preg_replace('#([q])u#u', '$1U', $this->word);
        $this->word = preg_replace('#(['.$this->plainVowels.'])y#u', '$1Y', $this->word);
        $this->word = preg_replace('#y(['.$this->plainVowels.'])#u', 'Y$1', $this->word);
        $this->word = preg_replace('#(['.$this->plainVowels.'])u(['.$this->plainVowels.'])#u', '$1U$2', $this->word);
        $this->word = preg_replace('#(['.$this->plainVowels.'])i(['.$this->plainVowels.'])#u', '$1I$2', $this->word);
    }

    /**
     * Step 1: Standard suffix removal
     * Search for the longest among the following suffixes, and perform the action indicated.
     *
     * Each suffix family is tried in turn; the first family that matches decides
     * the outcome, whether or not the word was actually modified.
     *
     * @return integer Next step number: 2 when amment, emment, ment or ments was found
     *                 or when nothing matched, 3 otherwise
     */
    private function step1()
    {
        // ance   iqUe   isme   able   iste   eux   ances   iqUes   ismes   ables   istes
        //      delete if in R2
        if (($position = $this->search(['ances', 'iqUes', 'ismes', 'ables', 'istes', 'ance', 'iqUe','isme', 'able', 'iste', 'eux'])) !== false) {
            if ($this->inR2($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);
            }

            return 3;
        }

        // atrice   ateur   ation   atrices   ateurs   ations
        //      delete if in R2
        //      if preceded by ic, delete if in R2, else replace by iqU
        if (($position = $this->search(['atrices', 'ateurs', 'ations', 'atrice', 'ateur', 'ation'])) !== false) {
            if ($this->inR2($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);

                if (($position2 = $this->searchIfInR2(['ic'])) !== false) {
                    $this->word = UTF8::substr($this->word, 0, $position2);
                } else {
                    $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word);
                }
            }

            return 3;
        }

        // logie   logies
        //      replace with log if in R2
        if (($position = $this->search(['logies', 'logie'])) !== false) {
            if ($this->inR2($position)) {
                $this->word = preg_replace('#(logies|logie)$#u', 'log', $this->word);
            }

            return 3;
        }

        // usion   ution   usions   utions
        //      replace with u if in R2
        if (($position = $this->search(['usions', 'utions', 'usion', 'ution'])) !== false) {
            if ($this->inR2($position)) {
                $this->word = preg_replace('#(usion|ution|usions|utions)$#u', 'u', $this->word);
            }

            return 3;
        }

        // ence   ences
        //      replace with ent if in R2
        if (($position = $this->search(['ences', 'ence'])) !== false) {
            if ($this->inR2($position)) {
                $this->word = preg_replace('#(ence|ences)$#u', 'ent', $this->word);
            }

            return 3;
        }

        // issement   issements
        //      delete if in R1 and preceded by a non-vowel
        // Strict !== false so that a match at position 0 is still recognised as a match;
        // $position > 0 ensures there is a preceding letter to inspect.
        if (($position = $this->search(['issements', 'issement'])) !== false) {
            if (($position > 0) && $this->inR1($position)) {
                $before = $position - 1;
                $letter = UTF8::substr($this->word, $before, 1);
                if (!in_array($letter, self::$vowels, true)) {
                    $this->word = UTF8::substr($this->word, 0, $position);
                }
            }

            return 3;
        }

        // ement   ements
        //      delete if in RV
        //      if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
        //      if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise,
        //      if preceded by abl or iqU, delete if in R2, otherwise,
        //      if preceded by ièr or Ièr, replace by i if in RV
        if (($position = $this->search(['ements', 'ement'])) !== false) {
            // delete if in RV
            if ($this->inRv($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);
            }

            // The follow-up searches look at the end of the current word, so they can
            // only match once the ement/ements suffix has actually been removed.

            // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
            if (($position = $this->searchIfInR2(['iv'])) !== false) {
                $this->word = UTF8::substr($this->word, 0, $position);
                if (($position2 = $this->searchIfInR2(['at'])) !== false) {
                    $this->word = UTF8::substr($this->word, 0, $position2);
                }

            // if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise,
            } elseif (($position = $this->search(['eus'])) !== false) {
                if ($this->inR2($position)) {
                    $this->word = UTF8::substr($this->word, 0, $position);
                } elseif ($this->inR1($position)) {
                    $this->word = preg_replace('#(eus)$#u', 'eux', $this->word);
                }

            // if preceded by abl or iqU, delete if in R2, otherwise,
            } elseif (($position = $this->searchIfInR2(['abl', 'iqU'])) !== false) {
                $this->word = UTF8::substr($this->word, 0, $position);

            // if preceded by ièr or Ièr, replace by i if in RV
            } elseif (($position = $this->searchIfInRv(['ièr', 'Ièr'])) !== false) {
                $this->word = preg_replace('#(ièr|Ièr)$#u', 'i', $this->word);
            }

            return 3;
        }

        // ité   ités
        //      delete if in R2
        //      if preceded by abil, delete if in R2, else replace by abl, otherwise,
        //      if preceded by ic, delete if in R2, else replace by iqU, otherwise,
        //      if preceded by iv, delete if in R2
        if (($position = $this->search(['ités', 'ité'])) !== false) {
            // delete if in R2
            if ($this->inR2($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);
            }

            // if preceded by abil, delete if in R2, else replace by abl, otherwise,
            if (($position = $this->search(['abil'])) !== false) {
                if ($this->inR2($position)) {
                    $this->word = UTF8::substr($this->word, 0, $position);
                } else {
                    $this->word = preg_replace('#(abil)$#u', 'abl', $this->word);
                }

            // if preceded by ic, delete if in R2, else replace by iqU, otherwise,
            } elseif (($position = $this->search(['ic'])) !== false) {
                if ($this->inR2($position)) {
                    $this->word = UTF8::substr($this->word, 0, $position);
                } else {
                    $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word);
                }

            // if preceded by iv, delete if in R2
            } elseif (($position = $this->searchIfInR2(['iv'])) !== false) {
                $this->word = UTF8::substr($this->word, 0, $position);
            }

            return 3;
        }

        // if   ive   ifs   ives
        //      delete if in R2
        //      if preceded by at, delete if in R2 (and if further preceded by ic, delete if in R2, else replace by iqU)
        if (($position = $this->search(['ifs', 'ives', 'if', 'ive'])) !== false) {
            if ($this->inR2($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);
            }

            if (($position = $this->searchIfInR2(['at'])) !== false) {
                $this->word = UTF8::substr($this->word, 0, $position);

                if (($position2 = $this->search(['ic'])) !== false) {
                    if ($this->inR2($position2)) {
                        $this->word = UTF8::substr($this->word, 0, $position2);
                    } else {
                        $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word);
                    }
                }
            }

            return 3;
        }

        // eaux
        //      replace with eau
        if (($position = $this->search(['eaux'])) !== false) {
            $this->word = preg_replace('#(eaux)$#u', 'eau', $this->word);

            return 3;
        }

        // aux
        //      replace with al if in R1
        if (($position = $this->search(['aux'])) !== false) {
            if ($this->inR1($position)) {
                $this->word = preg_replace('#(aux)$#u', 'al', $this->word);
            }

            return 3;
        }

        // euse   euses
        //      delete if in R2, else replace by eux if in R1
        if (($position = $this->search(['euses', 'euse'])) !== false) {
            if ($this->inR2($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);
            } elseif ($this->inR1($position)) {
                $this->word = preg_replace('#(euses|euse)$#u', 'eux', $this->word);
            }

            return 3;
        }

        // amment
        //      replace with ant if in RV
        if (($position = $this->search(['amment'])) !== false) {
            if ($this->inRv($position)) {
                $this->word = preg_replace('#(amment)$#u', 'ant', $this->word);
            }

            return 2;
        }

        // emment
        //      replace with ent if in RV
        if (($position = $this->search(['emment'])) !== false) {
            if ($this->inRv($position)) {
                $this->word = preg_replace('#(emment)$#u', 'ent', $this->word);
            }

            return 2;
        }

        // ment   ments
        //      delete if preceded by a vowel in RV
        // Strict !== false so that a match at position 0 is still recognised as a match;
        // $position > 0 ensures there is a preceding letter to inspect.
        if (($position = $this->search(['ments', 'ment'])) !== false) {
            if ($position > 0) {
                $before = $position - 1;
                $letter = UTF8::substr($this->word, $before, 1);
                if ($this->inRv($before) && in_array($letter, self::$vowels, true)) {
                    $this->word = UTF8::substr($this->word, 0, $position);
                }
            }

            return 2;
        }

        return 2;
    }

    /**
     * Step 2a: Verb suffixes beginning i
     * In steps 2a and 2b all tests are confined to the RV region.
     * Search for the longest among the following suffixes and if found, delete if preceded by a non-vowel.
     *      îmes   ît   îtes   i   ie   ies   ir   ira   irai   iraIent   irais   irait   iras   irent   irez   iriez
     *      irions   irons   iront   is   issaIent   issais   issait   issant   issante   issantes   issants   isse
     *      issent   isses   issez   issiez   issions   issons   it
     * (Note that the non-vowel itself must also be in RV.)
     *
     * @return boolean True if a suffix was removed, false otherwise
     */
    private function step2a()
    {
        // Longer suffixes are listed before the shorter ones they end with
        if (($position = $this->searchIfInRv([
            'îmes', 'îtes', 'ît', 'ies', 'ie', 'iraIent', 'irais', 'irait', 'irai', 'iras', 'ira', 'irent', 'irez', 'iriez',
            'irions', 'irons', 'iront', 'ir', 'issaIent', 'issais', 'issait', 'issant', 'issantes', 'issante', 'issants',
            'issent', 'isses', 'issez', 'isse', 'issiez', 'issions', 'issons', 'is', 'it', 'i'])) !== false) {
            $before = $position - 1;
            $letter = UTF8::substr($this->word, $before, 1);
            if ($this->inRv($before) && !in_array($letter, self::$vowels, true)) {
                $this->word = UTF8::substr($this->word, 0, $position);

                return true;
            }
        }

        return false;
    }

    /**
     * Do step 2b if step 2a was done, but failed to remove a suffix.
     * Step 2b: Other verb suffixes
     *
     * @return boolean True if one of the suffix families matched in RV, false otherwise
     */
    private function step2b()
    {
        // é   ée   ées   és   èrent   er   era   erai   eraIent   erais   erait   eras   erez   eriez   erions   erons   eront   ez   iez
        //      delete
        if (($position = $this->searchIfInRv([
            'ées', 'èrent', 'erais', 'erait', 'erai', 'eraIent', 'eras', 'erez', 'eriez',
            'erions', 'erons', 'eront', 'era', 'er', 'iez', 'ez','és', 'ée', 'é'])) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position);

            return true;
        }

        // âmes   ât   âtes   a   ai   aIent   ais   ait   ant   ante   antes   ants   as   asse   assent   asses   assiez   assions
        //      delete
        //      if preceded by e, delete
        if (($position = $this->searchIfInRv([
            'âmes', 'âtes', 'ât', 'aIent', 'ais', 'ait', 'antes', 'ante', 'ants', 'ant',
            'assent', 'asses', 'assiez', 'assions', 'asse', 'as', 'ai', 'a'])) !== false) {
            $before = $position - 1;
            $letter = UTF8::substr($this->word, $before, 1);
            if ($this->inRv($before) && ($letter == 'e')) {
                // Cut one character earlier so the preceding e goes as well
                $this->word = UTF8::substr($this->word, 0, $before);
            } else {
                $this->word = UTF8::substr($this->word, 0, $position);
            }

            return true;
        }

        // ions
        //      delete if in R2
        if (($position = $this->searchIfInRv(['ions'])) !== false) {
            if ($this->inR2($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);
            }

            return true;
        }

        return false;
    }

    /**
     * Step 3: Replace final Y with i or final ç with c
     */
    private function step3()
    {
        $this->word = preg_replace('#(Y)$#u', 'i', $this->word);
        $this->word = preg_replace('#(ç)$#u', 'c', $this->word);
    }

    /**
     * Step 4: Residual suffix
     *
     * @return boolean True if one of the RV-confined rules matched, false otherwise
     *                 (the initial removal of a final s does not affect the return value)
     */
    private function step4()
    {
        // If the word ends s, not preceded by a, i, o, u, è or s, delete it.
        // The u modifier is required: without it the multibyte è inside the character
        // class is matched byte by byte instead of as one character.
        if (preg_match('#[^aiouès]s$#u', $this->word)) {
            $this->word = UTF8::substr($this->word, 0, -1);
        }

        // In the rest of step 4, all tests are confined to the RV region.
        // ion
        //      delete if in R2 and preceded by s or t
        if ((($position = $this->searchIfInRv(['ion'])) !== false) && ($this->inR2($position))) {
            $before = $position - 1;
            $letter = UTF8::substr($this->word, $before, 1);
            if ($this->inRv($before) && (($letter == 's') || ($letter == 't'))) {
                $this->word = UTF8::substr($this->word, 0, $position);
            }

            return true;
        }

        // ier   ière   Ier   Ière
        //      replace with i
        if (($this->searchIfInRv(['ier', 'ière', 'Ier', 'Ière'])) !== false) {
            $this->word = preg_replace('#(ier|ière|Ier|Ière)$#u', 'i', $this->word);

            return true;
        }

        // e
        //      delete
        if (($this->searchIfInRv(['e'])) !== false) {
            $this->word = UTF8::substr($this->word, 0, -1);

            return true;
        }

        // ë
        //      if preceded by gu, delete
        if (($position = $this->searchIfInRv(['guë'])) !== false) {
            // $position + 2 is the position of the ë itself
            if ($this->inRv($position + 2)) {
                $this->word = UTF8::substr($this->word, 0, -1);

                return true;
            }
        }

        return false;
    }

    /**
     * Step 5: Undouble
     * If the word ends enn, onn, ett, ell or eill, delete the last letter
     */
    private function step5()
    {
        if ($this->search(['enn', 'onn', 'ett', 'ell', 'eill']) !== false) {
            $this->word = UTF8::substr($this->word, 0, -1);
        }
    }

    /**
     * Step 6: Un-accent
     * If the words ends é or è followed by at least one non-vowel, remove the accent from the e.
     */
    private function step6()
    {
        $this->word = preg_replace('#(é|è)([^'.$this->plainVowels.']+)$#u', 'e$2', $this->word);
    }

    /**
     * And finally:
     * Turn any remaining I, U and Y letters in the word back into lower case.
     */
    private function finish()
    {
        $this->word = UTF8::str_replace(['I','U','Y'], ['i', 'u', 'y'], $this->word);
    }

    /**
     * If the word begins with two vowels, RV is the region after the third letter,
     * otherwise the region after the first vowel not at the beginning of the word,
     * or the end of the word if these positions cannot be found.
     * (Exceptionally, par, col or tap, at the beginning of a word is also taken to define RV as the region to their right.)
     *
     * Stores the region text in $this->rv and its start offset in $this->rvIndex.
     *
     * @return boolean False when no vowel was found after the first letter (RV is then
     *                 the empty region at the end of the word), true otherwise
     */
    protected function rv()
    {
        $length = UTF8::strlen($this->word);

        // Default: RV is the empty region at the end of the word
        $this->rv = '';
        $this->rvIndex = $length;

        if ($length < 3) {
            return true;
        }

        // If the word begins with two vowels, RV is the region after the third letter
        $first = UTF8::substr($this->word, 0, 1);
        $second = UTF8::substr($this->word, 1, 1);

        if (in_array($first, self::$vowels, true) && in_array($second, self::$vowels, true)) {
            $this->rv = UTF8::substr($this->word, 3);
            $this->rvIndex = 3;

            return true;
        }

        // (Exceptionally, par, col or tap, at the beginning of a word is also taken to define RV as the region to their right.)
        $begin3 = UTF8::substr($this->word, 0, 3);
        if (in_array($begin3, ['par', 'col', 'tap'], true)) {
            $this->rv = UTF8::substr($this->word, 3);
            $this->rvIndex = 3;

            return true;
        }

        //  otherwise the region after the first vowel not at the beginning of the word,
        for ($i = 1; $i < $length; $i++) {
            $letter = UTF8::substr($this->word, $i, 1);
            if (in_array($letter, self::$vowels, true)) {
                $this->rv = UTF8::substr($this->word, ($i + 1));
                $this->rvIndex = $i + 1;

                return true;
            }
        }

        return false;
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Wed Sep 7 05:41:13 2022 | Chilli.vc Blog - For Webmaster,Blog-Writer,System Admin and Domainer |