PorterStemmer.php
Go to the documentation of this file.
<?php

/**
 * Suffix-stripping word stemmer organised as the five sequential steps of
 * the Porter stemming algorithm (step 1a/1b, 1c, 2, 3, 4, 5).
 *
 * All methods are static; the only public entry point is
 * {@see PorterStemmer::Stem()}.
 *
 * The helpers operate byte-wise (strlen/substr) and the character classes
 * only list lowercase ASCII letters, so callers are presumably expected to
 * pass lowercase ASCII words -- NOTE(review): confirm against callers.
 */
class PorterStemmer
{
    /**
     * Regex fragment matching one consonant: any non-vowel letter, or a "y"
     * that follows a vowel or starts the string.
     *
     * @var string
     */
    private static $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';

    /**
     * Regex fragment matching one vowel: a, e, i, o, u, or a "y" that is not
     * preceded by a vowel.
     *
     * @var string
     */
    private static $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';

    /**
     * Memoised results of Stem(), keyed by the word exactly as the caller
     * passed it. Only used when Stem() is called with $cache = true.
     *
     * @var array
     */
    private static $cache = array();

    /**
     * Stems a word.
     *
     * Words of two bytes or fewer are returned unchanged. A trailing
     * "'ve", "n't" or "'d" is removed before the stemming steps run.
     *
     * @param  string $word  Word to stem.
     * @param  bool   $cache Whether to read from / write to the static result cache.
     * @return string        The stemmed word.
     */
    public static function Stem($word, $cache = false)
    {
        if (strlen($word) <= 2) {
            return $word;
        }

        // The cache is keyed by the caller's original spelling. Keying it by
        // the contraction-stripped form (as an earlier revision did) makes
        // contractions permanent cache misses and stores, under the stripped
        // spelling, a stem that was computed without stripping.
        $key = $word;

        if ($cache && isset(self::$cache[$key])) {
            return self::$cache[$key];
        }

        // Drop common English contraction endings before stemming.
        $word = preg_replace("/('ve|n't|'d)$/", '', $word);

        $stem = self::step1ab($word);
        $stem = self::step1c($stem);
        $stem = self::step2($stem);
        $stem = self::step3($stem);
        $stem = self::step4($stem);
        $stem = self::step5($stem);

        if ($cache) {
            self::$cache[$key] = $stem;
        }

        return $stem;
    }

    /**
     * Step 1a and 1b: plural endings, then "eed" / "ed" / "ing".
     *
     * @param  string $word
     * @return string
     */
    private static function step1ab($word)
    {
        // Part a: the first suffix that matches wins (replace() returns true
        // on a suffix match, which short-circuits the rest of the chain).
        if (substr($word, -1) === 's') {
            self::replace($word, 'sses', 'ss')
            || self::replace($word, 'ies', 'i')
            || self::replace($word, 'ss', 'ss')
            || self::replace($word, 's', '');
        }

        // Part b: a word ending in "eed" is handled by the first rule alone
        // (replace() returns true even when the measure test blocks the
        // substitution), so "ing"/"ed" are only tried for other words.
        if (substr($word, -2, 1) !== 'e' || !self::replace($word, 'eed', 'ee', 0)) {
            $v = self::$regex_vowel;

            // "ing" / "ed" are removed only when the remaining stem contains a vowel.
            if ((preg_match("#$v+#", substr($word, 0, -3)) && self::replace($word, 'ing', ''))
                || (preg_match("#$v+#", substr($word, 0, -2)) && self::replace($word, 'ed', ''))
            ) {
                // Clean-up after a successful removal: restore a final "e"
                // for at/bl/iz, otherwise undouble or append "e".
                if (!self::replace($word, 'at', 'ate')
                    && !self::replace($word, 'bl', 'ble')
                    && !self::replace($word, 'iz', 'ize')
                ) {
                    $tail = substr($word, -2);

                    if (self::doubleConsonant($word)
                        && $tail !== 'll'
                        && $tail !== 'ss'
                        && $tail !== 'zz'
                    ) {
                        // Double consonant other than ll/ss/zz: drop one letter.
                        $word = substr($word, 0, -1);
                    } elseif (self::m($word) == 1 && self::cvc($word)) {
                        $word .= 'e';
                    }
                }
            }
        }

        return $word;
    }

    /**
     * Step 1c: a final "y" becomes "i" when the rest of the word contains a vowel.
     *
     * @param  string $word
     * @return string
     */
    private static function step1c($word)
    {
        $v = self::$regex_vowel;

        if (substr($word, -1) === 'y' && preg_match("#$v+#", substr($word, 0, -1))) {
            self::replace($word, 'y', 'i');
        }

        return $word;
    }

    /**
     * Step 2: maps longer suffixes to shorter ones when the stem before the
     * suffix has measure > 0. The switch on the penultimate letter only
     * narrows down which suffixes need to be tested.
     *
     * @param  string $word
     * @return string
     */
    private static function step2($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'ational', 'ate', 0)
                || self::replace($word, 'tional', 'tion', 0);
                break;

            case 'c':
                self::replace($word, 'enci', 'ence', 0)
                || self::replace($word, 'anci', 'ance', 0);
                break;

            case 'e':
                self::replace($word, 'izer', 'ize', 0);
                break;

            case 'g':
                self::replace($word, 'logi', 'log', 0);
                break;

            case 'l':
                self::replace($word, 'entli', 'ent', 0)
                || self::replace($word, 'ousli', 'ous', 0)
                || self::replace($word, 'alli', 'al', 0)
                || self::replace($word, 'bli', 'ble', 0)
                || self::replace($word, 'eli', 'e', 0);
                break;

            case 'o':
                self::replace($word, 'ization', 'ize', 0)
                || self::replace($word, 'ation', 'ate', 0)
                || self::replace($word, 'ator', 'ate', 0);
                break;

            case 's':
                self::replace($word, 'iveness', 'ive', 0)
                || self::replace($word, 'fulness', 'ful', 0)
                || self::replace($word, 'ousness', 'ous', 0)
                || self::replace($word, 'alism', 'al', 0);
                break;

            case 't':
                self::replace($word, 'biliti', 'ble', 0)
                || self::replace($word, 'aliti', 'al', 0)
                || self::replace($word, 'iviti', 'ive', 0);
                break;
        }

        return $word;
    }

    /**
     * Step 3: further suffix reductions, again requiring measure > 0 for the
     * stem before the suffix.
     *
     * @param  string $word
     * @return string
     */
    private static function step3($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'ical', 'ic', 0);
                break;

            case 's':
                self::replace($word, 'alise', 'al', 0)
                || self::replace($word, 'ness', '', 0);
                break;

            case 't':
                self::replace($word, 'icate', 'ic', 0)
                || self::replace($word, 'iciti', 'ic', 0);
                break;

            case 'u':
                self::replace($word, 'ful', '', 0);
                break;

            case 'v':
                self::replace($word, 'ative', '', 0);
                break;

            case 'z':
                self::replace($word, 'alize', 'al', 0);
                break;
        }

        return $word;
    }

    /**
     * Step 4: removes residual suffixes when the stem before the suffix has
     * measure > 1.
     *
     * @param  string $word
     * @return string
     */
    private static function step4($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'al', '', 1);
                break;

            case 'c':
                self::replace($word, 'ance', '', 1)
                || self::replace($word, 'ence', '', 1);
                break;

            case 'e':
                self::replace($word, 'er', '', 1);
                break;

            case 'i':
                self::replace($word, 'ic', '', 1);
                break;

            case 'l':
                self::replace($word, 'able', '', 1)
                || self::replace($word, 'ible', '', 1);
                break;

            case 'n':
                self::replace($word, 'ant', '', 1)
                || self::replace($word, 'ement', '', 1)
                || self::replace($word, 'ment', '', 1)
                || self::replace($word, 'ent', '', 1);
                break;

            case 'o':
                // "ion" is only removed when preceded by "s" or "t".
                $tail = substr($word, -4);
                if ($tail === 'tion' || $tail === 'sion') {
                    self::replace($word, 'ion', '', 1);
                } else {
                    self::replace($word, 'ou', '', 1);
                }
                break;

            case 's':
                self::replace($word, 'ism', '', 1);
                break;

            case 't':
                self::replace($word, 'ate', '', 1)
                || self::replace($word, 'iti', '', 1);
                break;

            case 'u':
                self::replace($word, 'ous', '', 1);
                break;

            case 'v':
                self::replace($word, 'ive', '', 1);
                break;

            case 'z':
                self::replace($word, 'ize', '', 1);
                break;
        }

        return $word;
    }

    /**
     * Step 5: drops a final "e" and undoubles a final "ll" for long enough stems.
     *
     * @param  string $word
     * @return string
     */
    private static function step5($word)
    {
        // Part a: remove a final "e" when the stem has measure > 1, or
        // measure == 1 and does not end consonant-vowel-consonant.
        if (substr($word, -1) === 'e') {
            $stem    = substr($word, 0, -1);
            $measure = self::m($stem);

            if ($measure > 1 || ($measure == 1 && !self::cvc($stem))) {
                self::replace($word, 'e', '');
            }
        }

        // Part b: "ll" -> "l" when the measure is > 1.
        if (self::m($word) > 1 && self::doubleConsonant($word) && substr($word, -1) === 'l') {
            $word = substr($word, 0, -1);
        }

        return $word;
    }

    /**
     * Replaces the suffix $check of $str with $repl, in place.
     *
     * When $m is given, the substitution is only performed if the measure of
     * the part of $str preceding the suffix is greater than $m.
     *
     * @param  string   $str   Word to modify (by reference).
     * @param  string   $check Suffix to look for.
     * @param  string   $repl  Replacement for the suffix.
     * @param  int|null $m     Minimum measure (exclusive) of the remaining stem, or null for no condition.
     * @return bool            True when $str ends with $check -- even if the
     *                         measure condition prevented the substitution --
     *                         so callers can stop trying further suffixes.
     */
    private static function replace(&$str, $check, $repl, $m = null)
    {
        $len = 0 - strlen($check);

        if (substr($str, $len) === $check) {
            $substr = substr($str, 0, $len);
            if (is_null($m) || self::m($substr) > $m) {
                $str = $substr . $repl;
            }

            return true;
        }

        return false;
    }

    /**
     * Computes the "measure" of a string: the number of vowel-run +
     * consonant-run pairs left after stripping leading consonants and
     * trailing vowels. E.g. "rel" has measure 1 and "relat" has measure 2.
     *
     * @param  string $str
     * @return int
     */
    private static function m($str)
    {
        $c = self::$regex_consonant;
        $v = self::$regex_vowel;

        $str = preg_replace("#^$c+#", '', $str);
        $str = preg_replace("#$v+$#", '', $str);

        preg_match_all("#($v+$c+)#", $str, $matches);

        return count($matches[1]);
    }

    /**
     * Tells whether the string ends in the same consonant twice (e.g. "pp").
     *
     * @param  string $str
     * @return bool
     */
    private static function doubleConsonant($str)
    {
        $c = self::$regex_consonant;

        // Square-bracket offsets: the curly-brace form ($s{0}) was removed in PHP 8.0.
        return preg_match('#' . $c . '{2}$#', $str, $matches)
            && $matches[0][0] === $matches[0][1];
    }

    /**
     * Tells whether the string ends consonant-vowel-consonant where the last
     * consonant is not w, x or y.
     *
     * @param  string $str
     * @return bool
     */
    private static function cvc($str)
    {
        $c = self::$regex_consonant;
        $v = self::$regex_vowel;

        // Square-bracket offsets: the curly-brace form ($s{2}) was removed in PHP 8.0.
        return preg_match("#($c$v$c)$#", $str, $matches)
            && strlen($matches[1]) == 3
            && $matches[1][2] !== 'w'
            && $matches[1][2] !== 'x'
            && $matches[1][2] !== 'y';
    }
}