Search:

CWIS Developers Documentation

  • Main Page
  • Classes
  • Files
  • File List
  • File Members

PorterStemmer.php

Go to the documentation of this file.
00001 <?php
/**
 * Static implementation of the Porter stemming algorithm for English words.
 *
 * The public entry point is PorterStemmer::Stem(); every other method is a
 * private helper implementing one step of the algorithm or one of the
 * predicates (measure, double consonant, consonant-vowel-consonant) that
 * the steps rely on.
 */
class PorterStemmer
{
    /**
     * Regex fragment matching a single consonant. A "y" counts as a
     * consonant when it follows a vowel or starts the string.
     */
    private static $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';

    /**
     * Regex fragment matching a single vowel. A "y" counts as a vowel when
     * it is not preceded by a vowel.
     */
    private static $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';

    /**
     * Stems computed so far, keyed by the word exactly as it was passed to
     * Stem(). Only used when the caller asks for caching.
     */
    private static $cache = array();

    /**
     * Stems a word.
     *
     * Words of one or two characters are returned unchanged. A trailing
     * "'ve", "n't" or "'d" is removed before stemming; if what remains is
     * one or two characters long it is returned without further stemming.
     *
     * @param string $word  Word to stem.
     * @param bool   $cache Whether to look up and store the result in the
     *                      class-level cache.
     * @return string The stemmed word.
     */
    public static function Stem($word, $cache = false)
    {
        if (strlen($word) <= 2) {
            return $word;
        }

        // The cache is keyed by the word as supplied by the caller, so the
        // lookup and the store below always agree on the key.
        $key = $word;
        if ($cache && isset(self::$cache[$key])) {
            return self::$cache[$key];
        }

        // Strip common contractions.
        $word = preg_replace("/('ve|n't|'d)$/", '', $word);

        if (strlen($word) <= 2) {
            // Stripping left a short word (e.g. "isn't" -> "is"); such words
            // are exempt from stemming, exactly like short inputs above.
            $stem = $word;
        } else {
            $stem = self::step1ab($word);
            $stem = self::step1c($stem);
            $stem = self::step2($stem);
            $stem = self::step3($stem);
            $stem = self::step4($stem);
            $stem = self::step5($stem);
        }

        if ($cache) {
            self::$cache[$key] = $stem;
        }

        return $stem;
    }

    /**
     * Step 1a and 1b: handles plural "s" endings and "eed" / "ed" / "ing".
     *
     * @param string $word Word to process.
     * @return string Word after step 1a/1b.
     */
    private static function step1ab($word)
    {
        // Part a. replace() returns true as soon as a suffix matches, so the
        // || chain stops at the first (longest) matching rule.
        if (substr($word, -1) === 's') {
            self::replace($word, 'sses', 'ss')
                || self::replace($word, 'ies', 'i')
                || self::replace($word, 'ss', 'ss')
                || self::replace($word, 's', '');
        }

        // Part b. First rule: "eed" -> "ee" when the stem measure is > 0.
        // If the word ends in "eed" the remaining rules are skipped, even
        // when the measure condition prevented the replacement.
        if (substr($word, -2, 1) !== 'e' || !self::replace($word, 'eed', 'ee', 0)) {
            $v = self::$regex_vowel;
            $hasVowel = '#' . $v . '+#';

            // "ing" / "ed" are removed only if the part before the suffix
            // contains a vowel.
            if ((preg_match($hasVowel, substr($word, 0, -3)) && self::replace($word, 'ing', ''))
                || (preg_match($hasVowel, substr($word, 0, -2)) && self::replace($word, 'ed', ''))) {

                // Clean-up after a successful removal.
                if (!self::replace($word, 'at', 'ate')
                    && !self::replace($word, 'bl', 'ble')
                    && !self::replace($word, 'iz', 'ize')) {

                    $lastTwo = substr($word, -2);

                    if (self::doubleConsonant($word)
                        && $lastTwo !== 'll'
                        && $lastTwo !== 'ss'
                        && $lastTwo !== 'zz') {
                        // Double consonant ending: drop one letter.
                        $word = substr($word, 0, -1);
                    } elseif (self::m($word) === 1 && self::cvc($word)) {
                        $word .= 'e';
                    }
                }
            }
        }

        return $word;
    }

    /**
     * Step 1c: turns a final "y" into "i" when the rest of the word
     * contains a vowel.
     *
     * @param string $word Word to process.
     * @return string Word after step 1c.
     */
    private static function step1c($word)
    {
        $v = self::$regex_vowel;

        if (substr($word, -1) === 'y' && preg_match('#' . $v . '+#', substr($word, 0, -1))) {
            self::replace($word, 'y', 'i');
        }

        return $word;
    }

    /**
     * Step 2: maps double suffixes to single ones when the stem measure is
     * greater than 0. Rules are dispatched on the penultimate letter.
     *
     * @param string $word Word to process.
     * @return string Word after step 2.
     */
    private static function step2($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'ational', 'ate', 0)
                    || self::replace($word, 'tional', 'tion', 0);
                break;

            case 'c':
                self::replace($word, 'enci', 'ence', 0)
                    || self::replace($word, 'anci', 'ance', 0);
                break;

            case 'e':
                self::replace($word, 'izer', 'ize', 0);
                break;

            case 'g':
                self::replace($word, 'logi', 'log', 0);
                break;

            case 'l':
                self::replace($word, 'entli', 'ent', 0)
                    || self::replace($word, 'ousli', 'ous', 0)
                    || self::replace($word, 'alli', 'al', 0)
                    || self::replace($word, 'bli', 'ble', 0)
                    || self::replace($word, 'eli', 'e', 0);
                break;

            case 'o':
                self::replace($word, 'ization', 'ize', 0)
                    || self::replace($word, 'ation', 'ate', 0)
                    || self::replace($word, 'ator', 'ate', 0);
                break;

            case 's':
                self::replace($word, 'iveness', 'ive', 0)
                    || self::replace($word, 'fulness', 'ful', 0)
                    || self::replace($word, 'ousness', 'ous', 0)
                    || self::replace($word, 'alism', 'al', 0);
                break;

            case 't':
                self::replace($word, 'biliti', 'ble', 0)
                    || self::replace($word, 'aliti', 'al', 0)
                    || self::replace($word, 'iviti', 'ive', 0);
                break;
        }

        return $word;
    }

    /**
     * Step 3: shortens or removes a further set of suffixes when the stem
     * measure is greater than 0. Rules are dispatched on the penultimate
     * letter.
     *
     * @param string $word Word to process.
     * @return string Word after step 3.
     */
    private static function step3($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'ical', 'ic', 0);
                break;

            case 's':
                self::replace($word, 'alise', 'al', 0)
                    || self::replace($word, 'ness', '', 0);
                break;

            case 't':
                self::replace($word, 'icate', 'ic', 0)
                    || self::replace($word, 'iciti', 'ic', 0);
                break;

            case 'u':
                self::replace($word, 'ful', '', 0);
                break;

            case 'v':
                self::replace($word, 'ative', '', 0);
                break;

            case 'z':
                self::replace($word, 'alize', 'al', 0);
                break;
        }

        return $word;
    }

    /**
     * Step 4: removes suffixes when the stem measure is greater than 1.
     * Rules are dispatched on the penultimate letter.
     *
     * @param string $word Word to process.
     * @return string Word after step 4.
     */
    private static function step4($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'al', '', 1);
                break;

            case 'c':
                self::replace($word, 'ance', '', 1)
                    || self::replace($word, 'ence', '', 1);
                break;

            case 'e':
                self::replace($word, 'er', '', 1);
                break;

            case 'i':
                self::replace($word, 'ic', '', 1);
                break;

            case 'l':
                self::replace($word, 'able', '', 1)
                    || self::replace($word, 'ible', '', 1);
                break;

            case 'n':
                self::replace($word, 'ant', '', 1)
                    || self::replace($word, 'ement', '', 1)
                    || self::replace($word, 'ment', '', 1)
                    || self::replace($word, 'ent', '', 1);
                break;

            case 'o':
                // "ion" is only removed when preceded by "s" or "t".
                $lastFour = substr($word, -4);
                if ($lastFour === 'tion' || $lastFour === 'sion') {
                    self::replace($word, 'ion', '', 1);
                } else {
                    self::replace($word, 'ou', '', 1);
                }
                break;

            case 's':
                self::replace($word, 'ism', '', 1);
                break;

            case 't':
                self::replace($word, 'ate', '', 1)
                    || self::replace($word, 'iti', '', 1);
                break;

            case 'u':
                self::replace($word, 'ous', '', 1);
                break;

            case 'v':
                self::replace($word, 'ive', '', 1);
                break;

            case 'z':
                self::replace($word, 'ize', '', 1);
                break;
        }

        return $word;
    }

    /**
     * Step 5: removes a final "e" and reduces a final "ll" where the
     * measure conditions allow it.
     *
     * @param string $word Word to process.
     * @return string Word after step 5.
     */
    private static function step5($word)
    {
        // Part a: drop a trailing "e" if the stem measure is > 1, or if it
        // is exactly 1 and the stem does not end consonant-vowel-consonant.
        if (substr($word, -1) === 'e') {
            $stem = substr($word, 0, -1);
            $measure = self::m($stem);

            if ($measure > 1) {
                self::replace($word, 'e', '');
            } elseif ($measure === 1 && !self::cvc($stem)) {
                self::replace($word, 'e', '');
            }
        }

        // Part b: "ll" -> "l" when the measure is > 1.
        if (self::m($word) > 1 && self::doubleConsonant($word) && substr($word, -1) === 'l') {
            $word = substr($word, 0, -1);
        }

        return $word;
    }

    /**
     * Replaces the suffix $check of $str with $repl.
     *
     * @param string   $str   Word to modify (passed by reference).
     * @param string   $check Suffix to look for.
     * @param string   $repl  Replacement for the suffix.
     * @param int|null $m     If not null, the replacement only happens when
     *                        the measure of the word without the suffix is
     *                        strictly greater than this value.
     * @return bool True when $str ends with $check, whether or not the
     *              replacement was actually carried out; false otherwise.
     *              Callers use this to stop trying further rules.
     */
    private static function replace(&$str, $check, $repl, $m = null)
    {
        $len = 0 - strlen($check);

        if (substr($str, $len) !== $check) {
            return false;
        }

        $substr = substr($str, 0, $len);
        if ($m === null || self::m($substr) > $m) {
            $str = $substr . $repl;
        }

        return true;
    }

    /**
     * Computes the measure of a string: the number of vowel-run /
     * consonant-run pairs that remain after leading consonants and
     * trailing vowels are removed.
     *
     * @param string $str Word or stem to measure.
     * @return int Number of vowel-consonant sequences.
     */
    private static function m($str)
    {
        $c = self::$regex_consonant;
        $v = self::$regex_vowel;

        $str = preg_replace('#^' . $c . '+#', '', $str);
        $str = preg_replace('#' . $v . '+$#', '', $str);

        preg_match_all('#(' . $v . '+' . $c . '+)#', $str, $matches);

        return count($matches[1]);
    }

    /**
     * Tells whether the string ends with the same consonant twice.
     *
     * @param string $str Word or stem to test.
     * @return bool True if the last two characters are identical consonants.
     */
    private static function doubleConsonant($str)
    {
        $c = self::$regex_consonant;

        // Square-bracket offsets: the curly-brace form was removed in PHP 8.
        return preg_match('#' . $c . '{2}$#', $str, $matches) === 1
            && $matches[0][0] === $matches[0][1];
    }

    /**
     * Tells whether the string ends consonant-vowel-consonant, where the
     * final consonant is not "w", "x" or "y".
     *
     * @param string $str Word or stem to test.
     * @return bool True if the ending matches the pattern.
     */
    private static function cvc($str)
    {
        $c = self::$regex_consonant;
        $v = self::$regex_vowel;

        if (preg_match('#(' . $c . $v . $c . ')$#', $str, $matches) !== 1) {
            return false;
        }

        $ending = $matches[1];

        return strlen($ending) === 3
            && $ending[2] !== 'w'
            && $ending[2] !== 'x'
            && $ending[2] !== 'y';
    }
}
00439 ?>

CWIS logo doxygen
Copyright 2010 Internet Scout