#include "Stemmer.h" #include "stringProcessing.h" #include <cassert> #include <string> /** * Stemmer Cstor */ Stemmer::Stemmer ( ) { } /** * Returns the stem of a word * * @param word * @return */ std::string Stemmer::execute ( std::string word ) { if(!word.empty()) { word = step1a( word ); word = step1b( word ); word = step1c( word ); word = step2( word ); word = step3( word ); word = step4( word ); word = step5a( word ); word = step5b( word ); } return word; } /** * Number of consonant sequences * * <c><v> -> 0 * <c>vc<v> -> 1 * <c>vcvc<v> -> 2 * <c>vcvcvc<v> -> 3 * * @param word * @return */ int Stemmer::measure ( std::string word ) { // measure int m = 0; unsigned long wordIt = 0; unsigned long wordEnd = word.size( ) - 1; // Looking for CVC pattern while ( wordIt <= wordEnd ) { if ( !isConsonant( wordIt, word ) ) { break; } wordIt += 1; } if ( wordIt > wordEnd ) { return m; } wordIt += 1; while ( wordIt <= wordEnd ) { while ( wordIt <= wordEnd ) { if ( isConsonant( wordIt, word ) ) { break; } wordIt += 1; } if ( wordIt > wordEnd ) { return m; } wordIt += 1; m += 1; while ( wordIt <= wordEnd ) { if ( !isConsonant( wordIt, word ) ) { break; } wordIt += 1; } if ( wordIt > wordEnd ) { return m; } wordIt += 1; } return m; } /** * Check if a vowel is present in the stem * * @param wordBeg * @param wordEnd * @param word * @return */ bool Stemmer::isVowelPresent ( unsigned long wordBeg, unsigned long wordEnd, string word ) { while ( wordBeg != wordEnd && wordBeg < word.size( ) ) { if ( !isConsonant( wordBeg, word ) ) { return true; } ++wordBeg; } return false; } /** * Return true if the wordIt points to a consonant * * * @param wordIt * @param word * @return */ bool Stemmer::isConsonant ( unsigned long wordIt, string word ) { if ( wordIt >= word.size( ) ) return false; char wordChar = word[ wordIt ]; if ( wordChar == 'a' || wordChar == 'e' || wordChar == 'i' || wordChar == 'o' || wordChar == 'u' ) { return false; } if ( wordChar == 'y' ) { if ( wordIt == 0 ) { return true; } else { return ( !isConsonant( wordIt - 1, word ) ); } } return true; } /** * Returns true if should add 'e' to end * * @param word * @return */ bool Stemmer::addE ( string word ) { // AT -> ATE // BL -> BLE // IZ -> IZE string substr = lastN( word, 2 ); if ( substr == "at" || substr == "bl" || substr == "iz" ) { return true; } else { return false; } } /** * Returns true if word ends in double constant * Not LL, SS, ZZ * @param word * @return */ bool Stemmer::doubleCon ( string word ) { unsigned long endWord = word.size( ) - 1; if ( word.size( ) > 2 && word[ endWord ] == word[ endWord - 1 ] ) { char doubleConsonant = word[ endWord ]; if ( doubleConsonant == 'l' || doubleConsonant == 's' || doubleConsonant == 'z' ) { return false; } else { return true; } } return false; } /** * Returns true if a word ends in a * Consonant, Vowel, Consonant pattern * Except when second C is W, X, or Y * * @param word * @return */ bool Stemmer::endCVC ( std::string word ) { unsigned long endWord = word.size( ) - 1; if ( word.size( ) > 2 ) { // the stem ends cvc if ( isConsonant( endWord, word ) && !isConsonant( endWord - 1, word ) && isConsonant( endWord - 2, word ) ) { // the second c is not W, X or Y if ( word[ endWord - 1 ] != 'w' && word[ endWord - 1 ] != 'x' && word[ endWord - 1 ] != 'y' ) { return true; } } } return false; } /** * Stem plural words * * @param word * @return */ std::string Stemmer::step1a ( std::string word ) { // check S at end if ( word[ word.size( ) - 1 ] == 's' && word.size() != 1) { string wordStem = word; auto substrSSES = lastN( word, 4 ); auto substrIES = lastN( word, 3 ); auto substrSS = lastN( word, 2 ); // sses -> ss // caresses -> caress if ( substrSSES == "sses" ) { wordStem = subStr( word, 0, word.size( ) - 2 ); } // ies -> i // ponies -> poni else if ( substrIES == "ies" ) { wordStem = subStr( word, 0, word.size( ) - 2 ); } // ss -> ss // caress -> caress else if ( substrSS == "ss" ) { // do nothing } // s -> // cats -> cat else if ( word[ word.size( ) - 1 ] == 's' ) { wordStem = subStr( word, 0, word.size( ) - 1 ); } else { wordStem = word; } return wordStem; } return word; } /** * Stem ED and ING * * @param word * @return */ std::string Stemmer::step1b ( std::string word ) { unsigned long end = word.size( ) - 1; auto begPtr = word.begin( ); auto endPtr = begPtr + end; string wordStem( word.begin( ), word.end( ) ); auto lastThree = lastN( word, 3 ); auto lastTwo = lastN( word, 2 ); int m = measure( word ); // check EED at end and m > 0 // feed -> feed // agreed -> agree if ( m > 1 && lastThree == "eed" ) { wordStem = subStr( word, 0, word.size( ) - 1 ); } // check ED at end and preceeded by substr with vowel // plastered -> plaster // bled -> bled else if ( m > 1 && lastTwo == "ed" && isVowelPresent( 0, word.size( ) - 2, word ) ) { wordStem = subStr( word, 0, word.size( ) - 2 ); if ( addE( wordStem ) || ( m == 1 && endCVC( wordStem + 'e' ) ) ) { wordStem += 'e'; } else if ( doubleCon( wordStem ) ) { wordStem = subStr( word, 0, wordStem.size( ) - 1 ); } } // check ING at end and proceeded by substr with vowel // motoring -> motor // sing -> sing else if ( lastThree == "ing" && isVowelPresent( 0, word.size( ) - 3, word ) ) { wordStem = subStr( word, 0, word.size( ) - 3 ); auto a = addE( wordStem ); auto m = measure( wordStem ); auto e = endCVC( wordStem + 'e' ); if ( addE( wordStem ) || ( measure( wordStem ) == 1 && endCVC( wordStem ) ) ) { wordStem += 'e'; } else if ( doubleCon( wordStem ) ) { wordStem = subStr( word, 0, wordStem.size( ) - 1 ); } } return wordStem; } /** * Checks for Y -> I * @param word * @return */ string Stemmer::step1c ( string word ) { // Y -> I // happy -> happi // sky -> sky if ( word[ word.size( ) - 1 ] == 'y' ) { if ( isVowelPresent( 0, word.size( ) - 1, word ) ) { word = subStr( word, 0, word.size( ) - 1 ); word += 'i'; } } return word; } /** * Step 2 * * @param word * @return */ string Stemmer::step2 ( std::string word ) { if ( measure( word ) == 0 ) { return word; } string wordStem = word; string lastSeven = lastN( word, 7 ); string lastSix = lastN( word, 6 ); string lastFive = lastN( word, 5 ); string lastFour = lastN( word, 4 ); string lastThree = lastN( word, 3 ); string lastTwo = lastN( word, 2 ); // ATIONAL -> ATE // relational -> relate if ( lastSeven == "ational" && word.size( ) != 8 ) { wordStem = subStr( word, 0, word.size( ) - 5 ); wordStem += 'e'; } // TIONAL -> TION // conditional -> condition // rational -> rational else if ( lastSix == "tional" && measure( subStr( word, 0, word.size( ) - 6 ) ) > 0 ) { wordStem = subStr( word, 0, word.size( ) - 5 ); } // ENCI -> ENCE // valenci -> valence else if ( lastFour == "enci" ) { wordStem = subStr( word, 0, word.size( ) - 1 ); wordStem += 'e'; } // ANCI -> ANCE // hesitanci -> hesitance else if ( lastFour == "anci" ) { wordStem = subStr( word, 0, word.size( ) - 1 ); wordStem += 'e'; } // IZER -> IZE // digitizer -> digitize else if ( lastFour == "izer" ) { wordStem = subStr( word, 0, word.size( ) - 1 ); } // ABLI -> ABLE // conformabli -> comformable else if ( lastFour == "abli" ) { wordStem = subStr( word, 0, word.size( ) - 1 ); wordStem += 'e'; } // ALLI -> AL // radicalli -> radical else if ( lastFour == "alli" ) { wordStem = subStr( word, 0, word.size( ) - 2 ); } // ENTLI -> ENT // differentli -> different else if ( lastFive == "entli" ) { wordStem = subStr( word, 0, word.size( ) - 2 ); } // ELI -> E // vileli -> vile else if ( lastThree == "eli" ) { wordStem = subStr( word, 0, word.size( ) - 2 ); } // OUSLI -> OUS // analogousli -> analogous else if ( lastFive == "ousli" ) { wordStem = subStr( word, 0, word.size( ) - 2 ); } // IZATION -> IZE // vietnamization -> vietnamize else if ( lastSeven == "ization" ) { wordStem = subStr( word, 0, word.size( ) - 5 ); wordStem += 'e'; } // ATION -> ATE // predication -> predicate else if ( lastFive == "ation" ) { wordStem = subStr( word, 0, word.size( ) - 3 ); wordStem += 'e'; } // ATOR -> ATE // predication -> predicate else if ( lastFour == "ator" ) { wordStem = subStr( word, 0, word.size( ) - 2 ); wordStem += 'e'; } // ALISM -> AL // feudalism -> feudal else if ( lastFive == "alism" ) { wordStem = subStr( word, 0, word.size( ) - 3 ); } // IVENESS -> IVE // decisivenss -> decisive else if ( lastSeven == "iveness" ) { wordStem = subStr( word, 0, word.size( ) - 4 ); } // FULNESS -> FUL // hopefulness -> hopeful else if ( lastSeven == "fulness" ) { wordStem = subStr( word, 0, word.size( ) - 4 ); } // OUSNESS -> OUS // callousness -> callous else if ( lastSeven == "ousness" ) { wordStem = subStr( word, 0, word.size( ) - 4 ); } // ALITI -> AL // formalit -> formal else if ( lastFive == "aliti" ) { wordStem = subStr( word, 0, word.size( ) - 3 ); } // IVITI -> IVE // sensitiviti -> sensitive else if ( lastFive == "iviti" ) { wordStem = subStr( word, 0, word.size( ) - 3 ); wordStem += 'e'; } // BILITI -> BLE // sensibiliti -> sensible else if ( lastSix == "biliti" ) { wordStem = subStr( word, 0, word.size( ) - 5 ); wordStem += "le"; } return wordStem; } /** * Step 3 * * @param word * @return */ std::string Stemmer::step3 ( std::string word ) { if ( measure( word ) == 0 ) { return word; } string wordStem = word; string lastFive = lastN( word, 5 ); string lastFour = lastN( word, 4 ); string lastThree = lastN( word, 3 ); // ICATE -> IC // triplicate -> triplic if ( lastFive == "icate" ) { wordStem = subStr( word, 0, word.size( ) - 3 ); } // ATIVE -> // formative -> form else if ( lastFive == "ative" ) { wordStem = subStr( word, 0, word.size( ) - 5 ); } // ALIZE -> AL // formalize -> formal else if ( lastFive == "alize" ) { wordStem = subStr( word, 0, word.size( ) - 3 ); } // ICITI -> IC // electriciti -> electric else if ( lastFive == "iciti" ) { wordStem = subStr( word, 0, word.size( ) - 3 ); } // ICAL -> IC // electrical -> electric else if ( lastFour == "ical" ) { wordStem = subStr( word, 0, word.size( ) - 2 ); } // FUL -> // hopeful -> hope else if ( lastThree == "ful" ) { wordStem = subStr( word, 0, word.size( ) - 3 ); } // NESS -> // goodness -> good else if ( lastFour == "ness" ) { wordStem = subStr( word, 0, word.size( ) - 4 ); } return wordStem; } /** * Step 4 * * @param word * @return */ std::string Stemmer::step4 ( std::string word ) { if ( measure( word ) <= 2 ) { return word; } string wordStem = word; string lastFive = lastN( word, 5 ); string lastFour = lastN( word, 4 ); string lastThree = lastN( word, 3 ); string lastTwo = lastN( word, 2 ); // AL -> // revival -> reviv if ( lastTwo == "al" ) { wordStem = subStr( word, 0, word.size( ) - 2 ); } // ANCE -> // allowance -> allow else if ( lastFour == "ance" ) { wordStem = subStr( word, 0, word.size( ) - 4 ); } // ENCE -> // inference -> infer else if ( lastFour == "ence" ) { wordStem = subStr( word, 0, word.size( ) - 4 ); } // ER -> // airliner -> airlin else if ( lastTwo == "er" ) { wordStem = subStr( word, 0, word.size( ) - 2 ); } // IC -> // gyroscopic -> gyroscope else if ( lastTwo == "ic" ) { wordStem = subStr( word, 0, word.size( ) - 2 ); } // ABLE -> // adjustable -> adjust else if ( lastFour == "able" ) { wordStem = subStr( word, 0, word.size( ) - 4 ); } // IBLE -> // goodness -> good else if ( lastFour == "ible" ) { wordStem = subStr( word, 0, word.size( ) - 4 ); } // ANT -> // irritant -> irrit else if ( lastThree == "ant" ) { wordStem = subStr( word, 0, word.size( ) - 3 ); } // EMENT -> // replacement -> replace else if ( lastFive == "ement" ) { wordStem = subStr( word, 0, word.size( ) - 5 ); } // MENT -> // adjustment -> adjust else if ( lastFour == "ment" ) { wordStem = subStr( word, 0, word.size( ) - 4 ); } // ENT -> // dependent -> depend else if ( lastThree == "ent" ) { wordStem = subStr( word, 0, word.size( ) - 3 ); } // ION -> // stem must end in 't' or 's' // adoption -> adopt else if ( lastThree == "ion" && ( lastFour == "sion" || lastFour == "tion" ) ) { wordStem = subStr( word, 0, word.size( ) - 3 ); } // OU -> // homologou -> homolog else if ( lastTwo == "ou" ) { wordStem = subStr( word, 0, word.size( ) - 2 ); } // ISM -> // communism -> commun else if ( lastThree == "ism" ) { wordStem = subStr( word, 0, word.size( ) - 3 ); } // ATE -> // activate -> activ else if ( lastThree == "ate" ) { wordStem = subStr( word, 0, word.size( ) - 3 ); } // ITI -> // angulariti -> angular else if ( lastThree == "iti" ) { wordStem = subStr( word, 0, word.size( ) - 3 ); } // OUS -> // homologous -> homolog else if ( lastThree == "ous" ) { wordStem = subStr( word, 0, word.size( ) - 3 ); } // IVE -> // effective -> effect else if ( lastThree == "ive" ) { wordStem = subStr( word, 0, word.size( ) - 3 ); } // IZE -> // bowdlerize -> bowdler else if ( lastThree == "ize" ) { wordStem = subStr( word, 0, word.size( ) - 3 ); } return wordStem; } /** * Step 5a * * @param word * @return */ std::string Stemmer::step5a ( std::string word ) { auto m = measure( word ); // E -> // probabte -> probat if ( m > 1 && word[ word.size( ) - 1 ] == 'e' ) { word = subStr( word, 0, word.size( ) - 1 ); return word; } // E -> // cease -> cease string wordStem = subStr( word, 0, word.size( ) - 1 ); if ( m == 1 && !endCVC( wordStem ) && word[ word.size( ) - 1 ] == 'e' ) { word = subStr( word, 0, word.size( ) - 1 ); } return word; } /** * Step 5b * * @param word * @return */ std::string Stemmer::step5b ( std::string word ) { if ( word.size( ) > 2 && measure( word ) > 1 && word[ word.size( ) - 1 ] == 'l' && word[ word.size( ) - 2 ] == 'l' ) { word = subStr( word, 0, word.size( ) - 1 ); } return word; }